diff --git a/modules/compiler/targets/host/include/host/host_pass_machinery.h b/modules/compiler/targets/host/include/host/host_pass_machinery.h index cad607cf4f..40f0f0e40d 100644 --- a/modules/compiler/targets/host/include/host/host_pass_machinery.h +++ b/modules/compiler/targets/host/include/host/host_pass_machinery.h @@ -21,14 +21,22 @@ #define HOST_PASSES_MACHINERY_H_INCLUDED #include - +#include #include namespace llvm { class TargetMachine; } +namespace vecz { + class VeczPassOptions; +} namespace host { +struct OptimizationOptions { /* Options that HostPassMachinery::processOptimizationOptions derives from environment variables. */ + llvm::SmallVector vecz_pass_opts; /* One entry is always pushed; a second, vector-predicated entry is added when CA_HOST_VF contains VVP. */ + bool force_no_tail = false; /* Set when CA_HOST_VF contains the V flag. */ + bool early_link_builtins = false; /* Set by the S flag of CA_HOST_VF; can be overridden by the prefixed _EARLY_LINK_BUILTINS variable. */ +}; class HostPassMachinery final : public compiler::BaseModulePassMachinery { public: @@ -63,6 +71,10 @@ class HostPassMachinery final : public compiler::BaseModulePassMachinery { /// @brief Returns an optimization pass pipeline correponding to /// BaseModule::getLateTargetPasses. llvm::ModulePassManager getLateTargetPasses(); + /* Builds OptimizationOptions from the CA_HOST_VF and CODEPLAY_VECZ_CHOICES environment variables and, when env_debug_prefix is set, the prefixed _EARLY_LINK_BUILTINS variable. vecz_mode only seeds vecz_auto. */ + static host::OptimizationOptions processOptimizationOptions( + std::optional env_debug_prefix, + std::optional vecz_mode); }; } // namespace host diff --git a/modules/compiler/targets/host/source/HostPassMachinery.cpp b/modules/compiler/targets/host/source/HostPassMachinery.cpp index 665c743493..a550f93e1c 100644 --- a/modules/compiler/targets/host/source/HostPassMachinery.cpp +++ b/modules/compiler/targets/host/source/HostPassMachinery.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -55,6 +56,99 @@ namespace host { +// Process various compiler options based off compiler build options and common +// environment variables + host::OptimizationOptions +HostPassMachinery::processOptimizationOptions( + std::optional env_debug_prefix, + std::optional vecz_mode) { + OptimizationOptions env_var_opts; + vecz::VeczPassOptions vecz_opts; + // The minimum number of elements to vectorize for. For a fixed-length VF, + // this is the exact number of elements to vectorize by. 
For scalable VFs, + // the actual number of elements is a multiple (vscale) of these, unknown at + // compile time. Default taken from config. May be overridden later. + vecz_opts.factor = compiler::utils::VectorizationFactor::getScalar(); + + vecz_opts.choices.enable(vecz::VectorizationChoices::eDivisionExceptions); + + vecz_opts.vecz_auto = vecz_mode == compiler::VectorizationMode::AUTO; + vecz_opts.vec_dim_idx = 0; + + // CA_HOST_VF is a comma-separated list of fields (all-upper or all-lower case): + // S - use scalable vectorization + // V - vectorize only, otherwise produce both scalar and vector kernels + // A - let vecz automatically choose the vectorization factor + // 1-64 - vectorization factor multiplier: the fixed amount itself, or the + // value that multiplies the scalable amount + // VP - produce a vector-predicated kernel + // VVP - produce both a vectorized and a vector-predicated kernel + bool add_vvp = false; + if (const auto *vecz_vf_flags_env = std::getenv("CA_HOST_VF")) { + // Set scalable to off and let users add it explicitly with 'S'. + vecz_opts.factor.setIsScalable(false); + llvm::SmallVector flags; + const llvm::StringRef vf_flags_ref(vecz_vf_flags_env); + vf_flags_ref.split(flags, ','); + for (auto r : flags) { + if (r == "A" || r == "a") { + vecz_opts.vecz_auto = true; + } else if (r == "V" || r == "v") { + // Note: This is a legacy toggle for forcing vectorization with no + // scalar tail based on the "VF" environment variable. Ideally we'd be + // setting it on a per-function basis, and we'd also be setting the + // vectorization options themselves on a per-function basis. Until we've + // designed a new method, keep the legacy behaviour by re-parsing the + // "VF" environment variable and look for a "v/V" toggle. 
+ env_var_opts.force_no_tail = true; + } else if (r == "S" || r == "s") { + vecz_opts.factor.setIsScalable(true); + env_var_opts.early_link_builtins = true; + } else if (isdigit(r[0])) { /* NOTE(review): r[0] assumes the field is non-empty; an empty field (e.g. from a trailing comma) looks unguarded here; confirm whether split() can yield one. */ + vecz_opts.factor.setKnownMin(std::stoi(r.str())); /* NOTE(review): std::stoi throws std::out_of_range for oversized values and ignores trailing non-digits; no check against the documented 1-64 range is visible here. */ + } else if (r == "VP" || r == "vp") { + vecz_opts.choices.enable( + vecz::VectorizationChoices::eVectorPredication); + } else if (r == "VVP" || r == "vvp") { + // Add the vectorized pass option now (controlled by other iterations + // of this loop), and flag that we have to add a vector-predicated form + // later. + add_vvp = true; + } else { + // An error - just stop processing the environment variable now. + break; + } + } + } + + // Choices override the cost model + const char *ptr = std::getenv("CODEPLAY_VECZ_CHOICES"); + if (ptr) { + const bool success = vecz_opts.choices.parseChoicesString(ptr); + if (!success) { + llvm::errs() << "failed to parse the CODEPLAY_VECZ_CHOICES variable\n"; + } + } + + env_var_opts.vecz_pass_opts.push_back(vecz_opts); + if (add_vvp) { + vecz_opts.choices.enable(vecz::VectorizationChoices::eVectorPredication); + env_var_opts.vecz_pass_opts.push_back(vecz_opts); + } + + // Allow any decisions made on early linking builtins to be overridden + // with an env variable + if (env_debug_prefix) { + const std::string env_name = *env_debug_prefix + "_EARLY_LINK_BUILTINS"; + if (const char *early_link_builtins_env = getenv(env_name.c_str())) { + env_var_opts.early_link_builtins = atoi(early_link_builtins_env) != 0; + } + } + + return env_var_opts; +} + + static bool hostVeczPassOpts( llvm::Function &F, llvm::ModuleAnalysisManager &MAM, llvm::SmallVectorImpl &Opts) { @@ -117,7 +211,16 @@ static bool hostVeczPassOpts( vecz_options.factor = compiler::utils::VectorizationFactor::getFixedWidth(SIMDWidth); - Opts.push_back(vecz_options); + if (getenv("CA_HOST_VF")) { /* When CA_HOST_VF is set, the environment-derived options replace vecz_options instead of being appended. */ + auto env_var_opts = HostPassMachinery::processOptimizationOptions( + /*env_debug_prefix*/ {}, vecz_mode); + if 
(env_var_opts.vecz_pass_opts.empty()) { /* NOTE(review): processOptimizationOptions always pushes one entry, so this branch appears unreachable. */ + return false; + } + Opts.assign(env_var_opts.vecz_pass_opts); + } else { + Opts.push_back(vecz_options); + } return true; } @@ -178,6 +281,7 @@ void HostPassMachinery::registerPassCallbacks() { bool HostPassMachinery::handlePipelineElement(llvm::StringRef Name, llvm::ModulePassManager &PM) { + if (Name.consume_front("host-late-passes")) { PM.addPass(getLateTargetPasses()); return true; @@ -238,6 +342,9 @@ llvm::ModulePassManager HostPassMachinery::getKernelFinalizationPasses( llvm::ModulePassManager PM; const compiler::BasePassPipelineTuner tuner(options); + auto env_var_opts = + processOptimizationOptions("CA_HOST", /* vecz_mode*/ {}); + // Forcibly compute the BuiltinInfoAnalysis so that cached retrievals work. PM.addPass(llvm::RequireAnalysisPass()); @@ -246,6 +353,10 @@ llvm::ModulePassManager HostPassMachinery::getKernelFinalizationPasses( PM.addPass(llvm::createModuleToFunctionPassAdaptor( compiler::utils::ReplaceAddressSpaceQualifierFunctionsPass())); + if (env_var_opts.early_link_builtins) { + PM.addPass(compiler::utils::LinkBuiltinsPass()); + } + addPreVeczPasses(PM, tuner); PM.addPass(vecz::RunVeczPass()); diff --git a/modules/compiler/targets/host/source/info.cpp b/modules/compiler/targets/host/source/info.cpp index f51b88735e..27ee6e6da1 100644 --- a/modules/compiler/targets/host/source/info.cpp +++ b/modules/compiler/targets/host/source/info.cpp @@ -101,7 +101,7 @@ HostInfo::HostInfo(host::arch arch, host::os os, vectorizable = true; dma_optimizable = true; - scalable_vector_support = false; + scalable_vector_support = getenv("CA_HOST_VF") ? true : false; /* NOTE(review): enabled whenever CA_HOST_VF is set, even when it does not contain the S flag; confirm this is intended. */ kernel_debug = true; #ifdef CA_ENABLE_DEBUG_SUPPORT // Dummy values for testing. 
Enabled only on debug enabled builds with a diff --git a/modules/compiler/targets/host/source/kernel.cpp b/modules/compiler/targets/host/source/kernel.cpp index cb18017388..484387866f 100644 --- a/modules/compiler/targets/host/source/kernel.cpp +++ b/modules/compiler/targets/host/source/kernel.cpp @@ -300,11 +300,11 @@ HostKernel::lookupOrCreateOptimizedKernel(std::array local_size) { } // Host doesn't support scalable values. - if (fn_metadata.min_work_item_factor.isScalable() || - fn_metadata.pref_work_item_factor.isScalable() || - fn_metadata.sub_group_size.isScalable()) { - return cargo::make_unexpected(compiler::Result::FINALIZE_PROGRAM_FAILURE); - } + // if (fn_metadata.min_work_item_factor.isScalable() || + // fn_metadata.pref_work_item_factor.isScalable() || + // fn_metadata.sub_group_size.isScalable()) { + // return cargo::make_unexpected(compiler::Result::FINALIZE_PROGRAM_FAILURE); + // } (NOTE(review): the scalable guard is commented out rather than removed, and the "Host doesn't support scalable values" comment above it is now stale.) // Note that we grab a handle to the module here, which we use to reference // the module going forward. This is despite us passing ownership of the diff --git a/modules/mux/targets/host/source/metadata_hooks.cpp b/modules/mux/targets/host/source/metadata_hooks.cpp index fca30c959f..0e1f438766 100644 --- a/modules/mux/targets/host/source/metadata_hooks.cpp +++ b/modules/mux/targets/host/source/metadata_hooks.cpp @@ -78,17 +78,21 @@ cargo::optional readBinaryMetadata(loader::ElfFile *elf, handler::VectorizeInfoMetadata md; while (handler.read(md)) { // We don't expect scalable vectorization widths on host. + bool isScalable = false; if (md.min_work_item_factor.isScalable() || md.pref_work_item_factor.isScalable()) { - return cargo::nullopt; + printf("Warning: Scalable support is experimental on host target\n"); /* NOTE(review): this warning goes to stdout on every scalable kernel read; confirm that is acceptable here rather than stderr or a logging facility. */ + // return cargo::nullopt; + isScalable = true; } const host::binary_kernel_s kernel{ /*hook*/ 0, std::move(md.kernel_name), + // TODO: Work out sensible values for scalable. 
static_cast(md.local_memory_usage), - md.min_work_item_factor.getFixedValue(), - md.pref_work_item_factor.getFixedValue(), - md.sub_group_size.getFixedValue()}; + isScalable ? 1 : md.min_work_item_factor.getFixedValue(), + isScalable ? 1 : md.pref_work_item_factor.getFixedValue(), + isScalable ? 1 : md.sub_group_size.getFixedValue()}; /* NOTE(review): isScalable is derived from the min/pref factors only; sub_group_size.isScalable() is not checked before getFixedValue() is called on it. */ auto it = kernels.find(md.source_name); if (it != kernels.end()) { it->second.push_back(kernel); diff --git a/source/cl/test/UnitCL/cmake/CompileKernelToBin.cmake b/source/cl/test/UnitCL/cmake/CompileKernelToBin.cmake index 032883b42b..b1a7e75dc8 100644 --- a/source/cl/test/UnitCL/cmake/CompileKernelToBin.cmake +++ b/source/cl/test/UnitCL/cmake/CompileKernelToBin.cmake @@ -222,6 +222,7 @@ endif() # ${CLC_EXECUTABLE} may have other things in it (like a qemu invocation). Turn # it into a CMake list, so that execute_process() isn't confused. string(REPLACE " " ";" CLC_EXECUTABLE "${CLC_EXECUTABLE}") +# message("__CSD__ ${CLC_EXECUTABLE} -d ${DEVICE_NAME} -cl-kernel-arg-info -cl-std=CL${CLC_CL_STD} ${CLC_OPTIONS_LIST} ${DEFS_LIST} -o ${OUTPUT_FILE} -- ${INPUT_FILE}") execute_process( COMMAND ${CLC_EXECUTABLE} -d ${DEVICE_NAME} diff --git a/source/cl/test/UnitCL/kernels/task_01.02_add.bin b/source/cl/test/UnitCL/kernels/task_01.02_add.bin new file mode 100644 index 0000000000..db338fa5de Binary files /dev/null and b/source/cl/test/UnitCL/kernels/task_01.02_add.bin differ