diff --git a/CMakeGTEST.txt.in b/CMakeGTEST.txt.in index 3272b53c..3fb6e832 100644 --- a/CMakeGTEST.txt.in +++ b/CMakeGTEST.txt.in @@ -5,7 +5,7 @@ project(googletest-download NONE) include(ExternalProject) ExternalProject_Add(googletest GIT_REPOSITORY https://github.com/google/googletest.git - GIT_TAG master + GIT_TAG release-1.11.0 SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src" BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build" CONFIGURE_COMMAND "" diff --git a/CMakeLists.txt b/CMakeLists.txt index 92684572..9516c50b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -9,6 +9,7 @@ set(SMI_REWRITER "${CMAKE_BINARY_DIR}/source-rewriter/rewriter") set(SMI_FMAX "480" CACHE STRING "Target Fmax when compiling for hardware.") set(SMI_DEVICES_PER_NODE 2 CACHE STRING "Number of FPGA devices per node.") +set(CL_HPP_TARGET_OPENCL_VERSION 200 CACHE STRING "CL Version") option (ENABLE_TESTS "Enables testing" OFF) # Dependencies @@ -106,13 +107,10 @@ function(smi_target TARGET_NAME CONNECTION_FILE HOST_SOURCE KERNELS NUM_RANKS) ) add_dependencies(${KERNEL_TARGET} rewriter) - # compile FPGA code - set(FPGA_SRC_FILES "${SMI_GENERATED_PATH};${KERNEL_GENERATED_PATH}") - # generate report set(FPGA_REPORT_TARGET ${TARGET_NAME}_${KERNEL_NAME}_aoc_report) add_custom_target(${FPGA_REPORT_TARGET} - COMMAND ${IntelFPGAOpenCL_AOC} ${AOC_COMMAND} ${FPGA_SRC_FILES} -rtl -report + COMMAND ${IntelFPGAOpenCL_AOC} ${KERNEL_GENERATED_PATH} -I${KERNEL_BIN_DIR} ${AOC_COMMAND} -rtl -report WORKING_DIRECTORY ${KERNEL_BIN_DIR} ) add_dependencies(${FPGA_REPORT_TARGET} ${KERNEL_TARGET}) @@ -120,7 +118,7 @@ function(smi_target TARGET_NAME CONNECTION_FILE HOST_SOURCE KERNELS NUM_RANKS) # build hardware set(FPGA_BUILD_TARGET ${TARGET_NAME}_${KERNEL_NAME}_aoc_build) add_custom_target(${FPGA_BUILD_TARGET} - COMMAND ${IntelFPGAOpenCL_AOC} ${AOC_COMMAND} ${FPGA_SRC_FILES} + COMMAND ${IntelFPGAOpenCL_AOC} ${KERNEL_GENERATED_PATH} -I${KERNEL_BIN_DIR} ${AOC_COMMAND} WORKING_DIRECTORY ${KERNEL_BIN_DIR} ) add_dependencies(${FPGA_BUILD_TARGET} ${KERNEL_TARGET}) @@ -179,15 +177,13 @@ function(smi_target TARGET_NAME CONNECTION_FILE HOST_SOURCE KERNELS NUM_RANKS) list(GET FPGA_SOURCES ${KERNEL_INDEX} USER_DEVICE_SRC) list(GET FPGA_GENERATED_SOURCES ${KERNEL_INDEX} SMI_DEVICE_SRC) - set(FPGA_SRC_FILES "${SMI_DEVICE_SRC};${USER_DEVICE_SRC}") set(EMULATION_WORKDIR ${WORKDIR}/emulator_${SMI_EMULATION_RANK}) file(MAKE_DIRECTORY ${EMULATION_WORKDIR}) set(EMULATOR_TARGET_RANK ${EMULATOR_TARGET}_${SMI_EMULATION_RANK}) add_custom_target(${EMULATOR_TARGET_RANK} - COMMAND ${IntelFPGAOpenCL_AOC} ${AOC_COMMAND} ${FPGA_SRC_FILES} -march=emulator + COMMAND ${IntelFPGAOpenCL_AOC} ${USER_DEVICE_SRC} -I${KERNEL_BIN_DIR} ${AOC_COMMAND} -march=emulator -DSMI_EMULATION_RANK=${SMI_EMULATION_RANK} - -emulator-channel-depth-model=strict WORKING_DIRECTORY ${EMULATION_WORKDIR} ) add_dependencies(${EMULATOR_TARGET} ${EMULATOR_TARGET_RANK}) @@ -245,7 +241,7 @@ function(fpga_target TARGET_NAME HOST_SOURCE KERNEL GENERATE_KERNEL) COMMAND ${IntelFPGAOpenCL_AOC} ${AOC_COMMAND} ${KERNEL} WORKING_DIRECTORY ${KERNEL_BIN_DIR} ) - if(USE_CODEGEN) + if(GENERATE_KERNEL) add_dependencies(${FPGA_BUILD_TARGET} generate_${KERNEL_NAME}) endif() @@ -267,7 +263,7 @@ function(fpga_target TARGET_NAME HOST_SOURCE KERNEL GENERATE_KERNEL) -emulator-channel-depth-model=strict WORKING_DIRECTORY ${EMULATION_WORKDIR} ) - if(USE_CODEGEN) + if(GENERATE_KERNEL) add_dependencies(${EMULATOR_TARGET} generate_${KERNEL_NAME}) endif() endfunction() diff --git a/README.md b/README.md index 93714ea1..32d8e909 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,7 @@ make stencil_smi_emulator -j make stencil_smi_host cd stencil_smi # Execute the program -env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./stencil_smi_host emulator +env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 ./stencil_smi_host emulator ``` To generate the report, from the `examples` directory in the CMake folder, the user must execute: diff --git a/codegen/templates/host.cl b/codegen/templates/host.cl index 35b19e7c..83a4b7da 100644 --- a/codegen/templates/host.cl +++ b/codegen/templates/host.cl @@ -104,7 +104,7 @@ SMI_Comm SmiInit_{{ name }}( const int num_kernels = kernel_names.size(); for (int i = num_kernels - 1; i >= 0; i--) { - queues[i].enqueueTask(kernels[i]); + queues[i].enqueueNDRangeKernel(kernels[i], cl::NullRange, cl::NDRange(1)); queues[i].flush(); } diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index f1ae8412..57ec6f36 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -37,6 +37,7 @@ if(PythonInterp_FOUND) #onchip versions fpga_target(gesummv_onchip "${CMAKE_CURRENT_SOURCE_DIR}/host/gesummv_onchip.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gesummv_onchip.cl" OFF) + target_link_libraries(gesummv_onchip_host openblas) fpga_target(stencil_onchip "${CMAKE_CURRENT_SOURCE_DIR}/host/stencil_onchip.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/stencil_onchip.cl" ON) endif() diff --git a/examples/host/gesummv_onchip.cpp b/examples/host/gesummv_onchip.cpp index f8bde349..b1dd33d4 100644 --- a/examples/host/gesummv_onchip.cpp +++ b/examples/host/gesummv_onchip.cpp @@ -15,6 +15,11 @@ #include #define TILE_SIZE 128 //define this as used in the opencl kernel +#if !defined(CL_CHANNEL_1_INTELFPGA) +// include this header if channel macros are not defined in cl.hpp (versions >=19.0) +#include "CL/cl_ext_intelfpga.h" +#endif + using namespace std; float *A,*B,*x,*y; float *fpga_res_y; @@ -150,7 +155,7 @@ void testStreamed(std::string program_path,int n, int m, float alpha, float beta comp_start=current_time_usecs(); asm volatile("": : :"memory"); for(int i=0;i=19.0) +#include "CL/cl_ext_intelfpga.h" +#endif + using namespace std; float *A,*B,*x,*y; float *fpga_res_y; @@ -99,7 +104,7 @@ int main(int argc, char *argv[]) case 'r': runs=atoi(optarg); break; - case'k': + case 'k': { rank=atoi(optarg); if(rank!=0 && rank!=1) @@ -166,10 +171,15 @@ int main(int argc, char *argv[]) generate_float_matrix(B,n,m); - hlslib::ocl::Context context(fpga); - auto program = context.MakeProgram(program_path); + hlslib::ocl::Context *context; + if (emulator) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, fpga); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, fpga); + } + auto program = context->MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm=SmiInit_gesummv_rank0(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm=SmiInit_gesummv_rank0(rank, rank_count, ROUTING_DIR, *context, program, buffers); int tile_size=128; @@ -182,10 +192,10 @@ int main(int argc, char *argv[]) // Create device buffers size_t elem_per_module=n*m/2; - hlslib::ocl::Buffer input_x = context.MakeBuffer(hlslib::ocl::MemoryBank::bank2, m); - hlslib::ocl::Buffer output_y = context.MakeBuffer(hlslib::ocl::MemoryBank::bank3, n); - hlslib::ocl::Buffer input_M_0 = context.MakeBuffer(hlslib::ocl::MemoryBank::bank0, elem_per_module); - hlslib::ocl::Buffer input_M_1 = context.MakeBuffer(hlslib::ocl::MemoryBank::bank1, elem_per_module); + hlslib::ocl::Buffer input_x = context->MakeBuffer(hlslib::ocl::MemoryBank::bank2, m); + hlslib::ocl::Buffer output_y = context->MakeBuffer(hlslib::ocl::MemoryBank::bank3, n); + hlslib::ocl::Buffer input_M_0 = context->MakeBuffer(hlslib::ocl::MemoryBank::bank0, elem_per_module); + hlslib::ocl::Buffer input_M_1 = context->MakeBuffer(hlslib::ocl::MemoryBank::bank1, elem_per_module); // Create kernels diff --git a/examples/host/kmeans_smi.cpp b/examples/host/kmeans_smi.cpp index 70b1f61b..e291fe18 100644 --- a/examples/host/kmeans_smi.cpp +++ b/examples/host/kmeans_smi.cpp @@ -5,6 +5,7 @@ #include "hlslib/intel/OpenCL.h" #include "kmeans.h" #include "common.h" +#include #define __HOST_PROGRAM__ #include @@ -56,7 +57,7 @@ int main(int argc, char **argv) { std::string mode_str(argv[1]); std::string kernel_path; if (mode_str == "emulator") { - setenv("CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA", "1", false); + setenv("CL_CONFIG_CPU_EMULATE_DEVICES", "1", false); emulator = true; // In emulation mode, each rank has its own kernel file kernel_path = @@ -167,17 +168,22 @@ int main(int argc, char **argv) { try { - MPIStatus(mpi_rank, "Creating OpenCL context...\n"); - hlslib::ocl::Context context(emulator ? 0 : (mpi_rank % kDevicesPerNode)); + MPIStatus(mpi_rank, "Creating OpenCL conext...\n"); + hlslib::ocl::Context *context; + if (emulator) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, (mpi_rank % kDevicesPerNode)); + } MPIStatus(mpi_rank, "Allocating and copying device memory...\n"); - auto points_device = context.MakeBuffer( + auto points_device = context->MakeBuffer( points.cbegin(), points.cend()); auto centroids_device_read = - context.MakeBuffer(centroids.cbegin(), + context->MakeBuffer(centroids.cbegin(), centroids.cend()); auto centroids_device_write = - context.MakeBuffer(centroids.cbegin(), + context->MakeBuffer(centroids.cbegin(), centroids.cend()); std::vector> routing_tables_cks_device(kChannelsPerRank); @@ -185,15 +191,15 @@ int main(int argc, char **argv) { routing_tables_ckr_device(kChannelsPerRank); for (int i = 0; i < kChannelsPerRank; ++i) { routing_tables_cks_device[i] = - context.MakeBuffer( + context->MakeBuffer( routing_tables_cks[i].cbegin(), routing_tables_cks[i].cend()); routing_tables_ckr_device[i] = - context.MakeBuffer( + context->MakeBuffer( routing_tables_ckr[i].cbegin(), routing_tables_ckr[i].cend()); } MPIStatus(mpi_rank, "Creating program from binary...\n"); - auto program = context.MakeProgram(kernel_path); + auto program = context->MakeProgram(kernel_path); MPIStatus(mpi_rank, "Starting communication kernels...\n"); std::vector comm_kernels; @@ -251,7 +257,7 @@ int main(int argc, char **argv) { //for (auto &k : kernels) { for(int i=0;i<3;i++){ - //futures.emplace_back(k.ExecuteTaskAsync()); //HLSLIB + //futures.emplace_back(k.ExecuteTaskFork()); //HLSLIB cl::CommandQueue queue=kernels[i].commandQueue(); queue.enqueueTask(kernels[i].kernel(),nullptr, &events[i]); //queue.flush(); @@ -264,7 +270,7 @@ int main(int argc, char **argv) { }*/ //for (auto &k : kernels) { for(int i=0;i<3;i++){ - //futures.emplace_back(k.ExecuteTaskAsync()); HLSLIB + //futures.emplace_back(k.ExecuteTaskFork()); HLSLIB //cl::CommandQueue queue=k.commandQueue(); //queue.finish(); events[i].wait(); diff --git a/examples/host/stencil_onchip.cpp b/examples/host/stencil_onchip.cpp index 5dd75fa1..a4cf21b3 100644 --- a/examples/host/stencil_onchip.cpp +++ b/examples/host/stencil_onchip.cpp @@ -4,6 +4,7 @@ #include #include "hlslib/intel/OpenCL.h" #include "stencil.h" +#include // Convert from C to C++ using Data_t = DTYPE; @@ -78,9 +79,10 @@ int main(int argc, char **argv) { std::string kernel_path; if (mode_str == "emulator") { emulator = true; - kernel_path = "stencil_spatial_tiling_emulator.aocx"; + kernel_path = "emulator/stencil_onchip.aocx"; } else if (mode_str == "hardware") { - kernel_path = "stencil_spatial_tiling_hardware.aocx"; + // TODO: find the right path + kernel_path = "stencil_onchip.aocx"; emulator = false; } else { std::cout << kUsage; @@ -105,10 +107,15 @@ int main(int argc, char **argv) { // Create OpenCL kernels std::cout << "Creating OpenCL context...\n" << std::flush; - hlslib::ocl::Context context; + hlslib::ocl::Context *context; + if (emulator) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, 0); + } std::cout << "Allocating device memory...\n" << std::flush; std::cout << "Creating program from binary...\n" << std::flush; - auto program = context.MakeProgram(kernel_path); + auto program = context->MakeProgram(kernel_path); std::cout << "Creating kernels...\n" << std::flush; std::vector kernels; std::vector> @@ -119,7 +126,7 @@ int main(int argc, char **argv) { for (int i = 0; i < kPX; ++i) { for (int j = 0; j < kPY; ++j) { auto device_buffer = - context.MakeBuffer( + context->MakeBuffer( banks[(i * kPY + j) % banks.size()], 2 * kXLocal * kYLocal); const std::string suffix("_" + std::to_string(i) + "_" + std::to_string(j)); diff --git a/examples/host/stencil_smi.cpp b/examples/host/stencil_smi.cpp index 3971480c..6d6d2ead 100644 --- a/examples/host/stencil_smi.cpp +++ b/examples/host/stencil_smi.cpp @@ -8,6 +8,7 @@ #include "common.h" #define __HOST_PROGRAM__ #include "hlslib/intel/OpenCL.h" +#include #include "stencil.h" #include @@ -24,7 +25,7 @@ constexpr int kXLocal = kX / kPX; constexpr int kYLocal = kY / kPY; constexpr auto kDevicesPerNode = SMI_DEVICES_PER_NODE; constexpr auto kUsage = - "Usage: ./stencil_smi_interleaved <[emulator/hardware]> \n"; + "Usage: ./stencil_smi <[emulator/hardware]> \n"; using AlignedVec_t = std::vector>; @@ -143,10 +144,10 @@ int main(int argc, char **argv) { std::string mode_str(argv[1]); std::string kernel_path; if (mode_str == "emulator") { - setenv("CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA", "1", false); + setenv("CL_CONFIG_CPU_EMULATE_DEVICES", "1", false); emulator = true; // In emulation mode, each rank has its own kernel file - kernel_path = ("emulator_" + std::to_string(mpi_rank) + "/stencil_smi_interleaved.aocx"); + kernel_path = ("emulator_" + std::to_string(mpi_rank) + "/stencil_smi.aocx"); } else if (mode_str == "hardware") { kernel_path = "stencil_smi/stencil_smi.aocx"; emulator = false; @@ -205,9 +206,14 @@ int main(int argc, char **argv) { MPIStatus(mpi_rank, "Creating OpenCL context...\n"); try { - hlslib::ocl::Context context(emulator ? 0 : (mpi_rank % kDevicesPerNode)); + hlslib::ocl::Context *context; + if (emulator) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, (mpi_rank % kDevicesPerNode)); + } MPIStatus(mpi_rank, "Creating program from binary...\n"); - auto program = context.MakeProgram(kernel_path); + auto program = context->MakeProgram(kernel_path); MPI_Barrier(MPI_COMM_WORLD); @@ -219,7 +225,7 @@ int main(int argc, char **argv) { device_buffers; for (int b = 0; b < kMemoryBanks; ++b) { auto device_buffer = - context.MakeBuffer( + context->MakeBuffer( banks[b % banks.size()], 2 * kXLocal * kYLocal / kMemoryBanks); device_buffer.CopyFromHost(0, kXLocal * kYLocal / kMemoryBanks, interleaved_host[b].cbegin()); @@ -234,10 +240,10 @@ int main(int argc, char **argv) { routing_tables_ckr_device(kChannelsPerRank); for (int i = 0; i < kChannelsPerRank; ++i) { routing_tables_cks_device[i] = - context.MakeBuffer( + context->MakeBuffer( routing_tables_cks[i].cbegin(), routing_tables_cks[i].cend()); routing_tables_ckr_device[i] = - context.MakeBuffer( + context->MakeBuffer( routing_tables_ckr[i].cbegin(), routing_tables_ckr[i].cend()); } diff --git a/examples/kernels/gesummv_rank0.cl b/examples/kernels/gesummv_rank0.cl index 199424a0..2487f0e1 100644 --- a/examples/kernels/gesummv_rank0.cl +++ b/examples/kernels/gesummv_rank0.cl @@ -14,6 +14,7 @@ */ +#include "smi_generated_device.cl" #pragma OPENCL EXTENSION cl_intel_channels : enable diff --git a/examples/kernels/gesummv_rank1.cl b/examples/kernels/gesummv_rank1.cl index 7575c39b..87bb9c52 100644 --- a/examples/kernels/gesummv_rank1.cl +++ b/examples/kernels/gesummv_rank1.cl @@ -14,6 +14,7 @@ */ +#include "smi_generated_device.cl" #pragma OPENCL EXTENSION cl_intel_channels : enable diff --git a/examples/kernels/kmeans_smi.cl b/examples/kernels/kmeans_smi.cl index fef9c812..63ebf650 100644 --- a/examples/kernels/kmeans_smi.cl +++ b/examples/kernels/kmeans_smi.cl @@ -3,6 +3,7 @@ */ #include +#include "smi_generated_device.cl" #include "kmeans.h" // Every channel is set to twice the size it needs to be diff --git a/examples/kernels/stencil_smi.cl b/examples/kernels/stencil_smi.cl index c3e126e4..a0caed06 100644 --- a/examples/kernels/stencil_smi.cl +++ b/examples/kernels/stencil_smi.cl @@ -1,5 +1,6 @@ #include +#include "smi_generated_device.cl" #include "stencil.h" #if PX * PY != 8 diff --git a/hlslib b/hlslib index bb51b208..6cbb1ffd 160000 --- a/hlslib +++ b/hlslib @@ -1 +1 @@ -Subproject commit bb51b208e7e64cd2cc85a3adf5819c43cfce9a6c +Subproject commit 6cbb1ffd5eb212499f8a2b4d3ea2274c9e8cd88b diff --git a/include/utils/ocl_utils.hpp b/include/utils/ocl_utils.hpp index 944d2029..c59b7766 100644 --- a/include/utils/ocl_utils.hpp +++ b/include/utils/ocl_utils.hpp @@ -15,12 +15,12 @@ #include #include #include +#include -#include "CL/cl.hpp" -#if !defined(CL_CHANNEL_1_INTELFPGA) -// include this header if channel macros are not defined in cl.hpp (versions >=19.0) -#include "CL/cl_ext_intelfpga.h" -#endif +#include "CL/cl2.hpp" + +#define VENDOR_STRING "Intel(R) FPGA SDK for OpenCL(TM)" +#define VENDOR_STRING_EMULATION "Intel(R) FPGA Emulation Platform for OpenCL(TM)" /** * @brief The IntelFPGAOCLUtils class contains a set of basic utilities for interacting @@ -39,14 +39,21 @@ class IntelFPGAOCLUtils{ { cl_int status; - bool found=findPlatform("Intel(R) FPGA SDK for OpenCL(TM)",platform); + bool found=findPlatform(VENDOR_STRING,platform); + bool emulation = false; if(!found) { - std::cerr<< "ERROR: Unable to find Intel(R) FPGA OpenCL platform" < devices; - status=platform.getDevices(CL_DEVICE_TYPE_ACCELERATOR,&devices); + status=platform.getDevices(CL_DEVICE_TYPE_ALL,&devices); checkError(status, __FILE__,__LINE__, "Query for device failed"); if(device_id>devices.size()) checkError(status, __FILE__,__LINE__, "Device id not present"); @@ -55,7 +62,7 @@ class IntelFPGAOCLUtils{ // Create the context context=cl::Context({device}); //create the program - createProgramFromBinary(context, program,program_path.c_str(),device); + createProgramFromBinary(context, program,program_path.c_str(),device, emulation); return true; } @@ -198,7 +205,7 @@ class IntelFPGAOCLUtils{ /** * @brief createProgramFromBinary loads the program froma binary file */ - static void createProgramFromBinary(cl::Context &context, cl::Program &program, const char *binary_file_name, cl::Device &device) { + static void createProgramFromBinary(cl::Context &context, cl::Program &program, const char *binary_file_name, cl::Device &device, bool build) { // Early exit for potentially the most common way to fail: AOCX does not exist. if(!fileExists(binary_file_name)) { std::cerr<< "AOCX file '"< buf = loadBinaryFile(binary_file_name); + //create the vector with the binaries to pass to the constructor of cl::Program + cl::Program::Binaries binaries{buf}; std::vector dev; dev.push_back(device); - //create the vector with the binaries to pass to the constructor of cl::Program - std::vector> binaries; - binaries.push_back(std::make_pair(binary,binary_size)); - std::vector status(1); - program=cl::Program(context,{device},binaries,&status); + cl_int status; + program=cl::Program(context,dev,binaries,NULL,&status); + checkError(status, __FILE__,__LINE__, "Failed to create program with binary"); - checkError(status[0], __FILE__,__LINE__, "Failed to create program with binary"); + if (build) { + // build Program to support CPU Emulation + checkError(program.build(), __FILE__,__LINE__, "Failed to build program"); + } } static bool fileExists(const char *file_name) { @@ -228,33 +234,21 @@ class IntelFPGAOCLUtils{ } // Loads a file in binary form. - static unsigned char *loadBinaryFile(const char *file_name, size_t *size) { + static std::vector loadBinaryFile(const char *file_name) { // Open the File - FILE* fp; - fp = fopen(file_name, "rb"); - if(fp == 0) { - return NULL; - } - - // Get the size of the file - fseek(fp, 0, SEEK_END); - *size = ftell(fp); - - // Allocate space for the binary - unsigned char *binary = new unsigned char[*size]; - - // Go back to the file start - rewind(fp); - - // Read the file into the binary - if(fread((void*)binary, *size, 1, fp) == 0) { - delete[] binary; - fclose(fp); - return NULL; - } - fclose(fp); - - return binary; + std::ifstream aocxStream(file_name, std::ifstream::binary); + if (!aocxStream.is_open()) { + checkError(CL_INVALID_PROGRAM,__FILE__,__LINE__, "Failed to load binary file"); + } + + // Read in file contents and create program from binaries + aocxStream.seekg(0, aocxStream.end); + long file_size = aocxStream.tellg(); + aocxStream.seekg(0, aocxStream.beg); + std::vector buf(file_size); + aocxStream.read(reinterpret_cast(buf.data()), file_size); + + return buf; } /** diff --git a/microbenchmarks/host/bandwidth_benchmark.cpp b/microbenchmarks/host/bandwidth_benchmark.cpp index 4186ef02..23ca54ba 100644 --- a/microbenchmarks/host/bandwidth_benchmark.cpp +++ b/microbenchmarks/host/bandwidth_benchmark.cpp @@ -120,14 +120,19 @@ int main(int argc, char *argv[]) gethostname(hostname, HOST_NAME_MAX); std::cout << "Rank" << rank<<" executing on host:" <MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm=SmiInit_bandwidth_0(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm=SmiInit_bandwidth_0(rank, rank_count, ROUTING_DIR, *context, program, buffers); // Create device buffers - hlslib::ocl::Buffer check_0 = context.MakeBuffer(1); - hlslib::ocl::Buffer check_1 = context.MakeBuffer(1); + hlslib::ocl::Buffer check_0 = context->MakeBuffer(1); + hlslib::ocl::Buffer check_1 = context->MakeBuffer(1); // Create kernel char dest=(char)recv_rank; diff --git a/microbenchmarks/host/broadcast_benchmark.cpp b/microbenchmarks/host/broadcast_benchmark.cpp index 9b549ab0..63dff712 100644 --- a/microbenchmarks/host/broadcast_benchmark.cpp +++ b/microbenchmarks/host/broadcast_benchmark.cpp @@ -88,13 +88,18 @@ int main(int argc, char *argv[]) gethostname(hostname, HOST_NAME_MAX); std::cout << "Rank" << rank<<" executing on host:" <MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm=SmiInit_broadcast(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm=SmiInit_broadcast(rank, rank_count, ROUTING_DIR, *context, program, buffers); // Create device buffer - hlslib::ocl::Buffer check = context.MakeBuffer(1); + hlslib::ocl::Buffer check = context->MakeBuffer(1); // Create kernel hlslib::ocl::Kernel kernel = program.MakeKernel("app", check, n, root, comm); diff --git a/microbenchmarks/host/gather_benchmark.cpp b/microbenchmarks/host/gather_benchmark.cpp index 65eeb94c..db93a5ad 100644 --- a/microbenchmarks/host/gather_benchmark.cpp +++ b/microbenchmarks/host/gather_benchmark.cpp @@ -88,13 +88,18 @@ int main(int argc, char *argv[]) std::cout << "Rank" << rank<<" executing on host:" <MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm=SmiInit_gather(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm=SmiInit_gather(rank, rank_count, ROUTING_DIR, *context, program, buffers); // Create device buffer - hlslib::ocl::Buffer check = context.MakeBuffer(1); + hlslib::ocl::Buffer check = context->MakeBuffer(1); // Create kernel hlslib::ocl::Kernel kernel = program.MakeKernel("app", n, root, check, comm); diff --git a/microbenchmarks/host/injection_rate_benchmark.cpp b/microbenchmarks/host/injection_rate_benchmark.cpp index 615c52e8..7408273a 100644 --- a/microbenchmarks/host/injection_rate_benchmark.cpp +++ b/microbenchmarks/host/injection_rate_benchmark.cpp @@ -123,10 +123,15 @@ int main(int argc, char *argv[]) gethostname(hostname, 256); std::cout << "Rank" << rank<<" executing on host:" <MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm=SmiInit_injection_rate_0(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm=SmiInit_injection_rate_0(rank, rank_count, ROUTING_DIR, *context, program, buffers); //create kernel hlslib::ocl::Kernel app = (rank==0)?program.MakeKernel("app", n, recv_rank, comm) : program.MakeKernel("app", n,comm) ; diff --git a/microbenchmarks/host/latency_benchmark.cpp b/microbenchmarks/host/latency_benchmark.cpp index 7ed2da08..a0a51a91 100644 --- a/microbenchmarks/host/latency_benchmark.cpp +++ b/microbenchmarks/host/latency_benchmark.cpp @@ -127,10 +127,15 @@ int main(int argc, char *argv[]) gethostname(hostname, HOST_NAME_MAX); std::cout << "Rank" << rank<<" executing on host:" <MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm=SmiInit_latency_0(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm=SmiInit_latency_0(rank, rank_count, ROUTING_DIR, *context, program, buffers); //create kernel hlslib::ocl::Kernel app = (rank==0)?program.MakeKernel("app", n, recv_rank, comm) : program.MakeKernel("app", n, comm) ; diff --git a/microbenchmarks/host/multi_collectives_benchmark.cpp b/microbenchmarks/host/multi_collectives_benchmark.cpp index 98b5c2d6..13538cdd 100644 --- a/microbenchmarks/host/multi_collectives_benchmark.cpp +++ b/microbenchmarks/host/multi_collectives_benchmark.cpp @@ -88,15 +88,20 @@ int main(int argc, char *argv[]) gethostname(hostname, HOST_NAME_MAX); std::cout << "Rank" << rank<<" executing on host:" <MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm=SmiInit_multi_collectives(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm=SmiInit_multi_collectives(rank, rank_count, ROUTING_DIR, *context, program, buffers); /*---------------------------------------------- * Sequential collectives * ----------------------------------------------*/ - hlslib::ocl::Buffer check = context.MakeBuffer(1); + hlslib::ocl::Buffer check = context->MakeBuffer(1); hlslib::ocl::Kernel kernel_sequential = program.MakeKernel("sequential_collectives", n, root, check, comm); std::vector times_sequential; diff --git a/microbenchmarks/host/reduce_benchmark.cpp b/microbenchmarks/host/reduce_benchmark.cpp index 398f2ebf..b4ed4e58 100644 --- a/microbenchmarks/host/reduce_benchmark.cpp +++ b/microbenchmarks/host/reduce_benchmark.cpp @@ -84,13 +84,18 @@ int main(int argc, char *argv[]) gethostname(hostname, HOST_NAME_MAX); std::cout << "Rank" << rank<<" executing on host:" <MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm=SmiInit_reduce(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm=SmiInit_reduce(rank, rank_count, ROUTING_DIR, *context, program, buffers); // Create device buffer - hlslib::ocl::Buffer check = context.MakeBuffer(1); + hlslib::ocl::Buffer check = context->MakeBuffer(1); // Create kernel hlslib::ocl::Kernel kernel = program.MakeKernel("app", n, root, check, comm); diff --git a/microbenchmarks/host/scatter_benchmark.cpp b/microbenchmarks/host/scatter_benchmark.cpp index 09557c23..e6c439b6 100644 --- a/microbenchmarks/host/scatter_benchmark.cpp +++ b/microbenchmarks/host/scatter_benchmark.cpp @@ -89,13 +89,18 @@ int main(int argc, char *argv[]) std::cout << "Rank" << rank<<" executing on host:" <MakeProgram(program_path); std::vector> buffers; - SMI_Comm comm= SmiInit_scatter(rank, rank_count, ROUTING_DIR, context, program, buffers); + SMI_Comm comm= SmiInit_scatter(rank, rank_count, ROUTING_DIR, *context, program, buffers); // Create device buffer - hlslib::ocl::Buffer check = context.MakeBuffer(1); + hlslib::ocl::Buffer check = context->MakeBuffer(1); // Create kernel hlslib::ocl::Kernel kernel = program.MakeKernel("app", n, root, check, comm); diff --git a/microbenchmarks/kernels/bandwidth_0.cl b/microbenchmarks/kernels/bandwidth_0.cl index 5ff790ca..d04aac66 100644 --- a/microbenchmarks/kernels/bandwidth_0.cl +++ b/microbenchmarks/kernels/bandwidth_0.cl @@ -8,6 +8,8 @@ #include +#include "smi_generated_device.cl" + __kernel void app(const int N, const char dest_rank, const SMI_Comm comm) { diff --git a/microbenchmarks/kernels/bandwidth_1.cl b/microbenchmarks/kernels/bandwidth_1.cl index 03c446c6..d3fbd33a 100644 --- a/microbenchmarks/kernels/bandwidth_1.cl +++ b/microbenchmarks/kernels/bandwidth_1.cl @@ -9,6 +9,8 @@ #include +#include "smi_generated_device.cl" + __kernel void app(__global char *mem, const int N, SMI_Comm comm) { SMI_Channel chan=SMI_Open_receive_channel_ad(N, SMI_DOUBLE, 0, 0, comm, 2048); diff --git a/microbenchmarks/kernels/broadcast.cl b/microbenchmarks/kernels/broadcast.cl index 7bc27e44..513d2e77 100644 --- a/microbenchmarks/kernels/broadcast.cl +++ b/microbenchmarks/kernels/broadcast.cl @@ -6,6 +6,8 @@ #pragma OPENCL EXTENSION cl_intel_channels : enable #include +#include "smi_generated_device.cl" + __kernel void app(__global char* mem, const int N, char root,SMI_Comm comm) { char check=1; diff --git a/microbenchmarks/kernels/gather.cl b/microbenchmarks/kernels/gather.cl index b002a6fa..9f91ba79 100644 --- a/microbenchmarks/kernels/gather.cl +++ b/microbenchmarks/kernels/gather.cl @@ -5,6 +5,7 @@ #include +#include "smi_generated_device.cl" __kernel void app(const int N, char root, __global char *mem, SMI_Comm comm) { diff --git a/microbenchmarks/kernels/injection_rate_0.cl b/microbenchmarks/kernels/injection_rate_0.cl index 24dfe52b..bc75bae7 100644 --- a/microbenchmarks/kernels/injection_rate_0.cl +++ b/microbenchmarks/kernels/injection_rate_0.cl @@ -7,7 +7,7 @@ #include - +#include "smi_generated_device.cl" __kernel void app(const int N, const char dst, SMI_Comm comm) { diff --git a/microbenchmarks/kernels/injection_rate_1.cl b/microbenchmarks/kernels/injection_rate_1.cl index c5e2cb9f..4f0a8295 100644 --- a/microbenchmarks/kernels/injection_rate_1.cl +++ b/microbenchmarks/kernels/injection_rate_1.cl @@ -7,6 +7,7 @@ #include +#include "smi_generated_device.cl" __kernel void app(const int N,SMI_Comm comm) { diff --git a/microbenchmarks/kernels/latency_0.cl b/microbenchmarks/kernels/latency_0.cl index fc1943e7..735ffebe 100644 --- a/microbenchmarks/kernels/latency_0.cl +++ b/microbenchmarks/kernels/latency_0.cl @@ -15,7 +15,7 @@ #include - +#include "smi_generated_device.cl" __kernel void app(const int N, char dest_rank,SMI_Comm comm) { diff --git a/microbenchmarks/kernels/latency_1.cl b/microbenchmarks/kernels/latency_1.cl index f7e91163..e14879fd 100644 --- a/microbenchmarks/kernels/latency_1.cl +++ b/microbenchmarks/kernels/latency_1.cl @@ -16,6 +16,7 @@ */ #include +#include "smi_generated_device.cl" __kernel void app(const int N, SMI_Comm comm) { diff --git a/microbenchmarks/kernels/multi_collectives.cl b/microbenchmarks/kernels/multi_collectives.cl index e0944d0e..f3155124 100644 --- a/microbenchmarks/kernels/multi_collectives.cl +++ b/microbenchmarks/kernels/multi_collectives.cl @@ -14,6 +14,7 @@ #include +#include "smi_generated_device.cl" __kernel void sequential_collectives(const int N, char root, __global volatile char *mem, SMI_Comm comm) { diff --git a/microbenchmarks/kernels/reduce.cl b/microbenchmarks/kernels/reduce.cl index e7671ea3..d4c6ebcb 100644 --- a/microbenchmarks/kernels/reduce.cl +++ b/microbenchmarks/kernels/reduce.cl @@ -6,6 +6,8 @@ #pragma OPENCL EXTENSION cl_intel_channels : enable #include +#include "smi_generated_device.cl" + __kernel void app(const int N, char root, __global volatile char *mem, SMI_Comm comm) { unsigned int my_rank=SMI_Comm_rank(comm); diff --git a/microbenchmarks/kernels/scatter.cl b/microbenchmarks/kernels/scatter.cl index 8a889b66..5f348c7d 100644 --- a/microbenchmarks/kernels/scatter.cl +++ b/microbenchmarks/kernels/scatter.cl @@ -6,7 +6,7 @@ #include - +#include "smi_generated_device.cl" __kernel void app(const int N, char root,__global char* mem, SMI_Comm comm) { diff --git a/source-rewriter/src/utils.cpp b/source-rewriter/src/utils.cpp index 78552409..aa4f5e6b 100644 --- a/source-rewriter/src/utils.cpp +++ b/source-rewriter/src/utils.cpp @@ -1,5 +1,7 @@ #include "utils.h" +#include +#include #include using namespace clang; diff --git a/source-rewriter/src/utils.h b/source-rewriter/src/utils.h index 04db40df..61ac23ac 100644 --- a/source-rewriter/src/utils.h +++ b/source-rewriter/src/utils.h @@ -1,6 +1,6 @@ #pragma once -#include +#include bool isKernelFunction(clang::FunctionDecl* decl); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 46cac02a..a4b8d957 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -26,18 +26,11 @@ add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src ${CMAKE_CURRENT_BINARY_DIR}/googletest-build EXCLUDE_FROM_ALL) -# The gtest/gtest_main targets carry header search path -# dependencies automatically when using CMake 2.8.11 or -# later. Otherwise we have to add them here ourselves. -if (CMAKE_VERSION VERSION_LESS 2.8.11) - include_directories("${gtest_SOURCE_DIR}/include") -endif() - -# Now simply link against gtest or gtest_main as needed. Eg -include_directories(${CMAKE_CURRENT_BINARY_DIR}) +#include the gtest header directories include_directories("${gtest_SOURCE_DIR}/include") -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -lgtest") +# Now simply link against gtest or gtest_main as needed. Eg +link_libraries(gtest_main) #p2p smi_target(test_p2p "${CMAKE_CURRENT_SOURCE_DIR}/p2p/p2p.json" "${CMAKE_CURRENT_SOURCE_DIR}/p2p/test_p2p.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/p2p/p2p_rank0.cl;${CMAKE_CURRENT_SOURCE_DIR}/p2p/p2p_rank1.cl" 8) @@ -45,7 +38,7 @@ target_link_libraries(test_p2p_host gtest) add_test( NAME p2p - COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_p2p_host + COMMAND env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 test_p2p_host WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_p2p/" ) @@ -56,7 +49,7 @@ target_link_libraries(test_broadcast_host gtest) add_test( NAME broadcast - COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_broadcast_host + COMMAND env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 test_broadcast_host WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_broadcast/" ) @@ -66,7 +59,7 @@ target_link_libraries(test_reduce_host gtest) add_test( NAME reduce - COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_reduce_host + COMMAND env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 test_reduce_host WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_reduce/" ) @@ -76,7 +69,7 @@ target_link_libraries(test_scatter_host gtest) add_test( NAME scatter - COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_scatter_host + COMMAND env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 test_scatter_host WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_scatter/" ) @@ -85,7 +78,7 @@ target_link_libraries(test_gather_host gtest) add_test( NAME gather - COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_gather_host + COMMAND env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 test_gather_host WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_gather/" ) @@ -95,6 +88,6 @@ target_link_libraries(test_mixed_host gtest) add_test( NAME mixed - COMMAND env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 test_mixed_host + COMMAND env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 test_mixed_host WORKING_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/test_mixed/" ) diff --git a/test/README.md b/test/README.md index 4007bca1..99fb62cf 100644 --- a/test/README.md +++ b/test/README.md @@ -38,7 +38,7 @@ To test a primitive, in the `test` folder of the Cmake folder: 3. execute the test program from the respective working directory `test_/` - `env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./test__host` + `env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 ./test__host` or simply use the integration with `ctest` diff --git a/test/broadcast/broadcast.cl b/test/broadcast/broadcast.cl index c8a371f4..87019721 100644 --- a/test/broadcast/broadcast.cl +++ b/test/broadcast/broadcast.cl @@ -5,6 +5,7 @@ #include +#include "smi_generated_device.cl" __kernel void test_int(__global char* mem, const int N, char root,SMI_Comm comm) { diff --git a/test/broadcast/test_broadcast.cpp b/test/broadcast/test_broadcast.cpp index aebb6fad..d022c92c 100644 --- a/test/broadcast/test_broadcast.cpp +++ b/test/broadcast/test_broadcast.cpp @@ -327,16 +327,20 @@ TEST(Broadcast, FloatMessagesAD) int main(int argc, char *argv[]) { - //std::cerr << "Usage: [env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 " << argv[0] << " []" << std::endl; + //std::cerr << "Usage: [env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 " << argv[0] << " []" << std::endl; int result = 0; + bool emulation; ::testing::InitGoogleTest(&argc, argv); //delete listeners for all the rank except 0 - if(argc==2) + if(argc==2) { + emulation = false; program_path =argv[1]; - else + } else { + emulation = true; program_path="emulator_/broadcast.aocx"; + } ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); CHECK_MPI(MPI_Init(&argc, &argv)); @@ -350,7 +354,11 @@ int main(int argc, char *argv[]) //create environemnt int fpga=my_rank%2; program_path = replace(program_path, "", std::to_string(my_rank)); - context = new hlslib::ocl::Context(); + if (emulation) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, 0); + } auto program = context->MakeProgram(program_path); std::vector> buffers; comm=SmiInit_broadcast(my_rank, rank_count, ROUTING_DIR, *context, program, buffers); diff --git a/test/gather/gather.cl b/test/gather/gather.cl index f10d2041..93e4d672 100644 --- a/test/gather/gather.cl +++ b/test/gather/gather.cl @@ -5,6 +5,8 @@ #include +#include "smi_generated_device.cl" + __kernel void test_char(const int N, char root, __global char *mem, SMI_Comm comm) { SMI_GatherChannel __attribute__((register)) chan= SMI_Open_gather_channel(N,N, SMI_CHAR,0, root,comm); diff --git a/test/gather/test_gather.cpp b/test/gather/test_gather.cpp index d4d10fe5..db5e61b6 100644 --- a/test/gather/test_gather.cpp +++ b/test/gather/test_gather.cpp @@ -3,7 +3,7 @@ Test must be executed with 8 ranks Once built, execute it with: - env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./test_gather.exe "./gather_emulator_.aocx" + env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 ./test_gather.exe "./gather_emulator_.aocx" */ @@ -232,16 +232,20 @@ TEST(Gather, FloatMessages) int main(int argc, char *argv[]) { -// std::cerr << "Usage: [env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 " << argv[0] << " " << std::endl; +// std::cerr << "Usage: [env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 " << argv[0] << " " << std::endl; int result = 0; + bool emulation; ::testing::InitGoogleTest(&argc, argv); //delete listeners for all the rank except 0 - if(argc==2) + if(argc==2) { + emulation = false; program_path =argv[1]; - else + } else { + emulation = true; program_path = "emulator_/gather.aocx"; + } ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); CHECK_MPI(MPI_Init(&argc, &argv)); @@ -257,7 +261,11 @@ int main(int argc, char *argv[]) program_path = replace(program_path, "", std::to_string(my_rank)); program_path = replace(program_path, "", std::to_string(my_rank)); - context = new hlslib::ocl::Context(); + if (emulation) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, 0); + } auto program = context->MakeProgram(program_path); std::vector> buffers; comm=SmiInit_gather(my_rank, rank_count, ROUTING_DIR, *context, program, buffers); diff --git a/test/mixed/mixed.cl b/test/mixed/mixed.cl index dec64454..eb79920b 100644 --- a/test/mixed/mixed.cl +++ b/test/mixed/mixed.cl @@ -8,6 +8,7 @@ The last, broadcasts the value */ #include +#include "smi_generated_device.cl" __kernel void test_int(int start, const SMI_Comm comm,__global int *mem) { unsigned int my_rank=SMI_Comm_rank(comm); diff --git a/test/mixed/test_mixed.cpp b/test/mixed/test_mixed.cpp index 915e787c..b5e49890 100644 --- a/test/mixed/test_mixed.cpp +++ b/test/mixed/test_mixed.cpp @@ -3,7 +3,7 @@ Test must be executed with 8 ranks Once built, execute it with: - env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./test_gather.exe "./gather_emulator_.aocx" + env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 ./test_gather.exe "./gather_emulator_.aocx" */ @@ -103,16 +103,20 @@ TEST(Gather, IntegerMessages) int main(int argc, char *argv[]) { -// std::cerr << "Usage: [env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 " << argv[0] << " " << std::endl; +// std::cerr << "Usage: [env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 " << argv[0] << " " << std::endl; int result = 0; + bool emulation; ::testing::InitGoogleTest(&argc, argv); //delete listeners for all the rank except 0 - if(argc==2) + if(argc==2) { + emulation = false; program_path =argv[1]; - else + } else { + emulation = true; program_path = "emulator_/mixed.aocx"; + } ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); CHECK_MPI(MPI_Init(&argc, &argv)); @@ -126,7 +130,11 @@ int main(int argc, char *argv[]) //create environemnt int fpga=my_rank%2; program_path = replace(program_path, "", std::to_string(my_rank)); - context = new hlslib::ocl::Context(); + if (emulation) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, 0); + } auto program = context->MakeProgram(program_path); std::vector> buffers; comm=SmiInit_mixed(my_rank, rank_count, ROUTING_DIR, *context, program, buffers); diff --git a/test/p2p/p2p_rank0.cl b/test/p2p/p2p_rank0.cl index f37198d8..731186a9 100644 --- a/test/p2p/p2p_rank0.cl +++ b/test/p2p/p2p_rank0.cl @@ -4,6 +4,7 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable #include +#include "smi_generated_device.cl" __kernel void test_char(const int N, const char dest_rank, const SMI_Comm comm) { diff --git a/test/p2p/p2p_rank1.cl b/test/p2p/p2p_rank1.cl index 03efc43a..118d2b9a 100644 --- a/test/p2p/p2p_rank1.cl +++ b/test/p2p/p2p_rank1.cl @@ -5,6 +5,7 @@ #pragma OPENCL EXTENSION cl_khr_fp64 : enable #include +#include "smi_generated_device.cl" __kernel void test_short(__global char *mem, const int N, SMI_Comm comm) { diff --git a/test/p2p/test_p2p.cpp b/test/p2p/test_p2p.cpp index a1ce0528..2547a7ca 100644 --- a/test/p2p/test_p2p.cpp +++ b/test/p2p/test_p2p.cpp @@ -580,7 +580,7 @@ int main(int argc, char *argv[]) if(argc<1) { - std::cerr << "Usage: [env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 " << argv[0] << " [ and flags>]" << std::endl; + std::cerr << "Usage: [env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 " << argv[0] << " [ and flags>]" << std::endl; return -1; } @@ -588,10 +588,14 @@ int main(int argc, char *argv[]) ::testing::InitGoogleTest(&argc, argv); //delete listeners for all the rank except 0 - if(argc==2) + + bool emulation = false; + if(argc==2) { program_path =argv[1]; - else + } else { program_path = "emulator_/p2p_rank.aocx"; + emulation = true; + } ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); @@ -611,8 +615,15 @@ int main(int argc, char *argv[]) else program_path = replace(program_path, "", std::string("1")); - context = new hlslib::ocl::Context(); + if (emulation) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, 0); + } auto program = context->MakeProgram(program_path); + if (emulation) { + //program.build(); + } std::vector> buffers; comm=SmiInit_p2p_rank0(my_rank, rank_count, ROUTING_DIR, *context, program, buffers); diff --git a/test/reduce/reduce.cl b/test/reduce/reduce.cl index e9531c1d..1fe86962 100644 --- a/test/reduce/reduce.cl +++ b/test/reduce/reduce.cl @@ -4,6 +4,9 @@ */ #include + +#include "smi_generated_device.cl" + __kernel void test_float_add(const int N, char root, __global volatile char *mem, SMI_Comm comm) { unsigned int my_rank=SMI_Comm_rank(comm); diff --git a/test/reduce/test_reduce.cpp b/test/reduce/test_reduce.cpp index 9811b1c0..d4ed24fb 100644 --- a/test/reduce/test_reduce.cpp +++ b/test/reduce/test_reduce.cpp @@ -3,7 +3,7 @@ Test must be executed with 8 ranks Once built, execute it with: - env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./test_gather.exe "./gather_emulator_.aocx" + env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 ./test_gather.exe "./gather_emulator_.aocx" */ #define TEST_TIMEOUT 30 @@ -357,16 +357,20 @@ int main(int argc, char *argv[]) { - // std::cerr << "Usage: [env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 " << argv[0] << " \"\"" << std::endl; + // std::cerr << "Usage: [env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 " << argv[0] << " \"\"" << std::endl; int result = 0; + bool emulation; ::testing::InitGoogleTest(&argc, argv); //delete listeners for all the rank except 0 - if(argc==2) + if(argc==2) { + emulation = false; program_path =argv[1]; - else + } else { + emulation = true; program_path="emulator_/reduce.aocx"; + } ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); CHECK_MPI(MPI_Init(&argc, &argv)); @@ -380,8 +384,11 @@ int main(int argc, char *argv[]) //create environemnt int fpga=my_rank%2; program_path = replace(program_path, "", std::to_string(my_rank)); - context = new hlslib::ocl::Context(); - + if (emulation) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, 0); + } auto program = context->MakeProgram(program_path); std::vector> buffers; comm=SmiInit_reduce(my_rank, rank_count, ROUTING_DIR, *context, program, buffers); diff --git a/test/scatter/scatter.cl b/test/scatter/scatter.cl index 8cbdb368..0e9080d3 100644 --- a/test/scatter/scatter.cl +++ b/test/scatter/scatter.cl @@ -5,7 +5,7 @@ #include - +#include "smi_generated_device.cl" __kernel void test_int(const int N, char root,__global char* mem, SMI_Comm comm) { diff --git a/test/scatter/test_scatter.cpp b/test/scatter/test_scatter.cpp index 86ea80a7..b555d459 100644 --- a/test/scatter/test_scatter.cpp +++ b/test/scatter/test_scatter.cpp @@ -3,7 +3,7 @@ Test must be executed with 8 ranks Once built, execute it with: - env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./test_gather.exe "./gather_emulator_.aocx" + env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 ./test_gather.exe "./gather_emulator_.aocx" */ #define TEST_TIMEOUT 15 @@ -207,15 +207,19 @@ TEST(Scatter, DoubleMessages) int main(int argc, char *argv[]) { -// std::cerr << "Usage: [env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 " << argv[0] << " " << std::endl; +// std::cerr << "Usage: [env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 " << argv[0] << " " << std::endl; int result = 0; + bool emulation; ::testing::InitGoogleTest(&argc, argv); //delete listeners for all the rank except 0 - if(argc==2) + if(argc==2) { + emulation = false; program_path =argv[1]; - else + } else { + emulation = true; program_path = "emulator_/scatter.aocx"; + } ::testing::TestEventListeners& listeners = ::testing::UnitTest::GetInstance()->listeners(); CHECK_MPI(MPI_Init(&argc, &argv)); @@ -229,7 +233,11 @@ int main(int argc, char *argv[]) //create environemnt int fpga=my_rank%2; program_path = replace(program_path, "", std::to_string(my_rank)); - context = new hlslib::ocl::Context(); + if (emulation) { + context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0); + } else { + context = new hlslib::ocl::Context(VENDOR_STRING, 0); + } auto program = context->MakeProgram(program_path); std::vector> buffers;