Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeGTEST.txt.in
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ project(googletest-download NONE)
include(ExternalProject)
ExternalProject_Add(googletest
GIT_REPOSITORY https://github.com/google/googletest.git
GIT_TAG master
GIT_TAG release-1.11.0
SOURCE_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
CONFIGURE_COMMAND ""
Expand Down
16 changes: 6 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ set(SMI_REWRITER "${CMAKE_BINARY_DIR}/source-rewriter/rewriter")
set(SMI_FMAX "480" CACHE STRING "Target Fmax when compiling for hardware.")
set(SMI_DEVICES_PER_NODE 2 CACHE STRING "Number of FPGA devices per node.")

set(CL_HPP_TARGET_OPENCL_VERSION 200 CACHE STRING "CL Version")
option (ENABLE_TESTS "Enables testing" OFF)

# Dependencies
Expand Down Expand Up @@ -106,21 +107,18 @@ function(smi_target TARGET_NAME CONNECTION_FILE HOST_SOURCE KERNELS NUM_RANKS)
)
add_dependencies(${KERNEL_TARGET} rewriter)

# compile FPGA code
set(FPGA_SRC_FILES "${SMI_GENERATED_PATH};${KERNEL_GENERATED_PATH}")

# generate report
set(FPGA_REPORT_TARGET ${TARGET_NAME}_${KERNEL_NAME}_aoc_report)
add_custom_target(${FPGA_REPORT_TARGET}
COMMAND ${IntelFPGAOpenCL_AOC} ${AOC_COMMAND} ${FPGA_SRC_FILES} -rtl -report
COMMAND ${IntelFPGAOpenCL_AOC} ${KERNEL_GENERATED_PATH} -I${KERNEL_BIN_DIR} ${AOC_COMMAND} -rtl -report
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this change necessary?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The new compiler version is not able to compile two .cl files together, so we switched to compiling only the kernel file, which now includes smi_generated_device.cl itself.

WORKING_DIRECTORY ${KERNEL_BIN_DIR}
)
add_dependencies(${FPGA_REPORT_TARGET} ${KERNEL_TARGET})

# build hardware
set(FPGA_BUILD_TARGET ${TARGET_NAME}_${KERNEL_NAME}_aoc_build)
add_custom_target(${FPGA_BUILD_TARGET}
COMMAND ${IntelFPGAOpenCL_AOC} ${AOC_COMMAND} ${FPGA_SRC_FILES}
COMMAND ${IntelFPGAOpenCL_AOC} ${KERNEL_GENERATED_PATH} -I${KERNEL_BIN_DIR} ${AOC_COMMAND}
WORKING_DIRECTORY ${KERNEL_BIN_DIR}
)
add_dependencies(${FPGA_BUILD_TARGET} ${KERNEL_TARGET})
Expand Down Expand Up @@ -179,15 +177,13 @@ function(smi_target TARGET_NAME CONNECTION_FILE HOST_SOURCE KERNELS NUM_RANKS)

list(GET FPGA_SOURCES ${KERNEL_INDEX} USER_DEVICE_SRC)
list(GET FPGA_GENERATED_SOURCES ${KERNEL_INDEX} SMI_DEVICE_SRC)
set(FPGA_SRC_FILES "${SMI_DEVICE_SRC};${USER_DEVICE_SRC}")

set(EMULATION_WORKDIR ${WORKDIR}/emulator_${SMI_EMULATION_RANK})
file(MAKE_DIRECTORY ${EMULATION_WORKDIR})
set(EMULATOR_TARGET_RANK ${EMULATOR_TARGET}_${SMI_EMULATION_RANK})
add_custom_target(${EMULATOR_TARGET_RANK}
COMMAND ${IntelFPGAOpenCL_AOC} ${AOC_COMMAND} ${FPGA_SRC_FILES} -march=emulator
COMMAND ${IntelFPGAOpenCL_AOC} ${USER_DEVICE_SRC} -I${KERNEL_BIN_DIR} ${AOC_COMMAND} -march=emulator
-DSMI_EMULATION_RANK=${SMI_EMULATION_RANK}
-emulator-channel-depth-model=strict
WORKING_DIRECTORY ${EMULATION_WORKDIR}
)
add_dependencies(${EMULATOR_TARGET} ${EMULATOR_TARGET_RANK})
Expand Down Expand Up @@ -245,7 +241,7 @@ function(fpga_target TARGET_NAME HOST_SOURCE KERNEL GENERATE_KERNEL)
COMMAND ${IntelFPGAOpenCL_AOC} ${AOC_COMMAND} ${KERNEL}
WORKING_DIRECTORY ${KERNEL_BIN_DIR}
)
if(USE_CODEGEN)
if(GENERATE_KERNEL)
add_dependencies(${FPGA_BUILD_TARGET} generate_${KERNEL_NAME})
endif()

Expand All @@ -267,7 +263,7 @@ function(fpga_target TARGET_NAME HOST_SOURCE KERNEL GENERATE_KERNEL)
-emulator-channel-depth-model=strict
WORKING_DIRECTORY ${EMULATION_WORKDIR}
)
if(USE_CODEGEN)
if(GENERATE_KERNEL)
add_dependencies(${EMULATOR_TARGET} generate_${KERNEL_NAME})
endif()
endfunction()
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ make stencil_smi_emulator -j
make stencil_smi_host
cd stencil_smi
# Execute the program
env CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA=8 mpirun -np 8 ./stencil_smi_host emulator <num-timesteps>
env CL_CONFIG_CPU_EMULATE_DEVICES=8 mpirun -np 8 ./stencil_smi_host emulator <num-timesteps>
```

To generate the report, from the `examples` directory in the CMake folder, the user must execute:
Expand Down
2 changes: 1 addition & 1 deletion codegen/templates/host.cl
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ SMI_Comm SmiInit_{{ name }}(
const int num_kernels = kernel_names.size();
for (int i = num_kernels - 1; i >= 0; i--)
{
queues[i].enqueueTask(kernels[i]);
queues[i].enqueueNDRangeKernel(kernels[i], cl::NullRange, cl::NDRange(1));
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there any advantage in using enqueueNDRangeKernel?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

enqueueTask is no longer available in cl2

queues[i].flush();
}

Expand Down
1 change: 1 addition & 0 deletions examples/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ if(PythonInterp_FOUND)

#onchip versions
fpga_target(gesummv_onchip "${CMAKE_CURRENT_SOURCE_DIR}/host/gesummv_onchip.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/gesummv_onchip.cl" OFF)
target_link_libraries(gesummv_onchip_host openblas)
fpga_target(stencil_onchip "${CMAKE_CURRENT_SOURCE_DIR}/host/stencil_onchip.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/kernels/stencil_onchip.cl" ON)

endif()
7 changes: 6 additions & 1 deletion examples/host/gesummv_onchip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,11 @@
#include <utils/utils.hpp>
#define TILE_SIZE 128 //define this as used in the opencl kernel

#if !defined(CL_CHANNEL_1_INTELFPGA)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think that this should not be necessary

// include this header if channel macros are not defined in cl.hpp (versions >=19.0)
#include "CL/cl_ext_intelfpga.h"
#endif

using namespace std;
float *A,*B,*x,*y;
float *fpga_res_y;
Expand Down Expand Up @@ -150,7 +155,7 @@ void testStreamed(std::string program_path,int n, int m, float alpha, float beta
comp_start=current_time_usecs();
asm volatile("": : :"memory");
for(int i=0;i<kernel_names.size();i++)
queues[i].enqueueTask(kernels[i],nullptr,&events[i]);
queues[i].enqueueNDRangeKernel(kernels[i],cl::NullRange,cl::NDRange(1));
for(int i=0;i<kernel_names.size();i++)
queues[i].finish();

Expand Down
26 changes: 18 additions & 8 deletions examples/host/gesummv_smi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
#include "smi_generated_host.c"
#define ROUTING_DIR "smi-routes/"

#if !defined(CL_CHANNEL_1_INTELFPGA)
// include this header if channel macros are not defined in cl.hpp (versions >=19.0)
#include "CL/cl_ext_intelfpga.h"
#endif

using namespace std;
float *A,*B,*x,*y;
float *fpga_res_y;
Expand Down Expand Up @@ -99,7 +104,7 @@ int main(int argc, char *argv[])
case 'r':
runs=atoi(optarg);
break;
case'k':
case 'k':
{
rank=atoi(optarg);
if(rank!=0 && rank!=1)
Expand Down Expand Up @@ -166,10 +171,15 @@ int main(int argc, char *argv[])
generate_float_matrix(B,n,m);


hlslib::ocl::Context context(fpga);
auto program = context.MakeProgram(program_path);
hlslib::ocl::Context *context;
if (emulator) {
context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, fpga);
} else {
context = new hlslib::ocl::Context(VENDOR_STRING, fpga);
}
auto program = context->MakeProgram(program_path);
std::vector<hlslib::ocl::Buffer<char, hlslib::ocl::Access::read>> buffers;
SMI_Comm comm=SmiInit_gesummv_rank0(rank, rank_count, ROUTING_DIR, context, program, buffers);
SMI_Comm comm=SmiInit_gesummv_rank0(rank, rank_count, ROUTING_DIR, *context, program, buffers);


int tile_size=128;
Expand All @@ -182,10 +192,10 @@ int main(int argc, char *argv[])

// Create device buffers
size_t elem_per_module=n*m/2;
hlslib::ocl::Buffer<float, hlslib::ocl::Access::readWrite> input_x = context.MakeBuffer<float, hlslib::ocl::Access::readWrite>(hlslib::ocl::MemoryBank::bank2, m);
hlslib::ocl::Buffer<float, hlslib::ocl::Access::readWrite> output_y = context.MakeBuffer<float, hlslib::ocl::Access::readWrite>(hlslib::ocl::MemoryBank::bank3, n);
hlslib::ocl::Buffer<float, hlslib::ocl::Access::readWrite> input_M_0 = context.MakeBuffer<float, hlslib::ocl::Access::readWrite>(hlslib::ocl::MemoryBank::bank0, elem_per_module);
hlslib::ocl::Buffer<float, hlslib::ocl::Access::readWrite> input_M_1 = context.MakeBuffer<float, hlslib::ocl::Access::readWrite>(hlslib::ocl::MemoryBank::bank1, elem_per_module);
hlslib::ocl::Buffer<float, hlslib::ocl::Access::readWrite> input_x = context->MakeBuffer<float, hlslib::ocl::Access::readWrite>(hlslib::ocl::MemoryBank::bank2, m);
hlslib::ocl::Buffer<float, hlslib::ocl::Access::readWrite> output_y = context->MakeBuffer<float, hlslib::ocl::Access::readWrite>(hlslib::ocl::MemoryBank::bank3, n);
hlslib::ocl::Buffer<float, hlslib::ocl::Access::readWrite> input_M_0 = context->MakeBuffer<float, hlslib::ocl::Access::readWrite>(hlslib::ocl::MemoryBank::bank0, elem_per_module);
hlslib::ocl::Buffer<float, hlslib::ocl::Access::readWrite> input_M_1 = context->MakeBuffer<float, hlslib::ocl::Access::readWrite>(hlslib::ocl::MemoryBank::bank1, elem_per_module);


// Create kernels
Expand Down
28 changes: 17 additions & 11 deletions examples/host/kmeans_smi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
#include "hlslib/intel/OpenCL.h"
#include "kmeans.h"
#include "common.h"
#include <utils/ocl_utils.hpp>
#define __HOST_PROGRAM__

#include <smi/communicator.h>
Expand Down Expand Up @@ -56,7 +57,7 @@ int main(int argc, char **argv) {
std::string mode_str(argv[1]);
std::string kernel_path;
if (mode_str == "emulator") {
setenv("CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA", "1", false);
setenv("CL_CONFIG_CPU_EMULATE_DEVICES", "1", false);
emulator = true;
// In emulation mode, each rank has its own kernel file
kernel_path =
Expand Down Expand Up @@ -167,33 +168,38 @@ int main(int argc, char **argv) {

try {

MPIStatus(mpi_rank, "Creating OpenCL context...\n");
hlslib::ocl::Context context(emulator ? 0 : (mpi_rank % kDevicesPerNode));
MPIStatus(mpi_rank, "Creating OpenCL conext...\n");
hlslib::ocl::Context *context;
if (emulator) {
context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0);
} else {
context = new hlslib::ocl::Context(VENDOR_STRING, (mpi_rank % kDevicesPerNode));
}

MPIStatus(mpi_rank, "Allocating and copying device memory...\n");
auto points_device = context.MakeBuffer<Data_t, hlslib::ocl::Access::read>(
auto points_device = context->MakeBuffer<Data_t, hlslib::ocl::Access::read>(
points.cbegin(), points.cend());
auto centroids_device_read =
context.MakeBuffer<Data_t, hlslib::ocl::Access::read>(centroids.cbegin(),
context->MakeBuffer<Data_t, hlslib::ocl::Access::read>(centroids.cbegin(),
centroids.cend());
auto centroids_device_write =
context.MakeBuffer<Data_t, hlslib::ocl::Access::write>(centroids.cbegin(),
context->MakeBuffer<Data_t, hlslib::ocl::Access::write>(centroids.cbegin(),
centroids.cend());
std::vector<hlslib::ocl::Buffer<char, hlslib::ocl::Access::read>>
routing_tables_cks_device(kChannelsPerRank);
std::vector<hlslib::ocl::Buffer<char, hlslib::ocl::Access::read>>
routing_tables_ckr_device(kChannelsPerRank);
for (int i = 0; i < kChannelsPerRank; ++i) {
routing_tables_cks_device[i] =
context.MakeBuffer<char, hlslib::ocl::Access::read>(
context->MakeBuffer<char, hlslib::ocl::Access::read>(
routing_tables_cks[i].cbegin(), routing_tables_cks[i].cend());
routing_tables_ckr_device[i] =
context.MakeBuffer<char, hlslib::ocl::Access::read>(
context->MakeBuffer<char, hlslib::ocl::Access::read>(
routing_tables_ckr[i].cbegin(), routing_tables_ckr[i].cend());
}

MPIStatus(mpi_rank, "Creating program from binary...\n");
auto program = context.MakeProgram(kernel_path);
auto program = context->MakeProgram(kernel_path);

MPIStatus(mpi_rank, "Starting communication kernels...\n");
std::vector<hlslib::ocl::Kernel> comm_kernels;
Expand Down Expand Up @@ -251,7 +257,7 @@ int main(int argc, char **argv) {

//for (auto &k : kernels) {
for(int i=0;i<3;i++){
//futures.emplace_back(k.ExecuteTaskAsync()); //HLSLIB
//futures.emplace_back(k.ExecuteTaskFork()); //HLSLIB
cl::CommandQueue queue=kernels[i].commandQueue();
queue.enqueueTask(kernels[i].kernel(),nullptr, &events[i]);
//queue.flush();
Expand All @@ -264,7 +270,7 @@ int main(int argc, char **argv) {
}*/
//for (auto &k : kernels) {
for(int i=0;i<3;i++){
//futures.emplace_back(k.ExecuteTaskAsync()); HLSLIB
//futures.emplace_back(k.ExecuteTaskFork()); HLSLIB
//cl::CommandQueue queue=k.commandQueue();
//queue.finish();
events[i].wait();
Expand Down
17 changes: 12 additions & 5 deletions examples/host/stencil_onchip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <vector>
#include "hlslib/intel/OpenCL.h"
#include "stencil.h"
#include <utils/ocl_utils.hpp>

// Convert from C to C++
using Data_t = DTYPE;
Expand Down Expand Up @@ -78,9 +79,10 @@ int main(int argc, char **argv) {
std::string kernel_path;
if (mode_str == "emulator") {
emulator = true;
kernel_path = "stencil_spatial_tiling_emulator.aocx";
kernel_path = "emulator/stencil_onchip.aocx";
} else if (mode_str == "hardware") {
kernel_path = "stencil_spatial_tiling_hardware.aocx";
// TODO: find the right path
kernel_path = "stencil_onchip.aocx";
emulator = false;
} else {
std::cout << kUsage;
Expand All @@ -105,10 +107,15 @@ int main(int argc, char **argv) {

// Create OpenCL kernels
std::cout << "Creating OpenCL context...\n" << std::flush;
hlslib::ocl::Context context;
hlslib::ocl::Context *context;
if (emulator) {
context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0);
} else {
context = new hlslib::ocl::Context(VENDOR_STRING, 0);
}
std::cout << "Allocating device memory...\n" << std::flush;
std::cout << "Creating program from binary...\n" << std::flush;
auto program = context.MakeProgram(kernel_path);
auto program = context->MakeProgram(kernel_path);
std::cout << "Creating kernels...\n" << std::flush;
std::vector<hlslib::ocl::Kernel> kernels;
std::vector<hlslib::ocl::Buffer<Data_t, hlslib::ocl::Access::readWrite>>
Expand All @@ -119,7 +126,7 @@ int main(int argc, char **argv) {
for (int i = 0; i < kPX; ++i) {
for (int j = 0; j < kPY; ++j) {
auto device_buffer =
context.MakeBuffer<Data_t, hlslib::ocl::Access::readWrite>(
context->MakeBuffer<Data_t, hlslib::ocl::Access::readWrite>(
banks[(i * kPY + j) % banks.size()], 2 * kXLocal * kYLocal);
const std::string suffix("_" + std::to_string(i) + "_" +
std::to_string(j));
Expand Down
22 changes: 14 additions & 8 deletions examples/host/stencil_smi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include "common.h"
#define __HOST_PROGRAM__
#include "hlslib/intel/OpenCL.h"
#include <utils/ocl_utils.hpp>
#include "stencil.h"
#include <smi/communicator.h>

Expand All @@ -24,7 +25,7 @@ constexpr int kXLocal = kX / kPX;
constexpr int kYLocal = kY / kPY;
constexpr auto kDevicesPerNode = SMI_DEVICES_PER_NODE;
constexpr auto kUsage =
"Usage: ./stencil_smi_interleaved <[emulator/hardware]> <num timesteps>\n";
"Usage: ./stencil_smi <[emulator/hardware]> <num timesteps>\n";

using AlignedVec_t =
std::vector<Data_t, hlslib::ocl::AlignedAllocator<Data_t, 64>>;
Expand Down Expand Up @@ -143,10 +144,10 @@ int main(int argc, char **argv) {
std::string mode_str(argv[1]);
std::string kernel_path;
if (mode_str == "emulator") {
setenv("CL_CONTEXT_EMULATOR_DEVICE_INTELFPGA", "1", false);
setenv("CL_CONFIG_CPU_EMULATE_DEVICES", "1", false);
emulator = true;
// In emulation mode, each rank has its own kernel file
kernel_path = ("emulator_" + std::to_string(mpi_rank) + "/stencil_smi_interleaved.aocx");
kernel_path = ("emulator_" + std::to_string(mpi_rank) + "/stencil_smi.aocx");
} else if (mode_str == "hardware") {
kernel_path = "stencil_smi/stencil_smi.aocx";
emulator = false;
Expand Down Expand Up @@ -205,9 +206,14 @@ int main(int argc, char **argv) {

MPIStatus(mpi_rank, "Creating OpenCL context...\n");
try {
hlslib::ocl::Context context(emulator ? 0 : (mpi_rank % kDevicesPerNode));
hlslib::ocl::Context *context;
if (emulator) {
context = new hlslib::ocl::Context(VENDOR_STRING_EMULATION, 0);
} else {
context = new hlslib::ocl::Context(VENDOR_STRING, (mpi_rank % kDevicesPerNode));
}
MPIStatus(mpi_rank, "Creating program from binary...\n");
auto program = context.MakeProgram(kernel_path);
auto program = context->MakeProgram(kernel_path);

MPI_Barrier(MPI_COMM_WORLD);

Expand All @@ -219,7 +225,7 @@ int main(int argc, char **argv) {
device_buffers;
for (int b = 0; b < kMemoryBanks; ++b) {
auto device_buffer =
context.MakeBuffer<Data_t, hlslib::ocl::Access::readWrite>(
context->MakeBuffer<Data_t, hlslib::ocl::Access::readWrite>(
banks[b % banks.size()], 2 * kXLocal * kYLocal / kMemoryBanks);
device_buffer.CopyFromHost(0, kXLocal * kYLocal / kMemoryBanks,
interleaved_host[b].cbegin());
Expand All @@ -234,10 +240,10 @@ int main(int argc, char **argv) {
routing_tables_ckr_device(kChannelsPerRank);
for (int i = 0; i < kChannelsPerRank; ++i) {
routing_tables_cks_device[i] =
context.MakeBuffer<char, hlslib::ocl::Access::read>(
context->MakeBuffer<char, hlslib::ocl::Access::read>(
routing_tables_cks[i].cbegin(), routing_tables_cks[i].cend());
routing_tables_ckr_device[i] =
context.MakeBuffer<char, hlslib::ocl::Access::read>(
context->MakeBuffer<char, hlslib::ocl::Access::read>(
routing_tables_ckr[i].cbegin(), routing_tables_ckr[i].cend());
}

Expand Down
1 change: 1 addition & 0 deletions examples/kernels/gesummv_rank0.cl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

*/

#include "smi_generated_device.cl"
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These includes (and the ones in the following file) should not be necessary with the old CMakeLists.txt


#pragma OPENCL EXTENSION cl_intel_channels : enable

Expand Down
1 change: 1 addition & 0 deletions examples/kernels/gesummv_rank1.cl
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

*/

#include "smi_generated_device.cl"

#pragma OPENCL EXTENSION cl_intel_channels : enable

Expand Down
Loading