diff --git a/.circleci/config.yml b/.circleci/config.yml index 5774601c..e49604ab 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -7,18 +7,18 @@ version: 2 jobs: build: docker: - - image: fedora:33 + - image: fedora:34 working_directory: /hpx/ steps: - checkout - run: name: Install dependencies - command: dnf update -y && dnf install -y doxygen libpng-devel hpx-devel cmake + command: dnf update -y && dnf install -y doxygen libpng-devel hpx-devel cmake clang llvm-devel - run: name: Install cuda command: | dnf install -y 'dnf-command(config-manager)' && \ - dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora33/x86_64/cuda-fedora33.repo && \ + dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/fedora34/x86_64/cuda-fedora34.repo && \ dnf clean expire-cache && \ dnf update -y && \ dnf module install -y nvidia-driver:latest-dkms && \ @@ -29,8 +29,9 @@ jobs: - run: name: Configure command: | + echo $(find /usr/local -name "cuda*") \ cd /hpx/build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release -DHPX_WITH_MALLOC=system -DHPXCL_WITH_OPENCL=OFF -DHPXCL_WITH_CUDA=On -DHPXCL_WITH_BENCHMARK=On -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-11.4 + cmake .. -DCMAKE_BUILD_TYPE=Release -DHPX_WITH_MALLOC=system -DHPXCL_WITH_OPENCL=OFF -DHPXCL_WITH_CUDA=On -DHPXCL_WITH_BENCHMARK=On -DCUDA_TOOLKIT_ROOT_DIR=$(find /usr/local -name "cuda*") - run: name: Build command: cd /hpx/build && make -j 2 && make install diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bf0e9d5..508ae4ff 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ # file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) # Require a minimum version of CMake -cmake_minimum_required(VERSION 3.3.2 FATAL_ERROR) +cmake_minimum_required(VERSION 3.17 FATAL_ERROR) if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING diff --git a/benchmark/cuda/dgemm/dgemmHPXCL.cpp b/benchmark/cuda/dgemm/dgemmHPXCL.cpp index 0c754ff6..ae28a5ed 100644 --- a/benchmark/cuda/dgemm/dgemmHPXCL.cpp +++ b/benchmark/cuda/dgemm/dgemmHPXCL.cpp @@ -15,193 +15,188 @@ using namespace hpx::cuda; //########################################################################### -//Main +// Main //########################################################################### -int main(int argc, char*argv[]) { +int main(int argc, char* argv[]) { + if (argc != 4) { + std::cout << "Usage: " << argv[0] << " #m #n #k"; + exit(1); + } - if (argc != 4) { - std::cout << "Usage: " << argv[0] << " #m #n #k"; - exit(1); - } - - int m, n, k, i; - - //Initilizing the matrix dimensions - m = atoi(argv[1]); - n = atoi(argv[2]); - k = atoi(argv[3]); - - double time = 0; - timer_start(); - - //Vector for all futures for the data management - std::vector> data_futures; - - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); - - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } - - double *A, *B, *C; - - double alpha, beta; - - //initializing values of alpha and beta - alpha = 1.0; - beta = 0.0; - - //Malloc Host - cudaMallocHost((void**) &A, m * k * sizeof(double)); - checkCudaError("Malloc A"); - cudaMallocHost((void**) &B, n * k * sizeof(double)); - checkCudaError("Malloc B"); - cudaMallocHost((void**) &C, m * n * sizeof(double)); - checkCudaError("Malloc C"); - - time += timer_stop(); - //printf (" Intializing matrix data \n\n"); - timer_start(); - - for (i = 0; i < (m * k); i++) { - A[i] = (double) (i + 1); - } - - for (i = 0; i < (k * n); i++) { - B[i] = (double) (-i - 1); - } - - for (i = 0; i < (m * n); i++) { - C[i] = 0.0; - } - - //creating vector of futures - std::vector> kernelFutures; - - std::vector args; - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; - - block.x = 32; - block.y = 32; - block.z = 1; - - grid.x = 1 + std::ceil(m / block.x); - grid.y = 1 + std::ceil(n / block.y); - grid.z = 1; - - std::vector> progBuildVector; - std::vector progVector; - std::vector deviceVector; - - //Creating the first device found - device cudaDevice = devices[0]; - - //Create a Mandelbrot device program - hpx::lcos::future fProg = cudaDevice.create_program_with_file( - "dgemm.cu"); - - //Compile with the kernal - std::vector < std::string > flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(cudaDevice.get_device_architecture_major().get())); - mode.append( - std::to_string(cudaDevice.get_device_architecture_minor().get())); - - flags.push_back(mode); - - program prog = fProg.get(); - progBuildVector.push_back(prog.build(flags, "dgemm")); - progVector.push_back(prog); - deviceVector.push_back(cudaDevice); - - //wait for program to build on all devices - hpx::wait_all(progBuildVector); - - //creating buffers - hpx::lcos::future fABuffer = cudaDevice.create_buffer( - m * k * sizeof(double)); - hpx::lcos::future fBBuffer = cudaDevice.create_buffer( - n * k * sizeof(double)); - hpx::lcos::future fCBuffer = cudaDevice.create_buffer( - m * n * sizeof(double)); - hpx::lcos::future falphaBuffer = - cudaDevice.create_buffer(sizeof(double)); - hpx::lcos::future fbetaBuffer = cudaDevice.create_buffer( - sizeof(double)); - hpx::lcos::future fmBuffer = cudaDevice.create_buffer( - sizeof(int)); - hpx::lcos::future fnBuffer = cudaDevice.create_buffer( - sizeof(int)); - hpx::lcos::future fkBuffer = cudaDevice.create_buffer( - sizeof(int)); - - buffer ABuffer = fABuffer.get(); - data_futures.push_back(ABuffer.enqueue_write(0, m * k * sizeof(double), A)); - buffer BBuffer = fBBuffer.get(); - data_futures.push_back(BBuffer.enqueue_write(0, n * k * sizeof(double), B)); - buffer CBuffer = fCBuffer.get(); - data_futures.push_back(CBuffer.enqueue_write(0, m * n * sizeof(double), C)); - buffer mBuffer = fmBuffer.get(); - data_futures.push_back(mBuffer.enqueue_write(0, sizeof(int), &m)); - buffer nBuffer = fnBuffer.get(); - data_futures.push_back(nBuffer.enqueue_write(0, sizeof(int), &n)); - buffer kBuffer = fkBuffer.get(); - data_futures.push_back(kBuffer.enqueue_write(0, sizeof(int), &k)); - buffer alphaBuffer = falphaBuffer.get(); - data_futures.push_back( - alphaBuffer.enqueue_write(0, sizeof(double), &alpha)); - buffer betaBuffer = fbetaBuffer.get(); - data_futures.push_back(betaBuffer.enqueue_write(0, sizeof(double), &beta)); - - //Synchronize copy to buffer - hpx::wait_all(data_futures); - - args.push_back(ABuffer); - args.push_back(BBuffer); - args.push_back(CBuffer); - args.push_back(mBuffer); - args.push_back(nBuffer); - args.push_back(kBuffer); - args.push_back(alphaBuffer); - args.push_back(betaBuffer); - - //Synchronize data transfer before new data is written - hpx::wait_all(args); - - //run the program on the device + int m, n, k, i; + + // Initilizing the matrix dimensions + m = atoi(argv[1]); + n = atoi(argv[2]); + k = atoi(argv[3]); + + double time = 0; + timer_start(); + + // Vector for all futures for the data management + std::vector> data_futures; + + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); + + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } + + double *A, *B, *C; + + double alpha, beta; + + // initializing values of alpha and beta + alpha = 1.0; + beta = 0.0; + + // Malloc Host + cudaMallocHost((void**)&A, m * k * sizeof(double)); + checkCudaError("Malloc A"); + cudaMallocHost((void**)&B, n * k * sizeof(double)); + checkCudaError("Malloc B"); + cudaMallocHost((void**)&C, m * n * sizeof(double)); + checkCudaError("Malloc C"); + + time += timer_stop(); + // printf (" Intializing matrix data \n\n"); + timer_start(); + + for (i = 0; i < (m * k); i++) { + A[i] = (double)(i + 1); + } + + for (i = 0; i < (k * n); i++) { + B[i] = (double)(-i - 1); + } + + for (i = 0; i < (m * n); i++) { + C[i] = 0.0; + } + + // creating vector of futures + std::vector> kernelFutures; + + std::vector args; + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + + block.x = 32; + block.y = 32; + block.z = 1; + + grid.x = 1 + std::ceil(m / block.x); + grid.y = 1 + std::ceil(n / block.y); + grid.z = 1; + + std::vector> progBuildVector; + std::vector progVector; + std::vector deviceVector; + + // Creating the first device found + device cudaDevice = devices[0]; + + // Create a Mandelbrot device program + hpx::lcos::future fProg = + cudaDevice.create_program_with_file("dgemm.cu"); + + // Compile with the kernal + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); + + flags.push_back(mode); + + program prog = fProg.get(); + progBuildVector.push_back(prog.build(flags, "dgemm")); + progVector.push_back(prog); + deviceVector.push_back(cudaDevice); + + // wait for program to build on all devices + hpx::wait_all(progBuildVector); + + // creating buffers + hpx::lcos::future fABuffer = + cudaDevice.create_buffer(m * k * sizeof(double)); + hpx::lcos::future fBBuffer = + cudaDevice.create_buffer(n * k * sizeof(double)); + hpx::lcos::future fCBuffer = + cudaDevice.create_buffer(m * n * sizeof(double)); + hpx::lcos::future falphaBuffer = + cudaDevice.create_buffer(sizeof(double)); + hpx::lcos::future fbetaBuffer = + cudaDevice.create_buffer(sizeof(double)); + hpx::lcos::future fmBuffer = + cudaDevice.create_buffer(sizeof(int)); + hpx::lcos::future fnBuffer = + cudaDevice.create_buffer(sizeof(int)); + hpx::lcos::future fkBuffer = + cudaDevice.create_buffer(sizeof(int)); + + buffer ABuffer = fABuffer.get(); + data_futures.push_back(ABuffer.enqueue_write(0, m * k * sizeof(double), A)); + buffer BBuffer = fBBuffer.get(); + data_futures.push_back(BBuffer.enqueue_write(0, n * k * sizeof(double), B)); + buffer CBuffer = fCBuffer.get(); + data_futures.push_back(CBuffer.enqueue_write(0, m * n * sizeof(double), C)); + buffer mBuffer = fmBuffer.get(); + data_futures.push_back(mBuffer.enqueue_write(0, sizeof(int), &m)); + buffer nBuffer = fnBuffer.get(); + data_futures.push_back(nBuffer.enqueue_write(0, sizeof(int), &n)); + buffer kBuffer = fkBuffer.get(); + data_futures.push_back(kBuffer.enqueue_write(0, sizeof(int), &k)); + buffer alphaBuffer = falphaBuffer.get(); + data_futures.push_back(alphaBuffer.enqueue_write(0, sizeof(double), &alpha)); + buffer betaBuffer = fbetaBuffer.get(); + data_futures.push_back(betaBuffer.enqueue_write(0, sizeof(double), &beta)); + + // Synchronize copy to buffer + hpx::wait_all(data_futures); + + args.push_back(ABuffer); + args.push_back(BBuffer); + args.push_back(CBuffer); + args.push_back(mBuffer); + args.push_back(nBuffer); + args.push_back(kBuffer); + args.push_back(alphaBuffer); + args.push_back(betaBuffer); + + // Synchronize data transfer before new data is written + hpx::wait_all(args); + + // run the program on the device #ifdef HPXCL_CUDA_WITH_STREAMS - kernelFutures.push_back(prog.run(args, "dgemm", grid, block, 0)); + kernelFutures.push_back(prog.run(args, "dgemm", grid, block, 0)); #else - kernelFutures.push_back(prog.run(args, "dgemm", grid, block,0)); + kernelFutures.push_back(prog.run(args, "dgemm", grid, block, 0)); #endif - //wait for all the kernal futures to return - hpx::wait_all(kernelFutures); - + // wait for all the kernal futures to return + hpx::wait_all(kernelFutures); - double* res = CBuffer.enqueue_read_sync(0, n * m * sizeof(double)); + double* res = CBuffer.enqueue_read_sync(0, n * m * sizeof(double)); - //Printing the end timing result - time += timer_stop(); - std::cout << time << " "; + // Printing the end timing result + time += timer_stop(); + std::cout << time << " "; - // Validating the result - std::cout << validateDgemm(A, B, res, alpha, beta, n, m, k) << std::endl; + // Validating the result + std::cout << validateDgemm(A, B, res, alpha, beta, n, m, k) << std::endl; - //Free Memory - args.clear(); - cudaFreeHost(A); - checkCudaError("Free A"); - cudaFreeHost(B); - checkCudaError("Free B"); - cudaFreeHost(C); - checkCudaError("Free C"); + // Free Memory + args.clear(); + cudaFreeHost(A); + checkCudaError("Free A"); + cudaFreeHost(B); + checkCudaError("Free B"); + cudaFreeHost(C); + checkCudaError("Free C"); - return 0; + return 0; } diff --git a/benchmark/cuda/smvp/smvpHPXCL.cpp b/benchmark/cuda/smvp/smvpHPXCL.cpp index cedac487..232bef33 100644 --- a/benchmark/cuda/smvp/smvpHPXCL.cpp +++ b/benchmark/cuda/smvp/smvpHPXCL.cpp @@ -15,232 +15,229 @@ using namespace hpx::cuda; //########################################################################### -//Main +// Main //########################################################################### -int main(int argc, char*argv[]) { - - if (argc != 3) { - std::cout << "Usage: " << argv[0] << " #m #n"; - exit(1); - } - - int m, n, i; - - //Initializing the matrix dimensions - m = atoi(argv[1]); - n = atoi(argv[2]); - - double time = 0; - timer_start(); - - //Vector for all futures for the data management - std::vector> data_futures; - - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); - - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } - - double *A, *B, *C; - double *A_data; - int *A_indices, *A_pointers; - - double alpha; - - //Initializing values of alpha and beta - alpha = 1.0; - - //Malloc Host - cudaMallocHost((void**) &A, m * n * sizeof(double)); - checkCudaError("svmp malloc A"); - int count = 0; - //Input can be anything sparse - for (i = 0; i < (m * n); i++) { - if ((i % n) == 0) { - A[i] = (double) (i + 1); - count++; - } - } - - cudaMallocHost((void**) &B, n * 1 * sizeof(double)); - checkCudaError("svmp malloc B"); - cudaMallocHost((void**) &C, m * 1 * sizeof(double)); - checkCudaError("svmp malloc C"); - cudaMallocHost((void**) &A_data, count * sizeof(double)); - checkCudaError("svmp malloc A_data"); - cudaMallocHost((void**) &A_indices, count * sizeof(int)); - checkCudaError("svmp malloc A_pointers"); - cudaMallocHost((void**) &A_pointers, m * sizeof(int)); - - for (i = 0; i < (1 * n); i++) { - B[i] = (double) (-i - 1); - } - - for (i = 0; i < (m * 1); i++) { - C[i] = 0.0; - } - - //Counters for compression - int data_counter = 0; - int index_counter = 0; - int pointer_counter = -1; - - //Compressing Matrix A - for (i = 0; i < (m * n); i++) { - if (A[i] != 0) { - A_data[data_counter++] = A[i]; - if (((int) i / n) != pointer_counter) - A_pointers[++pointer_counter] = index_counter; - A_indices[index_counter++] = (i % n); - } - } - - //creating vector of futures - std::vector> kernelFutures; - - std::vector args; - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; - - block.x = 32; - block.y = 1; - block.z = 1; - - grid.x = 1 + std::ceil(m / block.x); - grid.y = 1; - grid.z = 1; - - std::vector> progBuildVector; - std::vector progVector; - std::vector deviceVector; - - //Creating the first device found - device cudaDevice = devices[0]; - - //Create a Mandelbrot device program - hpx::lcos::future < program > fProg = cudaDevice.create_program_with_file( - "smvp.cu"); - - //Compile with the kernal - std::vector < std::string > flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(cudaDevice.get_device_architecture_major().get())); - mode.append( - std::to_string(cudaDevice.get_device_architecture_minor().get())); - - flags.push_back(mode); - - program prog = fProg.get(); - progBuildVector.push_back(prog.build(flags, "smvp")); - progVector.push_back(prog); - deviceVector.push_back(cudaDevice); - - //creating buffers - hpx::lcos::future fADataBuffer = cudaDevice.create_buffer( - count * sizeof(double)); - hpx::lcos::future fAIndexBuffer = cudaDevice.create_buffer( - count * sizeof(int)); - hpx::lcos::future fAPointerBuffer = cudaDevice.create_buffer( - m * sizeof(int)); - - hpx::lcos::future fBBuffer = cudaDevice.create_buffer( - n * 1 * sizeof(double)); - hpx::lcos::future fCBuffer = cudaDevice.create_buffer( - m * 1 * sizeof(double)); - hpx::lcos::future falphaBuffer = cudaDevice.create_buffer( - sizeof(double)); - hpx::lcos::future fmBuffer = cudaDevice.create_buffer(sizeof(int)); - hpx::lcos::future fnBuffer = cudaDevice.create_buffer(sizeof(int)); - hpx::lcos::future fcountBuffer = cudaDevice.create_buffer( - sizeof(int)); - - buffer ADataBuffer = fADataBuffer.get(); - buffer AIndexBuffer = fAIndexBuffer.get(); - buffer APointerBuffer = fAPointerBuffer.get(); - - buffer BBuffer = fBBuffer.get(); - buffer CBuffer = fCBuffer.get(); - buffer alphaBuffer = falphaBuffer.get(); - buffer mBuffer = fmBuffer.get(); - buffer nBuffer = fnBuffer.get(); - buffer countBuffer = fcountBuffer.get(); - - data_futures.push_back( - ADataBuffer.enqueue_write(0, count * sizeof(double), A_data)); - data_futures.push_back( - AIndexBuffer.enqueue_write(0, count * sizeof(int), A_indices)); - data_futures.push_back( - APointerBuffer.enqueue_write(0, m * sizeof(int), A_pointers)); - - data_futures.push_back(BBuffer.enqueue_write(0, n * sizeof(double), B)); - data_futures.push_back(CBuffer.enqueue_write(0, m * sizeof(double), C)); - data_futures.push_back(mBuffer.enqueue_write(0, sizeof(int), &m)); - data_futures.push_back(nBuffer.enqueue_write(0, sizeof(int), &n)); - data_futures.push_back(countBuffer.enqueue_write(0, sizeof(int), &count)); - data_futures.push_back( - alphaBuffer.enqueue_write(0, sizeof(double), &alpha)); - - args.push_back(ADataBuffer); - args.push_back(AIndexBuffer); - args.push_back(APointerBuffer); - - args.push_back(BBuffer); - args.push_back(CBuffer); - args.push_back(mBuffer); - args.push_back(nBuffer); - args.push_back(countBuffer); - args.push_back(alphaBuffer); - - //wait for program to build on all devices - hpx::wait_all(progBuildVector); - - //Synchronize data transfer before new data is written - hpx::wait_all(args); - - //Synchronize copy to buffer - hpx::wait_all(data_futures); - - //run the program on the device +int main(int argc, char *argv[]) { + if (argc != 3) { + std::cout << "Usage: " << argv[0] << " #m #n"; + exit(1); + } + + int m, n, i; + + // Initializing the matrix dimensions + m = atoi(argv[1]); + n = atoi(argv[2]); + + double time = 0; + timer_start(); + + // Vector for all futures for the data management + std::vector> data_futures; + + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); + + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } + + double *A, *B, *C; + double *A_data; + int *A_indices, *A_pointers; + + double alpha; + + // Initializing values of alpha and beta + alpha = 1.0; + + // Malloc Host + cudaMallocHost((void **)&A, m * n * sizeof(double)); + checkCudaError("svmp malloc A"); + int count = 0; + // Input can be anything sparse + for (i = 0; i < (m * n); i++) { + if ((i % n) == 0) { + A[i] = (double)(i + 1); + count++; + } + } + + cudaMallocHost((void **)&B, n * 1 * sizeof(double)); + checkCudaError("svmp malloc B"); + cudaMallocHost((void **)&C, m * 1 * sizeof(double)); + checkCudaError("svmp malloc C"); + cudaMallocHost((void **)&A_data, count * sizeof(double)); + checkCudaError("svmp malloc A_data"); + cudaMallocHost((void **)&A_indices, count * sizeof(int)); + checkCudaError("svmp malloc A_pointers"); + cudaMallocHost((void **)&A_pointers, m * sizeof(int)); + + for (i = 0; i < (1 * n); i++) { + B[i] = (double)(-i - 1); + } + + for (i = 0; i < (m * 1); i++) { + C[i] = 0.0; + } + + // Counters for compression + int data_counter = 0; + int index_counter = 0; + int pointer_counter = -1; + + // Compressing Matrix A + for (i = 0; i < (m * n); i++) { + if (A[i] != 0) { + A_data[data_counter++] = A[i]; + if (((int)i / n) != pointer_counter) + A_pointers[++pointer_counter] = index_counter; + A_indices[index_counter++] = (i % n); + } + } + + // creating vector of futures + std::vector> kernelFutures; + + std::vector args; + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + + block.x = 32; + block.y = 1; + block.z = 1; + + grid.x = 1 + std::ceil(m / block.x); + grid.y = 1; + grid.z = 1; + + std::vector> progBuildVector; + std::vector progVector; + std::vector deviceVector; + + // Creating the first device found + device cudaDevice = devices[0]; + + // Create a Mandelbrot device program + hpx::lcos::future fProg = + cudaDevice.create_program_with_file("smvp.cu"); + + // Compile with the kernal + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); + + flags.push_back(mode); + + program prog = fProg.get(); + progBuildVector.push_back(prog.build(flags, "smvp")); + progVector.push_back(prog); + deviceVector.push_back(cudaDevice); + + // creating buffers + hpx::lcos::future fADataBuffer = + cudaDevice.create_buffer(count * sizeof(double)); + hpx::lcos::future fAIndexBuffer = + cudaDevice.create_buffer(count * sizeof(int)); + hpx::lcos::future fAPointerBuffer = + cudaDevice.create_buffer(m * sizeof(int)); + + hpx::lcos::future fBBuffer = + cudaDevice.create_buffer(n * 1 * sizeof(double)); + hpx::lcos::future fCBuffer = + cudaDevice.create_buffer(m * 1 * sizeof(double)); + hpx::lcos::future falphaBuffer = + cudaDevice.create_buffer(sizeof(double)); + hpx::lcos::future fmBuffer = cudaDevice.create_buffer(sizeof(int)); + hpx::lcos::future fnBuffer = cudaDevice.create_buffer(sizeof(int)); + hpx::lcos::future fcountBuffer = + cudaDevice.create_buffer(sizeof(int)); + + buffer ADataBuffer = fADataBuffer.get(); + buffer AIndexBuffer = fAIndexBuffer.get(); + buffer APointerBuffer = fAPointerBuffer.get(); + + buffer BBuffer = fBBuffer.get(); + buffer CBuffer = fCBuffer.get(); + buffer alphaBuffer = falphaBuffer.get(); + buffer mBuffer = fmBuffer.get(); + buffer nBuffer = fnBuffer.get(); + buffer countBuffer = fcountBuffer.get(); + + data_futures.push_back( + ADataBuffer.enqueue_write(0, count * sizeof(double), A_data)); + data_futures.push_back( + AIndexBuffer.enqueue_write(0, count * sizeof(int), A_indices)); + data_futures.push_back( + APointerBuffer.enqueue_write(0, m * sizeof(int), A_pointers)); + + data_futures.push_back(BBuffer.enqueue_write(0, n * sizeof(double), B)); + data_futures.push_back(CBuffer.enqueue_write(0, m * sizeof(double), C)); + data_futures.push_back(mBuffer.enqueue_write(0, sizeof(int), &m)); + data_futures.push_back(nBuffer.enqueue_write(0, sizeof(int), &n)); + data_futures.push_back(countBuffer.enqueue_write(0, sizeof(int), &count)); + data_futures.push_back(alphaBuffer.enqueue_write(0, sizeof(double), &alpha)); + + args.push_back(ADataBuffer); + args.push_back(AIndexBuffer); + args.push_back(APointerBuffer); + + args.push_back(BBuffer); + args.push_back(CBuffer); + args.push_back(mBuffer); + args.push_back(nBuffer); + args.push_back(countBuffer); + args.push_back(alphaBuffer); + + // wait for program to build on all devices + hpx::wait_all(progBuildVector); + + // Synchronize data transfer before new data is written + hpx::wait_all(args); + + // Synchronize copy to buffer + hpx::wait_all(data_futures); + + // run the program on the device #ifdef HPXCL_CUDA_WITH_STREAMS - kernelFutures.push_back(prog.run(args, "smvp", grid, block, 0)); + kernelFutures.push_back(prog.run(args, "smvp", grid, block, 0)); #else - kernelFutures.push_back(prog.run(args, "smvp", grid, block,0)); + kernelFutures.push_back(prog.run(args, "smvp", grid, block, 0)); #endif - //wait for all the kernal futures to return - hpx::wait_all(kernelFutures); - - //Free Memory - args.clear(); - - double* res = CBuffer.enqueue_read_sync(0, m * sizeof(double)); - - //Printing the end timing result - time += timer_stop(); - std::cout << time << " "; - - // Validating the result - std::cout << validateSmvp(A_data, A_indices, A_pointers, B, res, &m, &n, &count, &alpha) << std::endl; - - - cudaFreeHost(A); - checkCudaError("svmp free A"); - cudaFreeHost(B); - checkCudaError("svmp free B"); - cudaFreeHost(C); - checkCudaError("svmp free C"); - cudaFreeHost(A_data); - checkCudaError("svmp free A_data"); - cudaFreeHost(A_indices); - checkCudaError("svmp free A_indices"); - cudaFreeHost(A_pointers); - checkCudaError("svmp free A_pointers"); - - return 0; + // wait for all the kernal futures to return + hpx::wait_all(kernelFutures); + + // Free Memory + args.clear(); + + double *res = CBuffer.enqueue_read_sync(0, m * sizeof(double)); + + // Printing the end timing result + time += timer_stop(); + std::cout << time << " "; + + // Validating the result + std::cout << validateSmvp(A_data, A_indices, A_pointers, B, res, &m, &n, + &count, &alpha) + << std::endl; + + cudaFreeHost(A); + checkCudaError("svmp free A"); + cudaFreeHost(B); + checkCudaError("svmp free B"); + cudaFreeHost(C); + checkCudaError("svmp free C"); + cudaFreeHost(A_data); + checkCudaError("svmp free A_data"); + cudaFreeHost(A_indices); + checkCudaError("svmp free A_indices"); + cudaFreeHost(A_pointers); + checkCudaError("svmp free A_pointers"); + + return 0; } diff --git a/benchmark/cuda/stencil/PartitionHPX.cpp b/benchmark/cuda/stencil/PartitionHPX.cpp index 03faa36c..ae5cbe0c 100644 --- a/benchmark/cuda/stencil/PartitionHPX.cpp +++ b/benchmark/cuda/stencil/PartitionHPX.cpp @@ -17,153 +17,152 @@ using namespace hpx::cuda; //########################################################################### -//Kernels +// Kernels //########################################################################### static const char kernel_src[] = -"extern \"C\" __global__ void kernel(float* in) { \n" - " \n" - " size_t i = threadIdx.x + blockIdx.x * blockDim.x; \n" - " float x = (float) i; \n" - " float s = sinf(x); \n" - " float c = cosf(x); \n" - " in[i] = in[i] + sqrtf(s * s + c * c); \n" - " \n" - "} \n"; + "extern \"C\" __global__ void kernel(float* in) { " + " \n" + " " + " \n" + " size_t i = threadIdx.x + blockIdx.x * blockDim.x; " + " \n" + " float x = (float) i; " + " \n" + " float s = sinf(x); " + " \n" + " float c = cosf(x); " + " \n" + " in[i] = in[i] + sqrtf(s * s + c * c); " + " \n" + " " + " \n" + "} " + " \n"; //########################################################################### -//Main +// Main //########################################################################### -int main(int argc, char*argv[]) { - - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); - - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } - - const int blockSize = 256, nStreams = 4; - - if (argc != 2) { - std::cout << "Usage: " << argv[0] << " n -> 2^n*1024*" << blockSize - << "*" << nStreams << " elements" << std::endl; - exit(1); - } - - double time = 0; - size_t count = atoi(argv[1]); +int main(int argc, char* argv[]) { + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); - const int n = pow(2,count) * 1024 * blockSize * nStreams; - const int streamSize = n / nStreams; - const int streamBytes = streamSize * sizeof(TYPE); - const int bytes = n * sizeof(TYPE); - - std::cout << n << " "; + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } - timer_start(); + const int blockSize = 256, nStreams = 4; - //Malloc Host - TYPE* in; - cudaMallocHost((void**) &in, bytes); - checkCudaError("Malloc in"); - memset(in, 0, bytes); + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " n -> 2^n*1024*" << blockSize << "*" + << nStreams << " elements" << std::endl; + exit(1); + } - // Create a device component from the first device found - device cudaDevice = devices[0]; + double time = 0; + size_t count = atoi(argv[1]); - std::vector> dependencies; + const int n = pow(2, count) * 1024 * blockSize * nStreams; + const int streamSize = n / nStreams; + const int streamBytes = streamSize * sizeof(TYPE); + const int bytes = n * sizeof(TYPE); - // Create the hello_world device program - program prog = cudaDevice.create_program_with_source(kernel_src).get(); - - // Add compiler flags for compiling the kernel + std::cout << n << " "; - std::vector flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(cudaDevice.get_device_architecture_major().get())); - mode.append( - std::to_string(cudaDevice.get_device_architecture_minor().get())); + timer_start(); - flags.push_back(mode); + // Malloc Host + TYPE* in; + cudaMallocHost((void**)&in, bytes); + checkCudaError("Malloc in"); + memset(in, 0, bytes); - dependencies.push_back(prog.build(flags, "kernel")); + // Create a device component from the first device found + device cudaDevice = devices[0]; - std::vector> fbuffer; - for (size_t i = 0; i < nStreams; i++) - { - fbuffer.push_back(cudaDevice.create_buffer(streamBytes)); - } - - hpx::wait_all(fbuffer); + std::vector> dependencies; - std::vector bufferIn; - for (size_t i = 0; i < nStreams; i++) - { - bufferIn.push_back(fbuffer[i].get()); - } + // Create the hello_world device program + program prog = cudaDevice.create_program_with_source(kernel_src).get(); - for (size_t i = 0; i < nStreams; i++) - { + // Add compiler flags for compiling the kernel - dependencies.push_back(bufferIn[i].enqueue_write(i*streamSize,streamBytes,in)); - } - - std::vector args; - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); - //Set the values for the grid dimension - grid.x = streamSize / blockSize; - grid.y = 1; - grid.z = 1; + flags.push_back(mode); - //Set the values for the block dimension - block.x = blockSize; - block.y = 1; - block.z = 1; + dependencies.push_back(prog.build(flags, "kernel")); - hpx::wait_all(dependencies); + std::vector> fbuffer; + for (size_t i = 0; i < nStreams; i++) { + fbuffer.push_back(cudaDevice.create_buffer(streamBytes)); + } - std::vector> kernelFutures; - for (size_t i = 0; i < nStreams; i++) - { - args.push_back(bufferIn[i]); - #ifdef HPXCL_CUDA_WITH_STREAMS - kernelFutures.push_back(prog.run(args, "kernel", grid, block,0)); - #else - kernelFutures.push_back(prog.run(args, "kernel", grid, block,0)); - #endif - args.clear(); - } - - hpx::wait_all(kernelFutures); - - time += timer_stop(); + hpx::wait_all(fbuffer); - bool check; - for (size_t i = 0; i < nStreams; i++) - { - TYPE* res = bufferIn[i].enqueue_read_sync(0,streamBytes); - check = checkKernel(res,streamSize); - if(check==false) break; - - } - - timer_start(); - - //Clean - cudaFreeHost(in); - checkCudaError("Free in"); - - std:: cout << check << " " << time + timer_stop() << std::endl; - - return EXIT_SUCCESS; -} + std::vector bufferIn; + for (size_t i = 0; i < nStreams; i++) { + bufferIn.push_back(fbuffer[i].get()); + } + + for (size_t i = 0; i < nStreams; i++) { + dependencies.push_back( + bufferIn[i].enqueue_write(i * streamSize, streamBytes, in)); + } + + std::vector args; + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + + // Set the values for the grid dimension + grid.x = streamSize / blockSize; + grid.y = 1; + grid.z = 1; + + // Set the values for the block dimension + block.x = blockSize; + block.y = 1; + block.z = 1; + + hpx::wait_all(dependencies); + + std::vector> kernelFutures; + for (size_t i = 0; i < nStreams; i++) { + args.push_back(bufferIn[i]); +#ifdef HPXCL_CUDA_WITH_STREAMS + kernelFutures.push_back(prog.run(args, "kernel", grid, block, 0)); +#else + kernelFutures.push_back(prog.run(args, "kernel", grid, block, 0)); +#endif + args.clear(); + } + hpx::wait_all(kernelFutures); + + time += timer_stop(); + + bool check; + for (size_t i = 0; i < nStreams; i++) { + TYPE* res = bufferIn[i].enqueue_read_sync(0, streamBytes); + check = checkKernel(res, streamSize); + if (check == false) break; + } + + timer_start(); + + // Clean + cudaFreeHost(in); + checkCudaError("Free in"); + + std::cout << check << " " << time + timer_stop() << std::endl; + + return EXIT_SUCCESS; +} diff --git a/benchmark/cuda/stencil/PartitionHPXMultiDevice.cpp b/benchmark/cuda/stencil/PartitionHPXMultiDevice.cpp index e3ce547e..28268f47 100644 --- a/benchmark/cuda/stencil/PartitionHPXMultiDevice.cpp +++ b/benchmark/cuda/stencil/PartitionHPXMultiDevice.cpp @@ -17,172 +17,165 @@ using namespace hpx::cuda; //########################################################################### -//Kernels +// Kernels //########################################################################### static const char kernel_src[] = -"extern \"C\" __global__ void kernel(float* in) { \n" - " \n" - " size_t i = threadIdx.x + blockIdx.x * blockDim.x; \n" - " float x = (float) i; \n" - " float s = sinf(x); \n" - " float c = cosf(x); \n" - " in[i] = in[i] + sqrtf(s * s + c * c); \n" - " \n" - "} \n"; + "extern \"C\" __global__ void kernel(float* in) { " + " \n" + " " + " \n" + " size_t i = threadIdx.x + blockIdx.x * blockDim.x; " + " \n" + " float x = (float) i; " + " \n" + " float s = sinf(x); " + " \n" + " float c = cosf(x); " + " \n" + " in[i] = in[i] + sqrtf(s * s + c * c); " + " \n" + " " + " \n" + "} " + " \n"; //########################################################################### -//Main +// Main //########################################################################### -int main(int argc, char*argv[]) { +int main(int argc, char* argv[]) { + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } + const int blockSize = 256; + + if (argc != 3) { + std::cout << "Usage: " << argv[0] << " n -> 2^n*1024*" << blockSize + << " elements and g -> amount of GPUs" << std::endl; + exit(1); + } - const int blockSize = 256; + double time = 0; + size_t count = atoi(argv[1]); + int deviceCount = atoi(argv[2]); + if (deviceCount > devices.size()) { + std::cout << "Error: You are using " << deviceCount << " and only " + << devices.size() << "are available" << std::endl; + exit(1); + } - if (argc != 3) { - std::cout << "Usage: " << argv[0] << " n -> 2^n*1024*" << blockSize - << " elements and g -> amount of GPUs" << std::endl; - exit(1); - } + const int n = pow(2, count) * 1024 * blockSize; + const int streamSize = n / deviceCount; + const int streamBytes = streamSize * sizeof(TYPE); + const int bytes = n * sizeof(TYPE); - double time = 0; - size_t count = atoi(argv[1]); - int deviceCount = atoi(argv[2]); - - if( deviceCount > devices.size()) - { - std::cout << "Error: You are using " << deviceCount << " and only " - << devices.size() << "are available" << std::endl; - exit(1); - } + std::cout << n << " "; + timer_start(); - const int n = pow(2,count) * 1024 * blockSize; - const int streamSize = n / deviceCount; - const int streamBytes = streamSize * sizeof(TYPE); - const int bytes = n * sizeof(TYPE); + // Malloc Host + TYPE* in; + cudaMallocHost((void**)&in, bytes); + checkCudaError("Malloc in"); + for (size_t i = 0; i < n; i++) in[i] = 0.; - std::cout << n << " "; + std::vector> dependencies; - timer_start(); + // Create the hello_world device program + std::vector> fprogram; - //Malloc Host - TYPE* in; - cudaMallocHost((void**) &in, bytes); - checkCudaError("Malloc in"); - for (size_t i = 0; i < n ; i++) - in[i] = 0.; + for (size_t i = 0; i < deviceCount; i++) + fprogram.push_back(devices[i].create_program_with_source(kernel_src)); - std::vector> dependencies; + // Add compiler flags for compiling the kernel - // Create the hello_world device program - std::vector> fprogram; + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(devices[0].get_device_architecture_major().get())); + mode.append(std::to_string(devices[0].get_device_architecture_minor().get())); - for (size_t i = 0; i < deviceCount; i++) - fprogram.push_back(devices[i].create_program_with_source(kernel_src)); + flags.push_back(mode); - // Add compiler flags for compiling the kernel + hpx::wait_all(fprogram); - std::vector flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(devices[0].get_device_architecture_major().get())); - mode.append( - std::to_string(devices[0].get_device_architecture_minor().get())); + std::vector programs; + for (size_t i = 0; i < deviceCount; i++) { + programs.push_back(fprogram[i].get()); + dependencies.push_back(programs[i].build(flags, "kernel")); + } - flags.push_back(mode); + std::vector> fbuffer; + for (size_t i = 0; i < deviceCount; i++) { + fbuffer.push_back(devices[i].create_buffer(streamBytes)); + } - hpx::wait_all(fprogram); + hpx::wait_all(fbuffer); - std::vector programs; - for (size_t i = 0; i < deviceCount; i++) - { - programs.push_back(fprogram[i].get()); - dependencies.push_back(programs[i].build(flags, "kernel")); + std::vector bufferIn; + for (size_t i = 0; i < deviceCount; i++) { + bufferIn.push_back(fbuffer[i].get()); + } - } + for (size_t i = 0; i < deviceCount; i++) { + dependencies.push_back( + bufferIn[i].enqueue_write(i * streamSize, streamBytes, in)); + } - std::vector> fbuffer; - for (size_t i = 0; i < deviceCount; i++) - { - fbuffer.push_back(devices[i].create_buffer(streamBytes)); - } - - hpx::wait_all(fbuffer); + std::vector args; + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; - std::vector bufferIn; - for (size_t i = 0; i < deviceCount; i++) - { - bufferIn.push_back(fbuffer[i].get()); - } + // Set the values for the grid dimension + grid.x = streamSize / blockSize; + grid.y = 1; + grid.z = 1; - for (size_t i = 0; i < deviceCount; i++) - { + // Set the values for the block dimension + block.x = blockSize; + block.y = 1; + block.z = 1; - dependencies.push_back(bufferIn[i].enqueue_write(i*streamSize,streamBytes,in)); - } - - std::vector args; - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; + hpx::wait_all(dependencies); - //Set the values for the grid dimension - grid.x = streamSize / blockSize; - grid.y = 1; - grid.z = 1; + std::vector> kernelFutures; + for (size_t i = 0; i < deviceCount; i++) { + args.push_back(bufferIn[i]); +#ifdef HPXCL_CUDA_WITH_STREAMS + kernelFutures.push_back(programs[i].run(args, "kernel", grid, block, 0)); +#else + kernelFutures.push_back(programs[i].run(args, "kernel", grid, block, 0)); +#endif + args.clear(); + } - //Set the values for the block dimension - block.x = blockSize; - block.y = 1; - block.z = 1; + hpx::wait_all(kernelFutures); - hpx::wait_all(dependencies); + time += timer_stop(); - std::vector> kernelFutures; - for (size_t i = 0; i < deviceCount; i++) - { - args.push_back(bufferIn[i]); - #ifdef HPXCL_CUDA_WITH_STREAMS - kernelFutures.push_back(programs[i].run(args, "kernel", grid, block,0)); - #else - kernelFutures.push_back(programs[i].run(args, "kernel", grid, block,0)); - #endif - args.clear(); - } + bool check; + for (size_t i = 0; i < deviceCount; i++) { + TYPE* res = bufferIn[i].enqueue_read_sync(0, streamBytes); + check = checkKernel(res, streamSize); + if (check == false) break; + } - hpx::wait_all(kernelFutures); + timer_start(); - time += timer_stop(); + // Clean + cudaFreeHost(in); + checkCudaError("Free in"); - bool check; - for (size_t i = 0; i < deviceCount; i++) - { - TYPE* res = bufferIn[i].enqueue_read_sync(0,streamBytes); - check = checkKernel(res,streamSize); - if(check==false) break; + std::cout << check << " " << time + timer_stop() << std::endl; - } - - timer_start(); - - //Clean - cudaFreeHost(in); - checkCudaError("Free in"); - - std:: cout << check << " " << time + timer_stop() << std::endl; - - return EXIT_SUCCESS; + return EXIT_SUCCESS; } - diff --git a/benchmark/cuda/stencil/StencilHPX.cpp b/benchmark/cuda/stencil/StencilHPX.cpp index 2dcc7869..5771b488 100644 --- a/benchmark/cuda/stencil/StencilHPX.cpp +++ b/benchmark/cuda/stencil/StencilHPX.cpp @@ -17,148 +17,144 @@ using namespace hpx::cuda; //########################################################################### -//Main +// Main //########################################################################### -int main(int argc, char*argv[]) { - - if (argc != 2) { - std::cout << "Usage: " << argv[0] << " #elements" << std::endl; - exit(1); - } - - double data = 0.; - - timer_start(); - size_t* count; - cudaMallocHost((void**)&count,sizeof(size_t)); - checkCudaError("Malloc count"); - count[0]= atoi(argv[1]); - - std::cout << count[0] << " "; - - //Vector for all futures for the data management - std::vector> data_futures; - - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(1, 0).get(); - data += timer_stop(); - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } - - //Pointer - TYPE* out; - TYPE* in; - TYPE* s; - - //Malloc Host - cudaMallocHost((void**) &out, count[0] * sizeof(TYPE)); - checkCudaError("Mallloc out"); - cudaMallocHost((void**) &in, count[0] * sizeof(TYPE)); - checkCudaError("Malloc in"); - cudaMallocHost((void**) &s, 3 * sizeof(TYPE)); - checkCudaError("Malloc s"); - - //Initialize the data - fillRandomVector(in, count[0]); - s[0] = 0.5; - s[1] = 1.; - s[2] = 0.5; - - // Create a device component from the first device found - device cudaDevice = devices[0]; - - // Create the hello_world device program - hpx::lcos::future < hpx::cuda::program > futureProg = cudaDevice.create_program_with_file( - "stencil_kernel.cu"); - - //Add compiler flags for compiling the kernel - std::vector < std::string > flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(cudaDevice.get_device_architecture_major().get())); - - mode.append( - std::to_string(cudaDevice.get_device_architecture_minor().get())); - - flags.push_back(mode); - flags.push_back("-use_fast_math"); - - // Compile the program - hpx::cuda::program prog = futureProg.get(); - prog.build_sync(flags, "stencil"); - - // Create a buffer - std::vector> futureBuffers; - futureBuffers.push_back(cudaDevice.create_buffer(count[0] * sizeof(TYPE))); - futureBuffers.push_back(cudaDevice.create_buffer(3 * sizeof(TYPE))); - futureBuffers.push_back(cudaDevice.create_buffer(count[0] * sizeof(TYPE))); - futureBuffers.push_back(cudaDevice.create_buffer(sizeof(size_t))); - - hpx::wait_all(futureBuffers); - - buffer inBuffer = futureBuffers[0].get(); - buffer sBuffer = futureBuffers[1].get(); - buffer outBuffer = futureBuffers[2].get(); - buffer lengthbuffer = futureBuffers[3].get(); - - // Copy input data to the buffer - data_futures.push_back( - inBuffer.enqueue_write(0, count[0] * sizeof(TYPE), in)); - data_futures.push_back(sBuffer.enqueue_write(0, 3 * sizeof(TYPE), s)); - data_futures.push_back( - outBuffer.enqueue_write(0, count[0] * sizeof(TYPE), in)); - data_futures.push_back( - lengthbuffer.enqueue_write(0, sizeof(size_t), count)); - - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; - - //Set the values for the grid dimension - grid.x = 1; - grid.y = 0; - grid.z = 0; - - //Set the values for the block dimension - block.x = 32; - block.y = 0; - block.z = 0; - - //Launch kernels - std::vector args; - args.push_back(lengthbuffer); - args.push_back(inBuffer); - args.push_back(outBuffer); - args.push_back(sBuffer); - - data_futures.push_back(prog.run(args, "stencil", grid, block,0)); - - hpx::wait_all(data_futures); - - TYPE* res = outBuffer.enqueue_read_sync(0, sizeof(TYPE)); - - data += timer_stop(); - - //Check the result - std::cout << checkStencil(in, res, s, count[0]) << " "; - - timer_start(); - - //Cleanup - cudaFreeHost(in); - checkCudaError("Free in"); - cudaFreeHost(s); - checkCudaError("Free s"); - cudaFreeHost(out); - checkCudaError("Free out"); - cudaFreeHost(count); - checkCudaError("Free count"); - - std::cout << data + timer_stop() << std::endl; - - return EXIT_SUCCESS; +int main(int argc, char* argv[]) { + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " #elements" << std::endl; + exit(1); + } + + double data = 0.; + + timer_start(); + size_t* count; + cudaMallocHost((void**)&count, sizeof(size_t)); + checkCudaError("Malloc count"); + count[0] = atoi(argv[1]); + + std::cout << count[0] << " "; + + // Vector for all futures for the data management + std::vector> data_futures; + + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(1, 0).get(); + data += timer_stop(); + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } + + // Pointer + TYPE* out; + TYPE* in; + TYPE* s; + + // Malloc Host + cudaMallocHost((void**)&out, count[0] * sizeof(TYPE)); + checkCudaError("Mallloc out"); + cudaMallocHost((void**)&in, count[0] * sizeof(TYPE)); + checkCudaError("Malloc in"); + cudaMallocHost((void**)&s, 3 * sizeof(TYPE)); + checkCudaError("Malloc s"); + + // Initialize the data + fillRandomVector(in, count[0]); + s[0] = 0.5; + s[1] = 1.; + s[2] = 0.5; + + // Create a device component from the first device found + device cudaDevice = devices[0]; + + // Create the hello_world device program + hpx::lcos::future futureProg = + cudaDevice.create_program_with_file("stencil_kernel.cu"); + + // Add compiler flags for compiling the kernel + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); + + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); + + flags.push_back(mode); + flags.push_back("-use_fast_math"); + + // Compile the program + hpx::cuda::program prog = futureProg.get(); + prog.build_sync(flags, "stencil"); + + // Create a buffer + std::vector> futureBuffers; + futureBuffers.push_back(cudaDevice.create_buffer(count[0] * sizeof(TYPE))); + futureBuffers.push_back(cudaDevice.create_buffer(3 * sizeof(TYPE))); + futureBuffers.push_back(cudaDevice.create_buffer(count[0] * sizeof(TYPE))); + futureBuffers.push_back(cudaDevice.create_buffer(sizeof(size_t))); + + hpx::wait_all(futureBuffers); + + buffer inBuffer = futureBuffers[0].get(); + buffer sBuffer = futureBuffers[1].get(); + buffer outBuffer = futureBuffers[2].get(); + buffer lengthbuffer = futureBuffers[3].get(); + + // Copy input data to the buffer + data_futures.push_back( + inBuffer.enqueue_write(0, count[0] * sizeof(TYPE), in)); + data_futures.push_back(sBuffer.enqueue_write(0, 3 * sizeof(TYPE), s)); + data_futures.push_back( + outBuffer.enqueue_write(0, count[0] * sizeof(TYPE), in)); + data_futures.push_back(lengthbuffer.enqueue_write(0, sizeof(size_t), count)); + + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + + // Set the values for the grid dimension + grid.x = 1; + grid.y = 0; + grid.z = 0; + + // Set the values for the block dimension + block.x = 32; + block.y = 0; + block.z = 0; + + // Launch kernels + std::vector args; + args.push_back(lengthbuffer); + args.push_back(inBuffer); + args.push_back(outBuffer); + args.push_back(sBuffer); + + data_futures.push_back(prog.run(args, "stencil", grid, block, 0)); + + hpx::wait_all(data_futures); + + TYPE* res = outBuffer.enqueue_read_sync(0, sizeof(TYPE)); + + data += timer_stop(); + + // Check the result + std::cout << checkStencil(in, res, s, count[0]) << " "; + + timer_start(); + + // Cleanup + cudaFreeHost(in); + checkCudaError("Free in"); + cudaFreeHost(s); + checkCudaError("Free s"); + cudaFreeHost(out); + checkCudaError("Free out"); + cudaFreeHost(count); + checkCudaError("Free count"); + + std::cout << data + timer_stop() << std::endl; + + return EXIT_SUCCESS; } diff --git a/benchmark/cuda/stencil/config.hpp b/benchmark/cuda/stencil/config.hpp index 0c8b5f6f..7a7e0b63 100644 --- a/benchmark/cuda/stencil/config.hpp +++ b/benchmark/cuda/stencil/config.hpp @@ -7,7 +7,7 @@ #define EPS 10e-5 //########################################################################### -//Switching between single precision and double precision +// Switching between single precision and double precision //########################################################################### #ifdef SINGLE diff --git a/benchmark/cuda/stencil/utils.hpp b/benchmark/cuda/stencil/utils.hpp index 29261351..9bfcbe9f 100644 --- a/benchmark/cuda/stencil/utils.hpp +++ b/benchmark/cuda/stencil/utils.hpp @@ -9,42 +9,42 @@ #include #include -template +template void fillRandomVector(T* matrix, size_t size) { - srand(time(NULL)); - - for (size_t i = 0; i < size; i++) { - - matrix[i] = (T) (0.5) * ((T) rand()) / (T) RAND_MAX; - } + srand(time(NULL)); + for (size_t i = 0; i < size; i++) { + matrix[i] = (T)(0.5) * ((T)rand()) / (T)RAND_MAX; + } } -template -bool checkStencil(T*in, T*out, T* s, size_t size) { - bool check = true; - for (size_t i = 1; i < size - 1; ++i) { - T res = s[0] * in[i - 1] + s[1] * in[i] + s[2] * in[i + 1]; +template +bool checkStencil(T* in, T* out, T* s, size_t size) { + bool check = true; + for (size_t i = 1; i < size - 1; ++i) { + T res = s[0] * in[i - 1] + s[1] * in[i] + s[2] * in[i + 1]; - if (abs(res - out[i]) >= EPS) { - check = false; - break; - } - } + if (abs(res - out[i]) >= EPS) { + check = false; + break; + } + } - return check; + return check; } -template -bool checkKernel(T*in,size_t size) { - bool check = true; - for (size_t i = 0; i < size ; ++i) { - - T error = abs(in[i]-1.0f); - if (error > EPS) { check = false; break;} - } - - return check; +template +bool checkKernel(T* in, size_t size) { + bool check = true; + for (size_t i = 0; i < size; ++i) { + T error = abs(in[i] - 1.0f); + if (error > EPS) { + check = false; + break; + } + } + + return check; } #endif /* UTILS_HPP_ */ diff --git a/benchmark/cuda/stream/StreamHPX.cpp b/benchmark/cuda/stream/StreamHPX.cpp index 26bcd963..e34a1f09 100644 --- a/benchmark/cuda/stream/StreamHPX.cpp +++ b/benchmark/cuda/stream/StreamHPX.cpp @@ -14,472 +14,449 @@ using namespace hpx::cuda; #define VERBOSE /////////////////////////////////////////////////////////////////////////////// -double mysecond() { - return hpx::util::high_resolution_clock::now() * 1e-9; -} +double mysecond() { return hpx::util::high_resolution_clock::now() * 1e-9; } /////////////////////////////////////////////////////////////////////////////// int checktick() { - static const std::size_t M = 20; - int minDelta, Delta; - double t1, t2, timesfound[M]; - - // Collect a sequence of M unique time values from the system. - for (std::size_t i = 0; i < M; i++) { - t1 = mysecond(); - while (((t2 = mysecond()) - t1) < 1.0E-6) - ; - timesfound[i] = t1 = t2; - } - - // Determine the minimum difference between these M values. - // This result will be our estimate (in microseconds) for the - // clock granularity. - minDelta = 1000000; - for (std::size_t i = 1; i < M; i++) { - Delta = (int) (1.0E6 * (timesfound[i] - timesfound[i - 1])); - minDelta = (std::min)(minDelta, (std::max)(Delta, 0)); - } - - return (minDelta); + static const std::size_t M = 20; + int minDelta, Delta; + double t1, t2, timesfound[M]; + + // Collect a sequence of M unique time values from the system. + for (std::size_t i = 0; i < M; i++) { + t1 = mysecond(); + while (((t2 = mysecond()) - t1) < 1.0E-6) + ; + timesfound[i] = t1 = t2; + } + + // Determine the minimum difference between these M values. + // This result will be our estimate (in microseconds) for the + // clock granularity. + minDelta = 1000000; + for (std::size_t i = 1; i < M; i++) { + Delta = (int)(1.0E6 * (timesfound[i] - timesfound[i - 1])); + minDelta = (std::min)(minDelta, (std::max)(Delta, 0)); + } + + return (minDelta); } /////////////////////////////////////////////////////////////////////////////// -void check_results(size_t iterations, size_t size, double* a, - double* b, double* c) { - - double aj, bj, cj, scalar; - double aSumErr, bSumErr, cSumErr; - double aAvgErr, bAvgErr, cAvgErr; - double epsilon; - int ierr, err; - - /* reproduce initialization */ - aj = 1.0; - bj = 2.0; - cj = 0.0; - /* a[] is modified during timing check */ - aj = 2.0E0 * aj; - /* now execute timing loop */ - scalar = 3.0; - for (std::size_t k = 0; k < iterations; k++) { - cj = aj; - bj = scalar * cj; - cj = aj + bj; - aj = bj + scalar * cj; - } - - /* accumulate deltas between observed and expected results */ - aSumErr = 0.0; - bSumErr = 0.0; - cSumErr = 0.0; - for (std::size_t j = 0; j < size; j++) { - aSumErr += std::abs(a[j] - aj); - bSumErr += std::abs(b[j] - bj); - cSumErr += std::abs(c[j] - cj); - // if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj); // MCCALPIN - } - aAvgErr = aSumErr / (double) size; - bAvgErr = bSumErr / (double) size; - cAvgErr = cSumErr / (double) size; - - if (sizeof(double) == 4) { - epsilon = 1.e-6; - } else if (sizeof(double) == 8) { - epsilon = 1.e-13; - } else { - printf("WEIRD: sizeof(STREAM_TYPE) = %zu\n", sizeof(double)); - epsilon = 1.e-6; - } - - err = 0; - if (std::abs(aAvgErr / aj) > epsilon) { - err++; - printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", - epsilon); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", aj, - aAvgErr, std::abs(aAvgErr) / aj); - ierr = 0; - for (std::size_t j = 0; j < size; j++) { - if (std::abs(a[j] / aj - 1.0) > epsilon) { - ierr++; +void check_results(size_t iterations, size_t size, double* a, double* b, + double* c) { + double aj, bj, cj, scalar; + double aSumErr, bSumErr, cSumErr; + double aAvgErr, bAvgErr, cAvgErr; + double epsilon; + int ierr, err; + + /* reproduce initialization */ + aj = 1.0; + bj = 2.0; + cj = 0.0; + /* a[] is modified during timing check */ + aj = 2.0E0 * aj; + /* now execute timing loop */ + scalar = 3.0; + for (std::size_t k = 0; k < iterations; k++) { + cj = aj; + bj = scalar * cj; + cj = aj + bj; + aj = bj + scalar * cj; + } + + /* accumulate deltas between observed and expected results */ + aSumErr = 0.0; + bSumErr = 0.0; + cSumErr = 0.0; + for (std::size_t j = 0; j < size; j++) { + aSumErr += std::abs(a[j] - aj); + bSumErr += std::abs(b[j] - bj); + cSumErr += std::abs(c[j] - cj); + // if (j == 417) printf("Index 417: c[j]: %f, cj: %f\n",c[j],cj); // + // MCCALPIN + } + aAvgErr = aSumErr / (double)size; + bAvgErr = bSumErr / (double)size; + cAvgErr = cSumErr / (double)size; + + if (sizeof(double) == 4) { + epsilon = 1.e-6; + } else if (sizeof(double) == 8) { + epsilon = 1.e-13; + } else { + printf("WEIRD: sizeof(STREAM_TYPE) = %zu\n", sizeof(double)); + epsilon = 1.e-6; + } + + err = 0; + if (std::abs(aAvgErr / aj) > epsilon) { + err++; + printf("Failed Validation on array a[], AvgRelAbsErr > epsilon (%e)\n", + epsilon); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", aj, + aAvgErr, std::abs(aAvgErr) / aj); + ierr = 0; + for (std::size_t j = 0; j < size; j++) { + if (std::abs(a[j] / aj - 1.0) > epsilon) { + ierr++; #ifdef VERBOSE - if (ierr < 10) { - printf(" array a: index: %ld, expected: %e, " - "observed: %e, relative error: %e\n", - j,aj,a[j],std::abs((aj-a[j])/aAvgErr)); - } + if (ierr < 10) { + printf( + " array a: index: %ld, expected: %e, " + "observed: %e, relative error: %e\n", + j, aj, a[j], std::abs((aj - a[j]) / aAvgErr)); + } #endif - } - } - printf(" For array a[], %d errors were found.\n", ierr); - } - if (std::abs(bAvgErr / bj) > epsilon) { - err++; - printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", - epsilon); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", bj, - bAvgErr, std::abs(bAvgErr) / bj); - printf(" AvgRelAbsErr > Epsilon (%e)\n", epsilon); - ierr = 0; - for (std::size_t j = 0; j < size; j++) { - if (std::abs(b[j] / bj - 1.0) > epsilon) { - ierr++; + } + } + printf(" For array a[], %d errors were found.\n", ierr); + } + if (std::abs(bAvgErr / bj) > epsilon) { + err++; + printf("Failed Validation on array b[], AvgRelAbsErr > epsilon (%e)\n", + epsilon); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", bj, + bAvgErr, std::abs(bAvgErr) / bj); + printf(" AvgRelAbsErr > Epsilon (%e)\n", epsilon); + ierr = 0; + for (std::size_t j = 0; j < size; j++) { + if (std::abs(b[j] / bj - 1.0) > epsilon) { + ierr++; #ifdef VERBOSE - if (ierr < 10) { - printf(" array b: index: %ld, expected: %e, " - "observed: %e, relative error: %e\n", - j,bj,b[j],std::abs((bj-b[j])/bAvgErr)); - } + if (ierr < 10) { + printf( + " array b: index: %ld, expected: %e, " + "observed: %e, relative error: %e\n", + j, bj, b[j], std::abs((bj - b[j]) / bAvgErr)); + } #endif - } - } - printf(" For array b[], %d errors were found.\n", ierr); - } - if (std::abs(cAvgErr / cj) > epsilon) { - err++; - printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", - epsilon); - printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", cj, - cAvgErr, std::abs(cAvgErr) / cj); - printf(" AvgRelAbsErr > Epsilon (%e)\n", epsilon); - ierr = 0; - for (std::size_t j = 0; j < size; j++) { - if (std::abs(c[j] / cj - 1.0) > epsilon) { - ierr++; + } + } + printf(" For array b[], %d errors were found.\n", ierr); + } + if (std::abs(cAvgErr / cj) > epsilon) { + err++; + printf("Failed Validation on array c[], AvgRelAbsErr > epsilon (%e)\n", + epsilon); + printf(" Expected Value: %e, AvgAbsErr: %e, AvgRelAbsErr: %e\n", cj, + cAvgErr, std::abs(cAvgErr) / cj); + printf(" AvgRelAbsErr > Epsilon (%e)\n", epsilon); + ierr = 0; + for (std::size_t j = 0; j < size; j++) { + if (std::abs(c[j] / cj - 1.0) > epsilon) { + ierr++; #ifdef VERBOSE - if (ierr < 10) { - printf(" array c: index: %ld, expected: %e, " - "observed: %e, relative error: %e\n", - j,cj,c[j],std::abs((cj-c[j])/cAvgErr)); - } + if (ierr < 10) { + printf( + " array c: index: %ld, expected: %e, " + "observed: %e, relative error: %e\n", + j, cj, c[j], std::abs((cj - c[j]) / cAvgErr)); + } #endif - } - } - printf(" For array c[], %d errors were found.\n", ierr); - } - if (err == 0) { - printf( - "Solution Validates: avg error less than %e on all three arrays\n", - epsilon); - } + } + } + printf(" For array c[], %d errors were found.\n", ierr); + } + if (err == 0) { + printf("Solution Validates: avg error less than %e on all three arrays\n", + epsilon); + } #ifdef VERBOSE - printf ("Results Validation Verbose Results: \n"); - printf (" Expected a(1), b(1), c(1): %f %f %f \n",aj,bj,cj); - printf (" Observed a(1), b(1), c(1): %f %f %f \n",a[1],b[1],c[1]); - printf (" Rel Errors on a, b, c: %e %e %e \n",std::abs(aAvgErr/aj), - std::abs(bAvgErr/bj),std::abs(cAvgErr/cj)); + printf("Results Validation Verbose Results: \n"); + printf(" Expected a(1), b(1), c(1): %f %f %f \n", aj, bj, cj); + printf(" Observed a(1), b(1), c(1): %f %f %f \n", a[1], b[1], c[1]); + printf(" Rel Errors on a, b, c: %e %e %e \n", std::abs(aAvgErr / aj), + std::abs(bAvgErr / bj), std::abs(cAvgErr / cj)); #endif - } /////////////////////////////////////////////////////////////////////////////// -std::vector > run_benchmark(size_t iterations, - size_t size) { - - //Vector for all futures for the data management - std::vector> futures; - - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(1, 0).get(); - - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - } - - //Get the cuda device - device cudaDevice = devices[0]; - - //Compile the kernels - - // Create the hello_world device program - program prog = cudaDevice.create_program_with_file("kernels.cu").get(); - - //Add compiler flags for compiling the kernel - std::vector flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(cudaDevice.get_device_architecture_major().get())); - - mode.append( - std::to_string(cudaDevice.get_device_architecture_minor().get())); - - flags.push_back(mode); - flags.push_back("-use_fast_math"); - - std::vector kernels; - kernels.push_back("multiply_step"); - kernels.push_back("add_step"); - kernels.push_back("triad_step"); - - // Compile the program - hpx::wait_all(prog); - auto fProg = prog.build(flags, kernels); - - //Fill the vector - double* a; - double* b; - double* c; - size_t* s; - double* factor; - - cudaMallocHost((void**) &a, sizeof(double) * size); - checkCudaError("Malloc a"); - cudaMallocHost((void**) &b, sizeof(double) * size); - checkCudaError("Malloc b"); - cudaMallocHost((void**) &c, sizeof(double) * size); - checkCudaError("Malloc c"); - cudaMallocHost((void**) &s, sizeof(size_t)); - checkCudaError("Malloc s"); - cudaMallocHost((void**) &factor, sizeof(double)); - checkCudaError("Malloc factor"); - s[0] = size; - factor[0] = 2.; - - for(size_t i = 0 ; i < size ; i++) - { - a[i] = 1.0; - b[i] = 2.0; - c[i] = 0.0; - } - - //Allocate device buffer - hpx::lcos::future faBuffer = cudaDevice.create_buffer(size * sizeof(double)); - hpx::lcos::future fbBuffer = cudaDevice.create_buffer(size * sizeof(double)); - hpx::lcos::future fcBuffer = cudaDevice.create_buffer(size * sizeof(double)); - hpx::lcos::future fsizeBuffer = cudaDevice.create_buffer(sizeof(size_t)); - hpx::lcos::future ffBuffer = cudaDevice.create_buffer(sizeof(double)); - - //Fill device buffer - //hpx::wait_all(faBuffer); - buffer aBuffer = faBuffer.get(); - auto fa = aBuffer.enqueue_write(0, size * sizeof(double), a); - //hpx::wait_all(bBuffer); - buffer bBuffer = fbBuffer.get(); - auto fb = bBuffer.enqueue_write(0, size * sizeof(double), b); - //hpx::wait_all(cBuffer); - buffer cBuffer = fcBuffer.get(); - auto fc = cBuffer.enqueue_write(0, size * sizeof(double), c); - //hpx::wait_all(sizeBuffer); - buffer sizeBuffer = fsizeBuffer.get(); - auto fsize = sizeBuffer.enqueue_write(0, sizeof(size_t), s); - //hpx::wait_all(fBuffer); - buffer fBuffer = ffBuffer.get(); - auto ffactor = fBuffer.enqueue_write(0, sizeof(double), factor); - - futures.push_back(std::move(fa)); - futures.push_back(std::move(fsize)); - futures.push_back(std::move(fProg)); - futures.push_back(std::move(ffactor)); - - //Prepare kernel launch - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; - - size_t threads_per_block = std::min(1024,(int)size); - - //Set the values for the grid dimension - grid.x = (size + threads_per_block - 1) / threads_per_block; - grid.y = 1; - grid.z = 1; - - //Set the values for the block dimension - block.x = threads_per_block; - block.y = 1; - block.z = 1; - - //Prepare buffer arguments - std::vector args; - args.push_back(sizeBuffer); - args.push_back(aBuffer); - args.push_back(aBuffer); - args.push_back(fBuffer); - - hpx::wait_all(futures); - - // Check clock ticks ... - double t = mysecond(); - auto fk = prog.run(args, "multiply_step", grid, block,0); - hpx::wait_all(fk); - t = 1.0E6 * (mysecond() - t); - - // Get initial value for system clock. - int quantum = checktick(); - if (quantum >= 1) { - std::cout << "Your clock granularity/precision appears to be " - << quantum << " microseconds.\n"; - } else { - std::cout - << "Your clock granularity appears to be less than one microsecond.\n"; - quantum = 1; - } - - std::cout << "Each test below will take on the order" << " of " << (int) t - << " microseconds.\n" << " (= " << (int) (t / quantum) - << " clock ticks)\n" - << "Increase the size of the arrays if this shows that\n" - << "you are not getting at least 20 clock ticks per test.\n" - << "-------------------------------------------------------------\n"; - - std::cout << "WARNING -- The above is only a rough guideline.\n" - << "For best results, please be sure you know the\n" - << "precision of your system timer.\n" - << "-------------------------------------------------------------\n"; - - std::vector > timing(4, - std::vector(iterations)); - - factor[0] = 3.; - ffactor = fBuffer.enqueue_write(0, sizeof(double), factor); - hpx::wait_all(ffactor); - - for (std::size_t iteration = 0; iteration != iterations; ++iteration) { - - // Copy - timing[0][iteration] = mysecond(); - double* tmp = aBuffer.enqueue_read_sync(0,size *sizeof(double)); - auto fcopy = cBuffer.enqueue_write(0, size * sizeof(double), tmp); - hpx::wait_all(std::move(fcopy)); - timing[0][iteration] = mysecond() - timing[0][iteration]; - - // Scale - timing[1][iteration] = mysecond(); - args.clear(); - args.push_back(sizeBuffer); - args.push_back(cBuffer); - args.push_back(bBuffer); - args.push_back(fBuffer); - fk = prog.run(args, "multiply_step", grid, block,0); - hpx::wait_all(fk); - timing[1][iteration] = mysecond() - timing[1][iteration]; - - // Add - timing[2][iteration] = mysecond(); - args.clear(); - args.push_back(sizeBuffer); - args.push_back(aBuffer); - args.push_back(bBuffer); - args.push_back(cBuffer); - fk = prog.run(args, "add_step", grid, block,0); - hpx::wait_all(fk); - timing[2][iteration] = mysecond() - timing[2][iteration]; - - // Triad - timing[3][iteration] = mysecond(); - args.clear(); - args.push_back(sizeBuffer); - args.push_back(bBuffer); - args.push_back(cBuffer); - args.push_back(aBuffer); - args.push_back(fBuffer); - fk = prog.run(args, "triad_step", grid, block,0); - hpx::wait_all(fk); - timing[3][iteration] = mysecond() - timing[3][iteration]; - - } - - double* resa = aBuffer.enqueue_read_sync(0,size *sizeof(double)); - double* resb = bBuffer.enqueue_read_sync(0,size *sizeof(double)); - double* resc = cBuffer.enqueue_read_sync(0,size *sizeof(double)); - - - - check_results(iterations, size, resa, resb, resc); - - std::cout - << "-------------------------------------------------------------\n"; - - return timing; +std::vector> run_benchmark(size_t iterations, size_t size) { + // Vector for all futures for the data management + std::vector> futures; + + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(1, 0).get(); + + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + } + + // Get the cuda device + device cudaDevice = devices[0]; + + // Compile the kernels + + // Create the hello_world device program + program prog = cudaDevice.create_program_with_file("kernels.cu").get(); + + // Add compiler flags for compiling the kernel + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); + + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); + + flags.push_back(mode); + flags.push_back("-use_fast_math"); + + std::vector kernels; + kernels.push_back("multiply_step"); + kernels.push_back("add_step"); + kernels.push_back("triad_step"); + + // Compile the program + hpx::wait_all(prog); + auto fProg = prog.build(flags, kernels); + + // Fill the vector + double* a; + double* b; + double* c; + size_t* s; + double* factor; + + cudaMallocHost((void**)&a, sizeof(double) * size); + checkCudaError("Malloc a"); + cudaMallocHost((void**)&b, sizeof(double) * size); + checkCudaError("Malloc b"); + cudaMallocHost((void**)&c, sizeof(double) * size); + checkCudaError("Malloc c"); + cudaMallocHost((void**)&s, sizeof(size_t)); + checkCudaError("Malloc s"); + cudaMallocHost((void**)&factor, sizeof(double)); + checkCudaError("Malloc factor"); + s[0] = size; + factor[0] = 2.; + + for (size_t i = 0; i < size; i++) { + a[i] = 1.0; + b[i] = 2.0; + c[i] = 0.0; + } + + // Allocate device buffer + hpx::lcos::future faBuffer = + cudaDevice.create_buffer(size * sizeof(double)); + hpx::lcos::future fbBuffer = + cudaDevice.create_buffer(size * sizeof(double)); + hpx::lcos::future fcBuffer = + cudaDevice.create_buffer(size * sizeof(double)); + hpx::lcos::future fsizeBuffer = + cudaDevice.create_buffer(sizeof(size_t)); + hpx::lcos::future ffBuffer = + cudaDevice.create_buffer(sizeof(double)); + + // Fill device buffer + // hpx::wait_all(faBuffer); + buffer aBuffer = faBuffer.get(); + auto fa = aBuffer.enqueue_write(0, size * sizeof(double), a); + // hpx::wait_all(bBuffer); + buffer bBuffer = fbBuffer.get(); + auto fb = bBuffer.enqueue_write(0, size * sizeof(double), b); + // hpx::wait_all(cBuffer); + buffer cBuffer = fcBuffer.get(); + auto fc = cBuffer.enqueue_write(0, size * sizeof(double), c); + // hpx::wait_all(sizeBuffer); + buffer sizeBuffer = fsizeBuffer.get(); + auto fsize = sizeBuffer.enqueue_write(0, sizeof(size_t), s); + // hpx::wait_all(fBuffer); + buffer fBuffer = ffBuffer.get(); + auto ffactor = fBuffer.enqueue_write(0, sizeof(double), factor); + + futures.push_back(std::move(fa)); + futures.push_back(std::move(fsize)); + futures.push_back(std::move(fProg)); + futures.push_back(std::move(ffactor)); + + // Prepare kernel launch + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + + size_t threads_per_block = std::min(1024, (int)size); + + // Set the values for the grid dimension + grid.x = (size + threads_per_block - 1) / threads_per_block; + grid.y = 1; + grid.z = 1; + + // Set the values for the block dimension + block.x = threads_per_block; + block.y = 1; + block.z = 1; + + // Prepare buffer arguments + std::vector args; + args.push_back(sizeBuffer); + args.push_back(aBuffer); + args.push_back(aBuffer); + args.push_back(fBuffer); + + hpx::wait_all(futures); + + // Check clock ticks ... + double t = mysecond(); + auto fk = prog.run(args, "multiply_step", grid, block, 0); + hpx::wait_all(fk); + t = 1.0E6 * (mysecond() - t); + + // Get initial value for system clock. + int quantum = checktick(); + if (quantum >= 1) { + std::cout << "Your clock granularity/precision appears to be " << quantum + << " microseconds.\n"; + } else { + std::cout + << "Your clock granularity appears to be less than one microsecond.\n"; + quantum = 1; + } + + std::cout + << "Each test below will take on the order" + << " of " << (int)t << " microseconds.\n" + << " (= " << (int)(t / quantum) << " clock ticks)\n" + << "Increase the size of the arrays if this shows that\n" + << "you are not getting at least 20 clock ticks per test.\n" + << "-------------------------------------------------------------\n"; + + std::cout + << "WARNING -- The above is only a rough guideline.\n" + << "For best results, please be sure you know the\n" + << "precision of your system timer.\n" + << "-------------------------------------------------------------\n"; + + std::vector> timing(4, std::vector(iterations)); + + factor[0] = 3.; + ffactor = fBuffer.enqueue_write(0, sizeof(double), factor); + hpx::wait_all(ffactor); + + for (std::size_t iteration = 0; iteration != iterations; ++iteration) { + // Copy + timing[0][iteration] = mysecond(); + double* tmp = aBuffer.enqueue_read_sync(0, size * sizeof(double)); + auto fcopy = cBuffer.enqueue_write(0, size * sizeof(double), tmp); + hpx::wait_all(std::move(fcopy)); + timing[0][iteration] = mysecond() - timing[0][iteration]; + + // Scale + timing[1][iteration] = mysecond(); + args.clear(); + args.push_back(sizeBuffer); + args.push_back(cBuffer); + args.push_back(bBuffer); + args.push_back(fBuffer); + fk = prog.run(args, "multiply_step", grid, block, 0); + hpx::wait_all(fk); + timing[1][iteration] = mysecond() - timing[1][iteration]; + + // Add + timing[2][iteration] = mysecond(); + args.clear(); + args.push_back(sizeBuffer); + args.push_back(aBuffer); + args.push_back(bBuffer); + args.push_back(cBuffer); + fk = prog.run(args, "add_step", grid, block, 0); + hpx::wait_all(fk); + timing[2][iteration] = mysecond() - timing[2][iteration]; + + // Triad + timing[3][iteration] = mysecond(); + args.clear(); + args.push_back(sizeBuffer); + args.push_back(bBuffer); + args.push_back(cBuffer); + args.push_back(aBuffer); + args.push_back(fBuffer); + fk = prog.run(args, "triad_step", grid, block, 0); + hpx::wait_all(fk); + timing[3][iteration] = mysecond() - timing[3][iteration]; + } + + double* resa = aBuffer.enqueue_read_sync(0, size * sizeof(double)); + double* resb = bBuffer.enqueue_read_sync(0, size * sizeof(double)); + double* resc = cBuffer.enqueue_read_sync(0, size * sizeof(double)); + + check_results(iterations, size, resa, resb, resc); + + std::cout + << "-------------------------------------------------------------\n"; + + return timing; } /////////////////////////////////////////////////////////////////////////////// -//Main +// Main /////////////////////////////////////////////////////////////////////////////// -int main(int argc, char*argv[]) { - - if (argc != 3) { - std::cout << "Usage: " << argv[0] << " #elements #iterations" << std::endl; - exit(1); - } - - size_t size = atoi(argv[1]); - size_t iterations = atoi(argv[2]); - - std::cout - << "-------------------------------------------------------------\n" - << "Modified STREAM bechmark based on\nHPX version: " - << hpx::build_string() << "\n" - << "-------------------------------------------------------------\n" - << "This system uses " << sizeof(double) - << " bytes per array element.\n" - << "Memory per array = " - << sizeof(double) * (size / 1024. / 1024.) << " MiB " - << "(= " - << sizeof(double) * (size / 1024. / 1024. / 1024.) - << " GiB).\n" - << "-------------------------------------------------------------\n" - << "Each kernel will be executed " << iterations << " times.\n" - << " The *best* time for each kernel (excluding the first iteration)\n" - << " will be used to compute the reported bandwidth.\n" - << "-------------------------------------------------------------\n" - << "Number of Threads requested = " - << hpx::get_os_thread_count() << "\n" - << "-------------------------------------------------------------\n"; - - double time_total = mysecond(); - std::vector > timing; - timing = run_benchmark(iterations,size); - time_total = mysecond() - time_total; - - /* --- SUMMARY --- */ - const char *label[4] = { - "Copy: ", - "Scale: ", - "Add: ", - "Triad: " - }; - - const double bytes[4] = { - (double)(2 * sizeof(double) * size), - (double)(2 * sizeof(double) * size), - (double)(3 * sizeof(double) * size), - (double)(3 * sizeof(double) * size) - }; - - // Note: skip first iteration - std::vector avgtime(4, 0.0); - std::vector mintime(4, (std::numeric_limits::max)()); - std::vector maxtime(4, 0.0); - for(std::size_t iteration = 1; iteration != iterations; ++iteration) - { - for (std::size_t j=0; j<4; j++) - { - avgtime[j] = avgtime[j] + timing[j][iteration]; - mintime[j] = (std::min)(mintime[j], timing[j][iteration]); - maxtime[j] = (std::max)(maxtime[j], timing[j][iteration]); - } - } - - printf("Function Best Rate MB/s Avg time Min time Max time\n"); - for (std::size_t j=0; j<4; j++) { - avgtime[j] = avgtime[j]/(double)(iterations-1); - - printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], - 1.0E-06 * bytes[j]/mintime[j], - avgtime[j], - mintime[j], - maxtime[j]); - } - - std::cout - << "\nTotal time: " << time_total - << " (per iteration: " << time_total/iterations << ")\n"; - - std::cout - << "-------------------------------------------------------------\n" - ; - - return EXIT_SUCCESS; +int main(int argc, char* argv[]) { + if (argc != 3) { + std::cout << "Usage: " << argv[0] << " #elements #iterations" << std::endl; + exit(1); + } + + size_t size = atoi(argv[1]); + size_t iterations = atoi(argv[2]); + + std::cout + << "-------------------------------------------------------------\n" + << "Modified STREAM bechmark based on\nHPX version: " + << hpx::build_string() << "\n" + << "-------------------------------------------------------------\n" + << "This system uses " << sizeof(double) << " bytes per array element.\n" + << "Memory per array = " << sizeof(double) * (size / 1024. / 1024.) + << " MiB " + << "(= " << sizeof(double) * (size / 1024. / 1024. / 1024.) << " GiB).\n" + << "-------------------------------------------------------------\n" + << "Each kernel will be executed " << iterations << " times.\n" + << " The *best* time for each kernel (excluding the first iteration)\n" + << " will be used to compute the reported bandwidth.\n" + << "-------------------------------------------------------------\n" + << "Number of Threads requested = " << hpx::get_os_thread_count() << "\n" + << "-------------------------------------------------------------\n"; + + double time_total = mysecond(); + std::vector> timing; + timing = run_benchmark(iterations, size); + time_total = mysecond() - time_total; + + /* --- SUMMARY --- */ + const char* label[4] = { + "Copy: ", "Scale: ", "Add: ", "Triad: "}; + + const double bytes[4] = { + (double)(2 * sizeof(double) * size), (double)(2 * sizeof(double) * size), + (double)(3 * sizeof(double) * size), (double)(3 * sizeof(double) * size)}; + + // Note: skip first iteration + std::vector avgtime(4, 0.0); + std::vector mintime(4, (std::numeric_limits::max)()); + std::vector maxtime(4, 0.0); + for (std::size_t iteration = 1; iteration != iterations; ++iteration) { + for (std::size_t j = 0; j < 4; j++) { + avgtime[j] = avgtime[j] + timing[j][iteration]; + mintime[j] = (std::min)(mintime[j], timing[j][iteration]); + maxtime[j] = (std::max)(maxtime[j], timing[j][iteration]); + } + } + + printf("Function Best Rate MB/s Avg time Min time Max time\n"); + for (std::size_t j = 0; j < 4; j++) { + avgtime[j] = avgtime[j] / (double)(iterations - 1); + + printf("%s%12.1f %11.6f %11.6f %11.6f\n", label[j], + 1.0E-06 * bytes[j] / mintime[j], avgtime[j], mintime[j], maxtime[j]); + } + + std::cout << "\nTotal time: " << time_total + << " (per iteration: " << time_total / iterations << ")\n"; + + std::cout + << "-------------------------------------------------------------\n"; + + return EXIT_SUCCESS; } diff --git a/benchmark/opencl/dgemm/dgemmHPX.cpp b/benchmark/opencl/dgemm/dgemmHPX.cpp index e757cb24..5ac29016 100644 --- a/benchmark/opencl/dgemm/dgemmHPX.cpp +++ b/benchmark/opencl/dgemm/dgemmHPX.cpp @@ -12,202 +12,207 @@ using namespace hpx::opencl; -static const char dgemm_src_str[] = -" \n" -" __kernel void dgemm(__global double *A,__global double *B, __global double *C,__global int *m,__global int *n,__global int *k,__global double *alpha,__global double *beta) \n" -" { \n" -" int ROW = get_global_id(1); \n" -" int COL = get_global_id(0); \n" -" \n" -" if(ROW<(n[0]) && COL<(m[0])){ \n" -" double sum = 0.0; \n" -" for(int i = 0;i buffer_type; typedef hpx::serialization::serialize_buffer buffer_data_type; typedef hpx::serialization::serialize_buffer buffer_parameter_type; -static buffer_type dgemm_src( dgemm_src_str, - sizeof(dgemm_src_str), - buffer_type::init_mode::reference ); - +static buffer_type dgemm_src(dgemm_src_str, sizeof(dgemm_src_str), + buffer_type::init_mode::reference); -int main(int argc, char* argv[]) -{ - - if (argc != 4) { - std::cout << "Usage: " << argv[0] << " #m #n #k"; - exit(1); - } +int main(int argc, char *argv[]) { + if (argc != 4) { + std::cout << "Usage: " << argv[0] << " #m #n #k"; + exit(1); + } - int *m,*n,*k,i; - - //allocating memory for the vectors - m = new int[1]; - n = new int[1]; - k = new int[1]; - - //Initilizing the matrix dimensions - m[0] = atoi(argv[1]); - n[0] = atoi(argv[2]); - k[0] = atoi(argv[3]); + int *m, *n, *k, i; - double time = 0; - timer_start(); - - // Get available OpenCL Devices. - std::vector devices = create_all_devices(CL_DEVICE_TYPE_ALL, - "OpenCL 1.1" ).get(); - - // Check if any devices are available - if(devices.size() < 1) - { - hpx::cerr << "No OpenCL devices found!" << hpx::endl; - return hpx::finalize(); - } - - - double *alpha, *beta; - - alpha = new double[1]; - beta = new double[1]; - - // Create a device component from the first device found - device cldevice = devices[0]; - - double *A, *B, *C; - - A = new double[m[0]*k[0]]; - B = new double[k[0]*n[0]]; - C = new double[m[0]*n[0]]; - - //initializing values of alpha and beta - alpha[0] = 1.0; - beta[0] = 0.0; - - time+=timer_stop(); - //printf (" Intializing matrix data \n\n"); - timer_start(); - for (i = 0; i < (m[0]*k[0]); i++) { - A[i] = (double)(i+1); - } - - for (i = 0; i < (k[0]*n[0]); i++) { - B[i] = (double)(-i-1); - } - - for (i = 0; i < (m[0]*n[0]); i++) { - C[i] = 0.0; - } - - //creating buffers - buffer ABuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, m[0]*k[0]*sizeof( double )); - buffer BBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, n[0]*k[0]*sizeof( double )); - buffer CBuffer = cldevice.create_buffer(CL_MEM_READ_WRITE, m[0]*n[0]*sizeof( double )); - buffer alphaBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(double)); - buffer betaBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(double)); - buffer mBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY,sizeof(int)); - buffer nBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); - buffer kBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); - - // Initialize a list of future events for asynchronous set_arg calls - std::vector> set_arg_futures; - std::vector> write_futures; - - // Create the hello_world device program - program prog = cldevice.create_program_with_source(dgemm_src); - - //Build the program - prog.build(); - - buffer_data_type A_serialized( - A, m[0]*k[0], - buffer_data_type::init_mode::reference); - - buffer_data_type B_serialized( - B, k[0]*n[0], - buffer_data_type::init_mode::reference); - - buffer_data_type C_serialized( - C, m[0]*n[0], - buffer_data_type::init_mode::reference); - - buffer_data_type alpha_serialized( - alpha, 1, - buffer_data_type::init_mode::reference); - - buffer_data_type beta_serialized( - beta, 1, - buffer_data_type::init_mode::reference); - - buffer_parameter_type m_serialized( - m, 1, - buffer_parameter_type::init_mode::reference); - - buffer_parameter_type n_serialized( - n, 1, - buffer_parameter_type::init_mode::reference); - - buffer_parameter_type k_serialized( - k, 1, - buffer_parameter_type::init_mode::reference); - - //Write data to the buffers - write_futures.push_back(ABuffer.enqueue_write(0, A_serialized)); - write_futures.push_back(BBuffer.enqueue_write(0, B_serialized)); - write_futures.push_back(CBuffer.enqueue_write(0, C_serialized)); - write_futures.push_back(alphaBuffer.enqueue_write(0, alpha_serialized)); - write_futures.push_back(betaBuffer.enqueue_write(0, beta_serialized)); - write_futures.push_back(mBuffer.enqueue_write(0, m_serialized)); - write_futures.push_back(nBuffer.enqueue_write(0, n_serialized)); - write_futures.push_back(kBuffer.enqueue_write(0, k_serialized)); - - // wait for function calls to trigger - hpx::wait_all( write_futures ); - - //Creating the kernal - kernel dgemm_kernel = prog.create_kernel("dgemm"); - - //Set buffers as arguments - set_arg_futures.push_back(dgemm_kernel.set_arg_async(0, ABuffer)); - set_arg_futures.push_back(dgemm_kernel.set_arg_async(1, BBuffer)); - set_arg_futures.push_back(dgemm_kernel.set_arg_async(2, CBuffer)); - set_arg_futures.push_back(dgemm_kernel.set_arg_async(3, mBuffer)); - set_arg_futures.push_back(dgemm_kernel.set_arg_async(4, nBuffer)); - set_arg_futures.push_back(dgemm_kernel.set_arg_async(5, kBuffer)); - set_arg_futures.push_back(dgemm_kernel.set_arg_async(6, alphaBuffer)); - set_arg_futures.push_back(dgemm_kernel.set_arg_async(7, betaBuffer)); - - // wait for function calls to trigger - hpx::wait_all( set_arg_futures ); - - // Run the kernel - hpx::opencl::work_size<2> dim; - dim[0].offset = 0; - dim[0].size = (int)(std::pow(2,std::ceil(std::log(m[0])/std::log(2)))); - dim[0].local_size = 32; - dim[1].offset = 0; - dim[1].size = (int)(std::pow(2,std::ceil(std::log(n[0])/std::log(2)))); - dim[1].local_size = 32; - - hpx::future kernel_future = dgemm_kernel.enqueue(dim); - - // Start reading the buffer ( With kernel_future as dependency. - // All hpxcl enqueue calls are nonblocking. ) - auto read_future = CBuffer.enqueue_read(0, C_serialized, kernel_future); - - // Wait for the data to arrive - auto data = read_future.get(); - - //Printing the end timing result - time+=timer_stop(); - std:: cout << time << std::endl; - - return 0; + // allocating memory for the vectors + m = new int[1]; + n = new int[1]; + k = new int[1]; + + // Initilizing the matrix dimensions + m[0] = atoi(argv[1]); + n[0] = atoi(argv[2]); + k[0] = atoi(argv[3]); + + double time = 0; + timer_start(); + + // Get available OpenCL Devices. + std::vector devices = + create_all_devices(CL_DEVICE_TYPE_ALL, "OpenCL 1.1").get(); + + // Check if any devices are available + if (devices.size() < 1) { + hpx::cerr << "No OpenCL devices found!" << hpx::endl; + return hpx::finalize(); + } + + double *alpha, *beta; + + alpha = new double[1]; + beta = new double[1]; + + // Create a device component from the first device found + device cldevice = devices[0]; + + double *A, *B, *C; + + A = new double[m[0] * k[0]]; + B = new double[k[0] * n[0]]; + C = new double[m[0] * n[0]]; + + // initializing values of alpha and beta + alpha[0] = 1.0; + beta[0] = 0.0; + + time += timer_stop(); + // printf (" Intializing matrix data \n\n"); + timer_start(); + for (i = 0; i < (m[0] * k[0]); i++) { + A[i] = (double)(i + 1); + } + + for (i = 0; i < (k[0] * n[0]); i++) { + B[i] = (double)(-i - 1); + } + + for (i = 0; i < (m[0] * n[0]); i++) { + C[i] = 0.0; + } + + // creating buffers + buffer ABuffer = + cldevice.create_buffer(CL_MEM_READ_ONLY, m[0] * k[0] * sizeof(double)); + buffer BBuffer = + cldevice.create_buffer(CL_MEM_READ_ONLY, n[0] * k[0] * sizeof(double)); + buffer CBuffer = + cldevice.create_buffer(CL_MEM_READ_WRITE, m[0] * n[0] * sizeof(double)); + buffer alphaBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(double)); + buffer betaBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(double)); + buffer mBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); + buffer nBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); + buffer kBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); + + // Initialize a list of future events for asynchronous set_arg calls + std::vector> set_arg_futures; + std::vector> write_futures; + + // Create the hello_world device program + program prog = cldevice.create_program_with_source(dgemm_src); + + // Build the program + prog.build(); + + buffer_data_type A_serialized(A, m[0] * k[0], + buffer_data_type::init_mode::reference); + + buffer_data_type B_serialized(B, k[0] * n[0], + buffer_data_type::init_mode::reference); + + buffer_data_type C_serialized(C, m[0] * n[0], + buffer_data_type::init_mode::reference); + + buffer_data_type alpha_serialized(alpha, 1, + buffer_data_type::init_mode::reference); + + buffer_data_type beta_serialized(beta, 1, + buffer_data_type::init_mode::reference); + + buffer_parameter_type m_serialized( + m, 1, buffer_parameter_type::init_mode::reference); + + buffer_parameter_type n_serialized( + n, 1, buffer_parameter_type::init_mode::reference); + + buffer_parameter_type k_serialized( + k, 1, buffer_parameter_type::init_mode::reference); + + // Write data to the buffers + write_futures.push_back(ABuffer.enqueue_write(0, A_serialized)); + write_futures.push_back(BBuffer.enqueue_write(0, B_serialized)); + write_futures.push_back(CBuffer.enqueue_write(0, C_serialized)); + write_futures.push_back(alphaBuffer.enqueue_write(0, alpha_serialized)); + write_futures.push_back(betaBuffer.enqueue_write(0, beta_serialized)); + write_futures.push_back(mBuffer.enqueue_write(0, m_serialized)); + write_futures.push_back(nBuffer.enqueue_write(0, n_serialized)); + write_futures.push_back(kBuffer.enqueue_write(0, k_serialized)); + + // wait for function calls to trigger + hpx::wait_all(write_futures); + + // Creating the kernal + kernel dgemm_kernel = prog.create_kernel("dgemm"); + + // Set buffers as arguments + set_arg_futures.push_back(dgemm_kernel.set_arg_async(0, ABuffer)); + set_arg_futures.push_back(dgemm_kernel.set_arg_async(1, BBuffer)); + set_arg_futures.push_back(dgemm_kernel.set_arg_async(2, CBuffer)); + set_arg_futures.push_back(dgemm_kernel.set_arg_async(3, mBuffer)); + set_arg_futures.push_back(dgemm_kernel.set_arg_async(4, nBuffer)); + set_arg_futures.push_back(dgemm_kernel.set_arg_async(5, kBuffer)); + set_arg_futures.push_back(dgemm_kernel.set_arg_async(6, alphaBuffer)); + set_arg_futures.push_back(dgemm_kernel.set_arg_async(7, betaBuffer)); + + // wait for function calls to trigger + hpx::wait_all(set_arg_futures); + + // Run the kernel + hpx::opencl::work_size<2> dim; + dim[0].offset = 0; + dim[0].size = (int)(std::pow(2, std::ceil(std::log(m[0]) / std::log(2)))); + dim[0].local_size = 32; + dim[1].offset = 0; + dim[1].size = (int)(std::pow(2, std::ceil(std::log(n[0]) / std::log(2)))); + dim[1].local_size = 32; + + hpx::future kernel_future = dgemm_kernel.enqueue(dim); + + // Start reading the buffer ( With kernel_future as dependency. + // All hpxcl enqueue calls are nonblocking. ) + auto read_future = CBuffer.enqueue_read(0, C_serialized, kernel_future); + + // Wait for the data to arrive + auto data = read_future.get(); + + // Printing the end timing result + time += timer_stop(); + std::cout << time << std::endl; + + return 0; } \ No newline at end of file diff --git a/benchmark/opencl/smvp/smvpHPX.cpp b/benchmark/opencl/smvp/smvpHPX.cpp index cf429682..4bc8b014 100644 --- a/benchmark/opencl/smvp/smvpHPX.cpp +++ b/benchmark/opencl/smvp/smvpHPX.cpp @@ -12,241 +12,266 @@ using namespace hpx::opencl; -static const char smvp_src_str[] = -" \n" -"__kernel void smvp(__global double *A_data,__global int *A_indices, __global int *A_pointers, \n" -"__global double *B, __global double *C, __global int *m, __global int *n, __global int *count,\n" -"__global double *alpha) \n" -"{ \n" -" int ROW = get_global_id(0); \n" -" \n" -" if(ROW buffer_type; typedef hpx::serialization::serialize_buffer buffer_data_type; typedef hpx::serialization::serialize_buffer buffer_parameter_type; -static buffer_type smvp_src( smvp_src_str, - sizeof(smvp_src_str), - buffer_type::init_mode::reference ); +static buffer_type smvp_src(smvp_src_str, sizeof(smvp_src_str), + buffer_type::init_mode::reference); //########################################################################### -//Main +// Main //########################################################################### -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { + if (argc != 3) { + std::cout << "Usage: " << argv[0] << " #m #n"; + exit(1); + } - if (argc != 3) { - std::cout << "Usage: " << argv[0] << " #m #n"; - exit(1); - } + int *m, *n, i; - int *m,*n,i; + // allocating memory for the vectors + m = new int[1]; + n = new int[1]; - //allocating memory for the vectors - m = new int[1]; - n = new int[1]; + m[0] = atoi(argv[1]); + n[0] = atoi(argv[2]); - m[0] = atoi(argv[1]); - n[0] = atoi(argv[2]); + double time = 0; + timer_start(); - double time = 0; - timer_start(); + // Get available OpenCL Devices. + std::vector devices = + create_all_devices(CL_DEVICE_TYPE_ALL, "OpenCL 1.1").get(); - // Get available OpenCL Devices. - std::vector devices = create_all_devices(CL_DEVICE_TYPE_ALL, - "OpenCL 1.1" ).get(); + // Check if any devices are available + if (devices.size() < 1) { + hpx::cerr << "No OpenCL devices found!" << hpx::endl; + return hpx::finalize(); + } - // Check if any devices are available - if(devices.size() < 1) - { - hpx::cerr << "No OpenCL devices found!" << hpx::endl; - return hpx::finalize(); - } + double *alpha; + int *count; + + alpha = new double[1]; + count = new int[1]; + + // Create a device component from the first device found + device cldevice = devices[0]; + + // Create the hello_world device program + program prog = cldevice.create_program_with_source(smvp_src); + + // Build the program + auto program_future = prog.build_async(); + + double *A, *B, *C; - double *alpha; - int *count; - - alpha = new double[1]; - count = new int[1]; - - // Create a device component from the first device found - device cldevice = devices[0]; - - // Create the hello_world device program - program prog = cldevice.create_program_with_source(smvp_src); - - //Build the program - auto program_future = prog.build_async(); - - - double *A, *B, *C; - - double *A_data; - int *A_indices, *A_pointers; - - A = new double[m[0]*n[0]]; - B = new double[n[0]]; - C = new double[m[0]]; - - //initializing values of alpha and beta - alpha[0] = 1.0; - count[0] = 0; - - //Input can be anything sparse - for (i = 0; i < (m[0]*n[0]); i++) { - if((i%n[0]) == 0){ - A[i] = (double)(i+1); - count[0]++; - } - } - - A_data = new double[count[0]]; - A_indices = new int[count[0]]; - A_pointers = new int[m[0]]; - - - for (i = 0; i < (1*n[0]); i++) { - B[i] = (double)(-i-1); - } - - for (i = 0; i < (m[0]*1); i++) { - C[i] = 0.0; - } - - //Counters for compression - int data_counter = 0; - int index_counter = 0; - int pointer_counter = -1; - - //Compressing Matrix A - for (i = 0; i < (m[0]*n[0]); i++) { - if(A[i] != 0) - { - A_data[data_counter++] = A[i]; - if(((int)i/n[0]) != pointer_counter) - A_pointers[++pointer_counter] = index_counter; - A_indices[index_counter++] = (i%n[0]); - } - } - - //creating buffers - buffer ADataBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, (count[0])*sizeof( double )); - buffer AIndexBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, (count[0])*sizeof( int )); - buffer APointerBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, m[0]*sizeof( int )); - - buffer BBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, n[0]*sizeof( double )); - buffer CBuffer = cldevice.create_buffer(CL_MEM_READ_WRITE, m[0]*sizeof( double )); - buffer alphaBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(double)); - buffer countBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); - buffer mBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY,sizeof(int)); - buffer nBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); - - // Initialize a list of future events for asynchronous set_arg calls - std::vector> set_arg_futures; - std::vector> write_futures; - - buffer_data_type AData_serialized( - A_data, (*count), - buffer_data_type::init_mode::reference); - - buffer_parameter_type AIndex_serialized( - A_indices, (*count), - buffer_parameter_type::init_mode::reference); - - buffer_parameter_type APointer_serialized( - A_pointers, m[0], - buffer_parameter_type::init_mode::reference); - - buffer_data_type B_serialized( - B, n[0], - buffer_data_type::init_mode::reference); - - buffer_data_type C_serialized( - C, m[0], - buffer_data_type::init_mode::reference); - - buffer_data_type alpha_serialized( - alpha, 1, - buffer_data_type::init_mode::reference); - - buffer_parameter_type count_serialized( - count, 1, - buffer_parameter_type::init_mode::reference); - - buffer_parameter_type m_serialized( - m, 1, - buffer_parameter_type::init_mode::reference); - - buffer_parameter_type n_serialized( - n, 1, - buffer_parameter_type::init_mode::reference); - - //Write data to the buffers - write_futures.push_back(ADataBuffer.enqueue_write(0, AData_serialized)); - write_futures.push_back(AIndexBuffer.enqueue_write(0, AIndex_serialized)); - write_futures.push_back(APointerBuffer.enqueue_write(0, APointer_serialized)); - - write_futures.push_back(BBuffer.enqueue_write(0, B_serialized)); - write_futures.push_back(CBuffer.enqueue_write(0, C_serialized)); - write_futures.push_back(alphaBuffer.enqueue_write(0, alpha_serialized)); - write_futures.push_back(mBuffer.enqueue_write(0, m_serialized)); - write_futures.push_back(nBuffer.enqueue_write(0, n_serialized)); - write_futures.push_back(countBuffer.enqueue_write(0, count_serialized)); - - // wait for function calls to trigger - hpx::wait_all( write_futures ); - - hpx::wait_all(program_future); - - //Creating the kernal - kernel smvp_kernel = prog.create_kernel("smvp"); - - //Set buffers as arguments - set_arg_futures.push_back(smvp_kernel.set_arg_async(0, ADataBuffer)); - set_arg_futures.push_back(smvp_kernel.set_arg_async(1, AIndexBuffer)); - set_arg_futures.push_back(smvp_kernel.set_arg_async(2, APointerBuffer)); - set_arg_futures.push_back(smvp_kernel.set_arg_async(3, BBuffer)); - set_arg_futures.push_back(smvp_kernel.set_arg_async(4, CBuffer)); - set_arg_futures.push_back(smvp_kernel.set_arg_async(5, mBuffer)); - set_arg_futures.push_back(smvp_kernel.set_arg_async(6, nBuffer)); - set_arg_futures.push_back(smvp_kernel.set_arg_async(7, countBuffer)); - set_arg_futures.push_back(smvp_kernel.set_arg_async(8, alphaBuffer)); - - // wait for function calls to trigger - hpx::wait_all( set_arg_futures ); - - // Run the kernel - hpx::opencl::work_size<1> dim; - dim[0].offset = 0; - dim[0].size = (int)(std::pow(2,std::ceil(std::log(m[0])/std::log(2)))); - dim[0].local_size = 32; - - hpx::future kernel_future = smvp_kernel.enqueue(dim); - - // Start reading the buffer ( With kernel_future as dependency. - // All hpxcl enqueue calls are nonblocking. ) - auto read_future = CBuffer.enqueue_read(0, C_serialized, kernel_future); - - // Wait for the data to arrive - auto data = read_future.get(); - - //Printing the end timing result - time+=timer_stop(); - std:: cout << time << std::endl; - - return 0; + double *A_data; + int *A_indices, *A_pointers; + + A = new double[m[0] * n[0]]; + B = new double[n[0]]; + C = new double[m[0]]; + + // initializing values of alpha and beta + alpha[0] = 1.0; + count[0] = 0; + + // Input can be anything sparse + for (i = 0; i < (m[0] * n[0]); i++) { + if ((i % n[0]) == 0) { + A[i] = (double)(i + 1); + count[0]++; + } + } + + A_data = new double[count[0]]; + A_indices = new int[count[0]]; + A_pointers = new int[m[0]]; + + for (i = 0; i < (1 * n[0]); i++) { + B[i] = (double)(-i - 1); + } + + for (i = 0; i < (m[0] * 1); i++) { + C[i] = 0.0; + } + + // Counters for compression + int data_counter = 0; + int index_counter = 0; + int pointer_counter = -1; + + // Compressing Matrix A + for (i = 0; i < (m[0] * n[0]); i++) { + if (A[i] != 0) { + A_data[data_counter++] = A[i]; + if (((int)i / n[0]) != pointer_counter) + A_pointers[++pointer_counter] = index_counter; + A_indices[index_counter++] = (i % n[0]); + } + } + + // creating buffers + buffer ADataBuffer = + cldevice.create_buffer(CL_MEM_READ_ONLY, (count[0]) * sizeof(double)); + buffer AIndexBuffer = + cldevice.create_buffer(CL_MEM_READ_ONLY, (count[0]) * sizeof(int)); + buffer APointerBuffer = + cldevice.create_buffer(CL_MEM_READ_ONLY, m[0] * sizeof(int)); + + buffer BBuffer = + cldevice.create_buffer(CL_MEM_READ_ONLY, n[0] * sizeof(double)); + buffer CBuffer = + cldevice.create_buffer(CL_MEM_READ_WRITE, m[0] * sizeof(double)); + buffer alphaBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(double)); + buffer countBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); + buffer mBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); + buffer nBuffer = cldevice.create_buffer(CL_MEM_READ_ONLY, sizeof(int)); + + // Initialize a list of future events for asynchronous set_arg calls + std::vector> set_arg_futures; + std::vector> write_futures; + + buffer_data_type AData_serialized(A_data, (*count), + buffer_data_type::init_mode::reference); + + buffer_parameter_type AIndex_serialized( + A_indices, (*count), buffer_parameter_type::init_mode::reference); + + buffer_parameter_type APointer_serialized( + A_pointers, m[0], buffer_parameter_type::init_mode::reference); + + buffer_data_type B_serialized(B, n[0], + buffer_data_type::init_mode::reference); + + buffer_data_type C_serialized(C, m[0], + buffer_data_type::init_mode::reference); + + buffer_data_type alpha_serialized(alpha, 1, + buffer_data_type::init_mode::reference); + + buffer_parameter_type count_serialized( + count, 1, buffer_parameter_type::init_mode::reference); + + buffer_parameter_type m_serialized( + m, 1, buffer_parameter_type::init_mode::reference); + + buffer_parameter_type n_serialized( + n, 1, buffer_parameter_type::init_mode::reference); + + // Write data to the buffers + write_futures.push_back(ADataBuffer.enqueue_write(0, AData_serialized)); + write_futures.push_back(AIndexBuffer.enqueue_write(0, AIndex_serialized)); + write_futures.push_back(APointerBuffer.enqueue_write(0, APointer_serialized)); + + write_futures.push_back(BBuffer.enqueue_write(0, B_serialized)); + write_futures.push_back(CBuffer.enqueue_write(0, C_serialized)); + write_futures.push_back(alphaBuffer.enqueue_write(0, alpha_serialized)); + write_futures.push_back(mBuffer.enqueue_write(0, m_serialized)); + write_futures.push_back(nBuffer.enqueue_write(0, n_serialized)); + write_futures.push_back(countBuffer.enqueue_write(0, count_serialized)); + + // wait for function calls to trigger + hpx::wait_all(write_futures); + + hpx::wait_all(program_future); + + // Creating the kernal + kernel smvp_kernel = prog.create_kernel("smvp"); + + // Set buffers as arguments + set_arg_futures.push_back(smvp_kernel.set_arg_async(0, ADataBuffer)); + set_arg_futures.push_back(smvp_kernel.set_arg_async(1, AIndexBuffer)); + set_arg_futures.push_back(smvp_kernel.set_arg_async(2, APointerBuffer)); + set_arg_futures.push_back(smvp_kernel.set_arg_async(3, BBuffer)); + set_arg_futures.push_back(smvp_kernel.set_arg_async(4, CBuffer)); + set_arg_futures.push_back(smvp_kernel.set_arg_async(5, mBuffer)); + set_arg_futures.push_back(smvp_kernel.set_arg_async(6, nBuffer)); + set_arg_futures.push_back(smvp_kernel.set_arg_async(7, countBuffer)); + set_arg_futures.push_back(smvp_kernel.set_arg_async(8, alphaBuffer)); + + // wait for function calls to trigger + hpx::wait_all(set_arg_futures); + + // Run the kernel + hpx::opencl::work_size<1> dim; + dim[0].offset = 0; + dim[0].size = (int)(std::pow(2, std::ceil(std::log(m[0]) / std::log(2)))); + dim[0].local_size = 32; + + hpx::future kernel_future = smvp_kernel.enqueue(dim); + + // Start reading the buffer ( With kernel_future as dependency. + // All hpxcl enqueue calls are nonblocking. ) + auto read_future = CBuffer.enqueue_read(0, C_serialized, kernel_future); + + // Wait for the data to arrive + auto data = read_future.get(); + + // Printing the end timing result + time += timer_stop(); + std::cout << time << std::endl; + + return 0; } \ No newline at end of file diff --git a/cmake/EmbedResources.cpp b/cmake/EmbedResources.cpp index 1542d096..03a32ee0 100644 --- a/cmake/EmbedResources.cpp +++ b/cmake/EmbedResources.cpp @@ -8,107 +8,101 @@ #include std::vector split(const std::string& s, const std::string& delim) { - std::vector result; - if (delim.empty()) { - result.push_back(s); - return result; + std::vector result; + if (delim.empty()) { + result.push_back(s); + return result; + } + std::string::const_iterator substart = s.begin(); + std::string::const_iterator subend; + while (true) { + subend = std::search(substart, s.end(), delim.begin(), delim.end()); + std::string temp(substart, subend); + if (!temp.empty()) { + result.push_back(temp); } - std::string::const_iterator substart = s.begin(); - std::string::const_iterator subend; - while (true) { - subend = std::search(substart, s.end(), delim.begin(), delim.end()); - std::string temp(substart, subend); - if (!temp.empty()) { - result.push_back(temp); - } - if (subend == s.end()) { - break; - } - substart = subend + delim.size(); + if (subend == s.end()) { + break; } - return result; + substart = subend + delim.size(); + } + return result; } -static bool is_not_alnum(char c) -{ - return !std::isalnum(c, std::locale("C")); -} +static bool is_not_alnum(char c) { return !std::isalnum(c, std::locale("C")); } -int main(int argc, const char** argv) -{ - // check for correct command line arguments - if(argc < 3 || argc > 4){ - std::cout << "Usage: " << argv[0] << " infile outfile [namespace]" << std::endl; - return 2; - } +int main(int argc, const char** argv) { + // check for correct command line arguments + if (argc < 3 || argc > 4) { + std::cout << "Usage: " << argv[0] << " infile outfile [namespace]" + << std::endl; + return 2; + } - // parse command line - const char* iname = argv[1]; - const char* oname = argv[2]; - const char* ns = ((argc >= 4)?(argv[3]):""); + // parse command line + const char* iname = argv[1]; + const char* oname = argv[2]; + const char* ns = ((argc >= 4) ? (argv[3]) : ""); - // calculate the char array variable name - std::string varname(iname); - std::replace_if(varname.begin(), varname.end(), is_not_alnum, '_'); + // calculate the char array variable name + std::string varname(iname); + std::replace_if(varname.begin(), varname.end(), is_not_alnum, '_'); - // calculate the namespaces - std::vector namespaces = split(ns, "::"); -/* std::stringstream nss(ns); - std::vector namespaces; - std::string ns_part; - while(std::getline(nss, ns_part, ':')){ - namespaces.push_back(ns_part); - } -*/ - // open input file - std::ifstream ifile(iname, std::ios::in | std::ios::binary); - if(!ifile.is_open()){ - std::cerr << "Unable to open file '" << iname << "'!" << std::endl; - return 1; - } - - // open output file - std::ofstream ofile(oname, std::ios::out | std::ios::binary); - if(!ofile.is_open()){ - std::cerr << "Unable to write to file '" << oname << "'!" << std::endl; - return 1; - } + // calculate the namespaces + std::vector namespaces = split(ns, "::"); + /* std::stringstream nss(ns); + std::vector namespaces; + std::string ns_part; + while(std::getline(nss, ns_part, ':')){ + namespaces.push_back(ns_part); + } + */ + // open input file + std::ifstream ifile(iname, std::ios::in | std::ios::binary); + if (!ifile.is_open()) { + std::cerr << "Unable to open file '" << iname << "'!" << std::endl; + return 1; + } - // open namespace brackets - for(std::size_t i = 0; i < namespaces.size(); i++) - { - ofile << "namespace " << namespaces[i] << "{" << std::endl; - } + // open output file + std::ofstream ofile(oname, std::ios::out | std::ios::binary); + if (!ofile.is_open()) { + std::cerr << "Unable to write to file '" << oname << "'!" << std::endl; + return 1; + } - // write all bytes to the array - ofile << "extern const char " << varname << "[] = {"; - unsigned long numchars = 0; - unsigned char c = ifile.get(); - while(ifile.good()){ - if(numchars % 20 == 0){ - ofile << "\n "; - } - numchars++; + // open namespace brackets + for (std::size_t i = 0; i < namespaces.size(); i++) { + ofile << "namespace " << namespaces[i] << "{" << std::endl; + } - ofile << "'\\x" << std::hex << std::setw(2) << std::setfill('0') - << (int)c << "',"; - c = ifile.get(); + // write all bytes to the array + ofile << "extern const char " << varname << "[] = {"; + unsigned long numchars = 0; + unsigned char c = ifile.get(); + while (ifile.good()) { + if (numchars % 20 == 0) { + ofile << "\n "; } - ofile << "\n};" << std::endl; + numchars++; - // write the array length - ofile << "extern const unsigned long " << varname << "_len = " << std::dec << std::setw(0) - << std::setfill(' ') << numchars << ";" << std::endl; + ofile << "'\\x" << std::hex << std::setw(2) << std::setfill('0') << (int)c + << "',"; + c = ifile.get(); + } + ofile << "\n};" << std::endl; + // write the array length + ofile << "extern const unsigned long " << varname << "_len = " << std::dec + << std::setw(0) << std::setfill(' ') << numchars << ";" << std::endl; - // close namespace brackets - for(std::size_t i = 0; i < namespaces.size(); i++) - { - ofile << "}" << std::endl; - }; + // close namespace brackets + for (std::size_t i = 0; i < namespaces.size(); i++) { + ofile << "}" << std::endl; + }; - // close files, return success - ofile.close(); - ifile.close(); - return 0; + // close files, return success + ofile.close(); + ifile.close(); + return 0; } diff --git a/cuda/buffer.cpp b/cuda/buffer.cpp index 588e9df8..c7c622a9 100644 --- a/cuda/buffer.cpp +++ b/cuda/buffer.cpp @@ -8,38 +8,28 @@ #include "cuda/buffer.hpp" -typedef hpx::components::managed_component< - hpx::cuda::server::buffer> +typedef hpx::components::managed_component cuda_buffer_type; HPX_REGISTER_COMPONENT_MODULE(); HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(cuda_buffer_type, cuda_buffer); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::size_action, - cuda_buffer_size_action); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::set_size_action, - cuda_buffer_set_size_action); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::enqueue_read_action, - cuda_buffer_enqueue_read_action); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::enqueue_write_action, - cuda_buffer_enqueue_write_action); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::enqueue_write_local_action, - cuda_buffer_enqueue_write_local_action); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::enqueue_read_local_action, - cuda_buffer_enqueue_read_local_action); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::get_device_pointer_action, - cuda_buffer_get_device_pointer_action); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::get_device_id_action, - cuda_buffer_get_device_id_action); -HPX_REGISTER_ACTION( - cuda_buffer_type::wrapped_type::p2p_copy_action, - cuda_buffer_p2p_copy_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::size_action, + cuda_buffer_size_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::set_size_action, + cuda_buffer_set_size_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::enqueue_read_action, + cuda_buffer_enqueue_read_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::enqueue_write_action, + cuda_buffer_enqueue_write_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::enqueue_write_local_action, + cuda_buffer_enqueue_write_local_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::enqueue_read_local_action, + cuda_buffer_enqueue_read_local_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::get_device_pointer_action, + cuda_buffer_get_device_pointer_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::get_device_id_action, + cuda_buffer_get_device_id_action); +HPX_REGISTER_ACTION(cuda_buffer_type::wrapped_type::p2p_copy_action, + cuda_buffer_p2p_copy_action); diff --git a/cuda/buffer.hpp b/cuda/buffer.hpp index dc7225a1..35cebba0 100644 --- a/cuda/buffer.hpp +++ b/cuda/buffer.hpp @@ -18,215 +18,184 @@ namespace hpx { namespace cuda { /** -* \brief Device memory. -* -* Every buffer belongs to one \ref device. -*/ -class buffer: public hpx::components::client_base { - typedef hpx::components::client_base base_type; - -public: - buffer() { - } - - buffer(hpx::future && gid) : - base_type(std::move(gid)) { - - is_local = (hpx::get_colocation_id(hpx::launch::sync, get_id()) == hpx::find_here()); - } - - /** - * \brief Method returns the buffer's size - * \return The buffer size - */ - - hpx::lcos::future size() { - HPX_ASSERT(this->get_id()); - typedef server::buffer::size_action action_type; - return hpx::async(this->get_id()); - - } - - size_t size_sync() { - return size().get(); - - } - - /** - * \brief Method sets the buffer's size - * \param size The size of the buffer - * - * \note Use this methods carefully for extending a buffer. Adaptive buffer - * on GPU are mostly performing bad. - */ - - hpx::lcos::future set_size(size_t size) { - HPX_ASSERT(this->get_id()); - typedef server::buffer::set_size_action action_type; - return hpx::async(this->get_id(), size); - - } - - void set_size_sync(size_t size) { - set_size(size).get(); - - } - - /** - * \brief Method copy synchronized the data on the attached device to the - * host. - * \param offset Offset, where to start copying data - * \param size Size of the data on the device - * \return Pointer to the data on the host - */ - - template - T* enqueue_read_sync(size_t offset, size_t size) { - - if (is_local) { - - return reinterpret_cast(enqueue_read_local(offset, size).get()); - - } else { - - return (T*) enqueue_read(offset, size).get().data(); - - } - - } - - /** - * \brief Method to access the device pointer wrapped as a smart pointer - * \return The pointer to the device memory as a smart pointer - */ - hpx::lcos::future get_device_pointer(){ - - HPX_ASSERT(this->get_id()); - - typedef server::buffer::get_device_pointer_action action_type; - return hpx::async(this->get_id()); - } - - - /** - * \brief Method to access the device pointer wrapped as a smart pointer - * \return The pointer to the device memory as a smart pointer - */ - uintptr_t get_device_pointer_sync(){ - - return get_device_pointer().get(); + * \brief Device memory. + * + * Every buffer belongs to one \ref device. + */ +class buffer + : public hpx::components::client_base { + typedef hpx::components::client_base base_type; + + public: + buffer() {} + + buffer(hpx::future&& gid) : base_type(std::move(gid)) { + is_local = (hpx::agas::get_colocation_id(hpx::launch::sync, get_id()) == + hpx::find_here()); + } + + /** + * \brief Method returns the buffer's size + * \return The buffer size + */ + + hpx::lcos::future size() { + HPX_ASSERT(this->get_id()); + typedef server::buffer::size_action action_type; + return hpx::async(this->get_id()); + } + + size_t size_sync() { return size().get(); } + + /** + * \brief Method sets the buffer's size + * \param size The size of the buffer + * + * \note Use this methods carefully for extending a buffer. Adaptive buffer + * on GPU are mostly performing bad. + */ + + hpx::lcos::future set_size(size_t size) { + HPX_ASSERT(this->get_id()); + typedef server::buffer::set_size_action action_type; + return hpx::async(this->get_id(), size); + } + + void set_size_sync(size_t size) { set_size(size).get(); } + + /** + * \brief Method copy synchronized the data on the attached device to the + * host. + * \param offset Offset, where to start copying data + * \param size Size of the data on the device + * \return Pointer to the data on the host + */ + + template + T* enqueue_read_sync(size_t offset, size_t size) { + if (is_local) { + return reinterpret_cast(enqueue_read_local(offset, size).get()); + + } else { + return (T*)enqueue_read(offset, size).get().data(); } - - - /** - * \brief Method to access the device pointer - * \return The raw pointer to the device memory allocated in this buffer - */ - void* get_device_pointer_unwrapped_sync(){ - + } + + /** + * \brief Method to access the device pointer wrapped as a smart pointer + * \return The pointer to the device memory as a smart pointer + */ + hpx::lcos::future get_device_pointer() { + HPX_ASSERT(this->get_id()); + + typedef server::buffer::get_device_pointer_action action_type; + return hpx::async(this->get_id()); + } + + /** + * \brief Method to access the device pointer wrapped as a smart pointer + * \return The pointer to the device memory as a smart pointer + */ + uintptr_t get_device_pointer_sync() { return get_device_pointer().get(); } + + /** + * \brief Method to access the device pointer + * \return The raw pointer to the device memory allocated in this buffer + */ + void* get_device_pointer_unwrapped_sync() { return (void*)(get_device_pointer_sync()); - - } - - /** - * \brief Method to access the buffer's device ID - * \return The ID of the device where the buffer is allocated - */ - hpx::lcos::future get_device_id(){ - - HPX_ASSERT(this->get_id()); - - typedef server::buffer::get_device_id_action action_type; - return hpx::async(this->get_id()); + } + + /** + * \brief Method to access the buffer's device ID + * \return The ID of the device where the buffer is allocated + */ + hpx::lcos::future get_device_id() { + HPX_ASSERT(this->get_id()); + + typedef server::buffer::get_device_id_action action_type; + return hpx::async(this->get_id()); + } + + /** + * \brief Method to access the buffer's device ID + * \return The ID of the device where the buffer is allocated + */ + int get_device_id_sync() { return get_device_id().get(); } + + /** + * \brief Method copy the data on the attached device to the host + * \param offset Offset, where to start copying data + * \param size Size of the data on the device + * \return A future with the serialized data + * + * \note This method is for accessing data on remote localities. + */ + + hpx::future> enqueue_read( + size_t offset, size_t size) { + HPX_ASSERT(this->get_id()); + + typedef server::buffer::enqueue_read_action action_type; + return hpx::async(this->get_id(), offset, size); + } + + /** + * \brief Method copy the data on the attached device to the host + * \param offset Offset, where to start copying data + * \param size Size of the data on the device + * \return A future with the uintptr_t to the data + * + * \note This method is for accessing data on local localities. + */ + + hpx::future enqueue_read_local(size_t offset, size_t size) { + HPX_ASSERT(this->get_id()); + + typedef server::buffer::enqueue_read_local_action action_type; + return hpx::async(this->get_id(), offset, size); + } + + /** + * \brief Method copies the provided data on the attached device memory + * \param offset Offset, where to start copying data + * \param size Size of the data on the device + * \param data Pointer to the data, which is transfered to the device + */ + + hpx::future enqueue_write(size_t offset, size_t size, + const void* data) const { + HPX_ASSERT(this->get_id()); + + if (is_local) { + typedef server::buffer::enqueue_write_local_action action_type; + return hpx::async(this->get_id(), offset, size, + reinterpret_cast(data)); + + } else { + hpx::serialization::serialize_buffer serializable_data( + (char*)data, size, + hpx::serialization::serialize_buffer::init_mode::reference); + + typedef server::buffer::enqueue_write_action action_type; + return hpx::async(this->get_id(), offset, size, + serializable_data); } - - /** - * \brief Method to access the buffer's device ID - * \return The ID of the device where the buffer is allocated - */ - int get_device_id_sync(){ - return get_device_id().get(); - } - - /** - * \brief Method copy the data on the attached device to the host - * \param offset Offset, where to start copying data - * \param size Size of the data on the device - * \return A future with the serialized data - * - * \note This method is for accessing data on remote localities. - */ - - hpx::future> enqueue_read( - size_t offset, size_t size) { - HPX_ASSERT(this->get_id()); - - typedef server::buffer::enqueue_read_action action_type; - return hpx::async(this->get_id(), offset, size); - - } - - /** - * \brief Method copy the data on the attached device to the host - * \param offset Offset, where to start copying data - * \param size Size of the data on the device - * \return A future with the uintptr_t to the data - * - * \note This method is for accessing data on local localities. - */ - - hpx::future enqueue_read_local(size_t offset, size_t size) { - HPX_ASSERT(this->get_id()); - - typedef server::buffer::enqueue_read_local_action action_type; - return hpx::async(this->get_id(), offset, size); - - } - - /** - * \brief Method copies the provided data on the attached device memory - * \param offset Offset, where to start copying data - * \param size Size of the data on the device - * \param data Pointer to the data, which is transfered to the device - */ - - hpx::future enqueue_write(size_t offset, size_t size, - const void* data) const { - HPX_ASSERT(this->get_id()); - - if (is_local) { - - typedef server::buffer::enqueue_write_local_action action_type; - return hpx::async(this->get_id(), offset, size, - reinterpret_cast(data)); - - } else { - - hpx::serialization::serialize_buffer serializable_data( - (char*) data, size, - hpx::serialization::serialize_buffer::init_mode::reference); - - typedef server::buffer::enqueue_write_action action_type; - return hpx::async(this->get_id(), offset, size, - serializable_data); - } - - } - + } - hpx::lcos::future p2p_copy(uintptr_t dst, size_t dst_parent_device_id, size_t count){ - - HPX_ASSERT(this->get_id()); - - typedef server::buffer::p2p_copy_action action_type; - return hpx::async(this->get_id(), dst, dst_parent_device_id, count); - } + hpx::lcos::future p2p_copy(uintptr_t dst, size_t dst_parent_device_id, + size_t count) { + HPX_ASSERT(this->get_id()); -private: - hpx::naming::id_type device_gid; - bool is_local; + typedef server::buffer::p2p_copy_action action_type; + return hpx::async(this->get_id(), dst, dst_parent_device_id, + count); + } + private: + hpx::naming::id_type device_gid; + bool is_local; }; -} -} -#endif //BUFFER_1_HPP +} // namespace cuda +} // namespace hpx +#endif // BUFFER_1_HPP diff --git a/cuda/cuda_error_handling.cpp b/cuda/cuda_error_handling.cpp index 73488305..a34cd544 100644 --- a/cuda/cuda_error_handling.cpp +++ b/cuda/cuda_error_handling.cpp @@ -5,24 +5,19 @@ #include "cuda/cuda_error_handling.hpp" -namespace hpx { namespace cuda -{ +namespace hpx { +namespace cuda { void checkCudaError(char const* function) { - - cudaError_t err = cudaGetLastError(); - if (cudaSuccess != err) { - - std::stringstream errorMessage; - errorMessage << "CudaError: " << cudaGetErrorString(err) << " at " << function << std::endl; - - HPX_THROW_EXCEPTION(hpx::no_success, function, - errorMessage.str().c_str()); - - } - + cudaError_t err = cudaGetLastError(); + if (cudaSuccess != err) { + std::stringstream errorMessage; + errorMessage << "CudaError: " << cudaGetErrorString(err) << " at " + << function << std::endl; + + HPX_THROW_EXCEPTION(hpx::no_success, function, errorMessage.str().c_str()); + } } -}} - - +} // namespace cuda +} // namespace hpx diff --git a/cuda/cuda_error_handling.hpp b/cuda/cuda_error_handling.hpp index 9497c34e..acae44bf 100644 --- a/cuda/cuda_error_handling.hpp +++ b/cuda/cuda_error_handling.hpp @@ -15,8 +15,8 @@ #include #include -namespace hpx { namespace cuda -{ +namespace hpx { +namespace cuda { /** \brief Handles the error checking for CUDA functions calls * and kernel executions @@ -34,6 +34,7 @@ namespace hpx { namespace cuda */ HPX_CUDA_EXPORT void checkCudaError(char const* function_name = ""); -}} +} // namespace cuda +} // namespace hpx #endif /* CUDA_CUDA_CUDAERRORHANDLING_HPP_ */ diff --git a/cuda/device.cpp b/cuda/device.cpp index fcfa0c61..5174c2db 100644 --- a/cuda/device.cpp +++ b/cuda/device.cpp @@ -7,15 +7,13 @@ #include "cuda/device.hpp" -typedef hpx::components::managed_component< - hpx::cuda::server::device - > cuda_device_type; +typedef hpx::components::managed_component + cuda_device_type; HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(cuda_device_type, cuda_device); -HPX_REGISTER_ACTION( - cuda_device_type::wrapped_type::get_cuda_info_action, - cuda_device_get_cuda_info_action); +HPX_REGISTER_ACTION(cuda_device_type::wrapped_type::get_cuda_info_action, + cuda_device_get_cuda_info_action); HPX_REGISTER_ACTION( cuda_device_type::wrapped_type::get_extended_cuda_info_action, cuda_device_get_extended_cuda_info_action); @@ -25,21 +23,16 @@ HPX_REGISTER_ACTION( HPX_REGISTER_ACTION( cuda_device_type::wrapped_type::get_device_architecture_minor_action, cuda_device_get_device_architecture_minor_action); -HPX_REGISTER_ACTION( - cuda_device_type::wrapped_type::set_device_action, - cuda_device_set_device_action); -HPX_REGISTER_ACTION( - cuda_device_type::wrapped_type::get_all_devices_action, - cuda_device_get_all_devices_action); -HPX_REGISTER_ACTION( - cuda_device_type::wrapped_type::get_device_id_action, - cuda_device_get_device_id_aciton); -HPX_REGISTER_ACTION( - cuda_device_type::wrapped_type::get_context_action, - cuda_device_get_context_action); +HPX_REGISTER_ACTION(cuda_device_type::wrapped_type::set_device_action, + cuda_device_set_device_action); +HPX_REGISTER_ACTION(cuda_device_type::wrapped_type::get_all_devices_action, + cuda_device_get_all_devices_action); +HPX_REGISTER_ACTION(cuda_device_type::wrapped_type::get_device_id_action, + cuda_device_get_device_id_aciton); +HPX_REGISTER_ACTION(cuda_device_type::wrapped_type::get_context_action, + cuda_device_get_context_action); HPX_REGISTER_ACTION( cuda_device_type::wrapped_type::create_program_with_source_action, create_program_with_source_action); -HPX_REGISTER_ACTION( - cuda_device_type::wrapped_type::create_buffer_action, - create_buffer_action); +HPX_REGISTER_ACTION(cuda_device_type::wrapped_type::create_buffer_action, + create_buffer_action); diff --git a/cuda/device.hpp b/cuda/device.hpp index d45cadf1..df509de8 100644 --- a/cuda/device.hpp +++ b/cuda/device.hpp @@ -18,193 +18,184 @@ namespace hpx { namespace cuda { /** -* \brief An CUDA accelerator device. -*/ -class device: public hpx::components::client_base { - typedef hpx::components::client_base base_type; - -public: - device() { - } - - device(hpx::naming::id_type const& there, int dev) : - base_type(hpx::new_(there, dev)) { - } - - device(hpx::future && gid) : - base_type(std::move(gid)) { - } - - /** - * \brief Method prints the properties of this device - */ - void get_cuda_info() { - HPX_ASSERT(this->get_id()); - typedef server::device::get_cuda_info_action action_type; - hpx::apply(this->get_id()); - } - - /** - * \brief Method prints the extended properties of this device - * - * \note All information of the cudaDeviceproperties are shown. - */ - void get_extended_cuda_info() { - HPX_ASSERT(this->get_id()); - typedef server::device::get_extended_cuda_info_action action_type; - hpx::apply(this->get_id()); - } - - /** - * \brief methods return the major compute capability of this device - * \return Major compute capability of this device - */ - hpx::lcos::future get_device_architecture_major() { - HPX_ASSERT(this->get_id()); - typedef server::device::get_device_architecture_major_action action_type; - return hpx::async(this->get_id()); - } - - /** - * \brief methods return the minor compute capability of this device - * \return Minor compute capability of this device - */ - hpx::lcos::future get_device_architecture_minor() { - HPX_ASSERT(this->get_id()); - typedef server::device::get_device_architecture_minor_action action_type; - return hpx::async(this->get_id()); - } - - /** - * \brief Method returns all other devices on the locality of this device - * \return A list of suitable CUDA devices on target node - */ - - static std::vector get_all_devices( - std::vector localities) { - int num = 0; - std::vector devices; - typedef server::device::get_all_devices_action action_type; - for (size_t i = 0; i < localities.size(); i++) { - num += hpx::async(localities[i]).get(); - for (int i = 0; i < num; i++) { - devices.push_back(i); - } - } - return devices; - } - - void set_device(int dev) { - HPX_ASSERT(this->get_id()); - typedef server::device::set_device_action action_type; - hpx::async(this->get_id(), dev); - } - - hpx::lcos::future get_device_id() { - HPX_ASSERT(this->get_id()); - typedef server::device::get_device_id_action action_type; - return hpx::async(this->get_id()); - } - - int get_device_id_sync() { - HPX_ASSERT(this->get_id()); - return get_device_id().get(); - } - - hpx::lcos::future get_context() { - HPX_ASSERT(this->get_id()); - typedef server::device::get_context_action action_type; - return hpx::async(this->get_id()); - } - - int get_context_sync() { - - return get_context().get(); - } - - hpx::lcos::future wait() { - return server::device::wait(); - } - - /** - * \brief Creates synchronous a program with the set source code - */ - //hpx::cuda::program create_program_with_source_sync(std::string source) { - // return create_program_with_source(source).get(); - //} - - /** - * \brief Creates a program with the set source code - * - * This Method creates a program containing the source code. - * This program is attached to this device and all streams are - * executed there. - * - * \param source The source code of the CUDA kernel - * - * \see Program - */ - hpx::lcos::future create_program_with_source( - std::string source) { - HPX_ASSERT(this->get_id()); - typedef server::device::create_program_with_source_action action_type; - return hpx::async(this->get_id(), source); - } - - /** - * \brief Creates a program from the given source file - * - * This Method creates a program containing the source code. - * This program is attached to this device and all streams are - * executed there. - * - * \param source The path to the source file - * - * \see Program - */ - hpx::lcos::future create_program_with_file( - std::string file) { - HPX_ASSERT(this->get_id()); - - std::string source; - std::string tmp; - std::ifstream in(file); - - if (!in) { - std::string errorMessage = "File "; - errorMessage += file; - errorMessage += " not found!"; - HPX_THROW_EXCEPTION(hpx::no_success, "create_program_with_file", - errorMessage); - } - - while (in) { - getline(in, tmp); - source += tmp; - source += "\n"; - } - - typedef server::device::create_program_with_source_action action_type; - return hpx::async(this->get_id(), source); - } - - /** - * \brief Creates a buffer attached to this device - * - * The method creates a buffer with the size specified here and allocates - * this on this device. - * - * \param size The size of the allocated buffer - * - * \return The buffer with the allocated memoery on this device - */ - hpx::lcos::future create_buffer(size_t size) { - HPX_ASSERT(this->get_id()); - typedef server::device::create_buffer_action action_type; - return hpx::async(this->get_id(), size); - } - + * \brief An CUDA accelerator device. + */ +class device + : public hpx::components::client_base { + typedef hpx::components::client_base base_type; + + public: + device() {} + + device(hpx::naming::id_type const& there, int dev) + : base_type(hpx::new_(there, dev)) {} + + device(hpx::future&& gid) : base_type(std::move(gid)) {} + + /** + * \brief Method prints the properties of this device + */ + void get_cuda_info() { + HPX_ASSERT(this->get_id()); + typedef server::device::get_cuda_info_action action_type; + hpx::apply(this->get_id()); + } + + /** + * \brief Method prints the extended properties of this device + * + * \note All information of the cudaDeviceproperties are shown. + */ + void get_extended_cuda_info() { + HPX_ASSERT(this->get_id()); + typedef server::device::get_extended_cuda_info_action action_type; + hpx::apply(this->get_id()); + } + + /** + * \brief methods return the major compute capability of this device + * \return Major compute capability of this device + */ + hpx::lcos::future get_device_architecture_major() { + HPX_ASSERT(this->get_id()); + typedef server::device::get_device_architecture_major_action action_type; + return hpx::async(this->get_id()); + } + + /** + * \brief methods return the minor compute capability of this device + * \return Minor compute capability of this device + */ + hpx::lcos::future get_device_architecture_minor() { + HPX_ASSERT(this->get_id()); + typedef server::device::get_device_architecture_minor_action action_type; + return hpx::async(this->get_id()); + } + + /** + * \brief Method returns all other devices on the locality of this device + * \return A list of suitable CUDA devices on target node + */ + + static std::vector get_all_devices( + std::vector localities) { + int num = 0; + std::vector devices; + typedef server::device::get_all_devices_action action_type; + for (size_t i = 0; i < localities.size(); i++) { + num += hpx::async(localities[i]).get(); + for (int i = 0; i < num; i++) { + devices.push_back(i); + } + } + return devices; + } + + void set_device(int dev) { + HPX_ASSERT(this->get_id()); + typedef server::device::set_device_action action_type; + hpx::async(this->get_id(), dev); + } + + hpx::lcos::future get_device_id() { + HPX_ASSERT(this->get_id()); + typedef server::device::get_device_id_action action_type; + return hpx::async(this->get_id()); + } + + int get_device_id_sync() { + HPX_ASSERT(this->get_id()); + return get_device_id().get(); + } + + hpx::lcos::future get_context() { + HPX_ASSERT(this->get_id()); + typedef server::device::get_context_action action_type; + return hpx::async(this->get_id()); + } + + int get_context_sync() { return get_context().get(); } + + hpx::lcos::future wait() { return server::device::wait(); } + + /** + * \brief Creates synchronous a program with the set source code + */ + // hpx::cuda::program create_program_with_source_sync(std::string source) { + // return create_program_with_source(source).get(); + //} + + /** + * \brief Creates a program with the set source code + * + * This Method creates a program containing the source code. + * This program is attached to this device and all streams are + * executed there. + * + * \param source The source code of the CUDA kernel + * + * \see Program + */ + hpx::lcos::future create_program_with_source( + std::string source) { + HPX_ASSERT(this->get_id()); + typedef server::device::create_program_with_source_action action_type; + return hpx::async(this->get_id(), source); + } + + /** + * \brief Creates a program from the given source file + * + * This Method creates a program containing the source code. + * This program is attached to this device and all streams are + * executed there. + * + * \param source The path to the source file + * + * \see Program + */ + hpx::lcos::future create_program_with_file( + std::string file) { + HPX_ASSERT(this->get_id()); + + std::string source; + std::string tmp; + std::ifstream in(file); + + if (!in) { + std::string errorMessage = "File "; + errorMessage += file; + errorMessage += " not found!"; + HPX_THROW_EXCEPTION(hpx::no_success, "create_program_with_file", + errorMessage); + } + + while (in) { + getline(in, tmp); + source += tmp; + source += "\n"; + } + + typedef server::device::create_program_with_source_action action_type; + return hpx::async(this->get_id(), source); + } + + /** + * \brief Creates a buffer attached to this device + * + * The method creates a buffer with the size specified here and allocates + * this on this device. + * + * \param size The size of the allocated buffer + * + * \return The buffer with the allocated memoery on this device + */ + hpx::lcos::future create_buffer(size_t size) { + HPX_ASSERT(this->get_id()); + typedef server::device::create_buffer_action action_type; + return hpx::async(this->get_id(), size); + } }; -} -} -#endif //MANAGED_CUDA_COMPONENT_1_HPP +} // namespace cuda +} // namespace hpx +#endif // MANAGED_CUDA_COMPONENT_1_HPP diff --git a/cuda/export_definitions.hpp b/cuda/export_definitions.hpp index 30af3ac5..4861cc8b 100644 --- a/cuda/export_definitions.hpp +++ b/cuda/export_definitions.hpp @@ -10,9 +10,9 @@ #include #if defined(HPX_CUDA_MODULE_EXPORTS) -# define HPX_CUDA_EXPORT HPX_SYMBOL_EXPORT +#define HPX_CUDA_EXPORT HPX_SYMBOL_EXPORT #else -# define HPX_CUDA_EXPORT HPX_SYMBOL_IMPORT +#define HPX_CUDA_EXPORT HPX_SYMBOL_IMPORT #endif -#endif //HPX_CUDA_EXPORT_DEFINITIONS_HPP_ +#endif // HPX_CUDA_EXPORT_DEFINITIONS_HPP_ diff --git a/cuda/fwd_declarations.hpp b/cuda/fwd_declarations.hpp index e197c5da..02033a48 100644 --- a/cuda/fwd_declarations.hpp +++ b/cuda/fwd_declarations.hpp @@ -8,21 +8,18 @@ #include "export_definitions.hpp" -namespace hpx -{ - namespace cuda - { - class device; - class buffer; - class program; +namespace hpx { +namespace cuda { +class device; +class buffer; +class program; - namespace server - { - class HPX_CUDA_EXPORT device; - class HPX_CUDA_EXPORT buffer; - class HPX_CUDA_EXPORT program; - } - } -} +namespace server { +class HPX_CUDA_EXPORT device; +class HPX_CUDA_EXPORT buffer; +class HPX_CUDA_EXPORT program; +} // namespace server +} // namespace cuda +} // namespace hpx #endif diff --git a/cuda/get_devices.cpp b/cuda/get_devices.cpp index f9738fc4..2deddd39 100644 --- a/cuda/get_devices.cpp +++ b/cuda/get_devices.cpp @@ -10,117 +10,90 @@ #include -static hpx::future > get_devices_on_nodes( - std::vector && localities, int major, int minor) { - - // query all devices - std::vector < hpx::future>> - locality_device_futures; - for (auto &locality : localities) { - - // get all devices on locality - hpx::future > locality_device_future = - hpx::cuda::get_devices(locality, major, minor); - - // add locality device future to list of futures - locality_device_futures.push_back(std::move(locality_device_future)); - - } - - // combine futures - hpx::future - < std::vector > > - > combined_locality_device_future = hpx::when_all( - locality_device_futures); - - // create result future - hpx::future> result_future = - combined_locality_device_future.then( - hpx::util::bind( - - // define combining function inline - [] ( - hpx::future< std::vector< - hpx::future< std::vector< hpx::cuda::device > > - > > parent_future - ) -> std::vector< hpx::cuda::device > - { - - // initialize the result list - std::vector< hpx::cuda::device > devices; - - // get vector from parent future - std::vector< hpx::future< - std::vector< hpx::cuda::device > - > > locality_device_futures = parent_future.get(); - - // for each future, take devices out and join in one list - for(auto &locality_device_future : locality_device_futures) - { - - // wait for device query to finish - std::vector locality_devices = - locality_device_future.get(); - - // add all devices to device list - devices.insert(devices.end(), locality_devices.begin(), - locality_devices.end()); - - } - - return devices; - - }, - - hpx::util::placeholders::_1 - - )); - - // return the future to the device list - return result_future; - +static hpx::future> get_devices_on_nodes( + std::vector &&localities, int major, int minor) { + // query all devices + std::vector>> + locality_device_futures; + for (auto &locality : localities) { + // get all devices on locality + hpx::future> locality_device_future = + hpx::cuda::get_devices(locality, major, minor); + + // add locality device future to list of futures + locality_device_futures.push_back(std::move(locality_device_future)); + } + + // combine futures + hpx::future>>> + combined_locality_device_future = hpx::when_all(locality_device_futures); + + // create result future + hpx::future> result_future = + combined_locality_device_future.then(hpx::util::bind( + + // define combining function inline + [](hpx::future< + std::vector>>> + parent_future) -> std::vector { + // initialize the result list + std::vector devices; + + // get vector from parent future + std::vector>> + locality_device_futures = parent_future.get(); + + // for each future, take devices out and join in one list + for (auto &locality_device_future : locality_device_futures) { + // wait for device query to finish + std::vector locality_devices = + locality_device_future.get(); + + // add all devices to device list + devices.insert(devices.end(), locality_devices.begin(), + locality_devices.end()); + } + + return devices; + }, + + hpx::util::placeholders::_1 + + )); + + // return the future to the device list + return result_future; } hpx::future> hpx::cuda::get_devices( - hpx::naming::id_type node_id, int major, int minor) { - - typedef hpx::cuda::server::get_devices_action action; - return async < action > (node_id, major, minor); - + hpx::naming::id_type node_id, int major, int minor) { + typedef hpx::cuda::server::get_devices_action action; + return async(node_id, major, minor); } hpx::future> hpx::cuda::get_local_devices( - int major, int minor) { - - // get local locality id - hpx::naming::id_type locality = hpx::find_here(); - - // find devices on localities - return get_devices(locality, major, minor); + int major, int minor) { + // get local locality id + hpx::naming::id_type locality = hpx::find_here(); + // find devices on localities + return get_devices(locality, major, minor); } hpx::future> hpx::cuda::get_remote_devices( - int major, int minor) { - - // get remote HPX localities - std::vector < hpx::naming::id_type > localities = - hpx::find_remote_localities(); - - // find devices on localities - return get_devices_on_nodes(std::move(localities), major, minor); + int major, int minor) { + // get remote HPX localities + std::vector localities = hpx::find_remote_localities(); + // find devices on localities + return get_devices_on_nodes(std::move(localities), major, minor); } hpx::future> hpx::cuda::get_all_devices( - int major, int minor) { - - // get all HPX localities - std::vector < hpx::naming::id_type > localities = - hpx::find_all_localities(); - - // find devices on localities - return get_devices_on_nodes(std::move(localities), major, minor); + int major, int minor) { + // get all HPX localities + std::vector localities = hpx::find_all_localities(); + // find devices on localities + return get_devices_on_nodes(std::move(localities), major, minor); } - diff --git a/cuda/get_devices.hpp b/cuda/get_devices.hpp index 1326a88a..8ffc9b4f 100644 --- a/cuda/get_devices.hpp +++ b/cuda/get_devices.hpp @@ -28,8 +28,8 @@ namespace cuda { * * \return A list of suitable CUDA devices on target node */ -HPX_CUDA_EXPORT hpx::future > -get_devices(hpx::naming::id_type node_id, int major = 1, int minor = 0); +HPX_CUDA_EXPORT hpx::future> get_devices( + hpx::naming::id_type node_id, int major = 1, int minor = 0); /** * \brief Fetches a list of all accelerator devices present in the current @@ -40,8 +40,8 @@ get_devices(hpx::naming::id_type node_id, int major = 1, int minor = 0); * * \return A list of suitable CUDA devices */ -HPX_CUDA_EXPORT hpx::future> -get_all_devices(int major = 1, int minor = 0); +HPX_CUDA_EXPORT hpx::future> get_all_devices(int major = 1, + int minor = 0); /** * \brief Fetches a list of local accelerator devices present in the current @@ -52,8 +52,8 @@ get_all_devices(int major = 1, int minor = 0); * * \return A list of suitable CUDA devices */ -HPX_CUDA_EXPORT hpx::future> -get_local_devices(int major = 1, int minor = 0); +HPX_CUDA_EXPORT hpx::future> get_local_devices( + int major = 1, int minor = 0); /** * \brief Fetches a list of remote accelerator devices present in the current @@ -64,9 +64,9 @@ get_local_devices(int major = 1, int minor = 0); * * \return A list of suitable CUDA devices */ -HPX_CUDA_EXPORT hpx::future> -get_remote_devices(int major = 1, int minor = 0); -} -} +HPX_CUDA_EXPORT hpx::future> get_remote_devices( + int major = 1, int minor = 0); +} // namespace cuda +} // namespace hpx #endif diff --git a/cuda/program.cpp b/cuda/program.cpp index ad83184a..9823055e 100644 --- a/cuda/program.cpp +++ b/cuda/program.cpp @@ -7,22 +7,21 @@ #include "cuda/program.hpp" -typedef hpx::components::managed_component< - hpx::cuda::server::program> +typedef hpx::components::managed_component cuda_program_type; HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(cuda_program_type, cuda_program); HPX_REGISTER_ACTION(cuda_program_type::wrapped_type::build_action, - cuda_program_build_action); + cuda_program_build_action); HPX_REGISTER_ACTION(cuda_program_type::wrapped_type::set_source_action, - cuda_program_set_source_action); + cuda_program_set_source_action); HPX_REGISTER_ACTION(cuda_program_type::wrapped_type::run_action, - cuda_program_run_action); + cuda_program_run_action); #ifdef HPXCL_CUDA_WITH_STREAMS HPX_REGISTER_ACTION(cuda_program_type::wrapped_type::get_streams_size_action, - cuda_get_streams_size_action); + cuda_get_streams_size_action); HPX_REGISTER_ACTION(cuda_program_type::wrapped_type::create_stream_action, - cuda_create_stream_action); + cuda_create_stream_action); #endif diff --git a/cuda/program.hpp b/cuda/program.hpp index 6f535561..03855cd9 100644 --- a/cuda/program.hpp +++ b/cuda/program.hpp @@ -27,242 +27,237 @@ namespace cuda { * this program are pinned to this streams. * */ -class program: public hpx::components::client_base { - typedef hpx::components::client_base base_type; - -public: - - program() { - } - - program(hpx::future && gid) : - base_type(std::move(gid)) { - } - - /** \brief This method compiles the set source code - * - * This methods uses the nvrtc library to compile the CUDA source code. You - * should use this method only for compiling small CUDA kernels for testing - * or for only accelerating small parts of your existing code. - * - * \param compilerFlags A list with all compiler flags passed to - * the nvcc compiler - * \param modulename Name of the kernel, which to be compiled - * \param debug Compile with debug flags - * - * \note It is not possible to use included headers. - * \note Compiling a program in debug modus adds -G and - * -lineinfo to the nvcc compiler flags - * - */ - - hpx::lcos::future build(std::vector compilerFlags, - std::vector modulenames, unsigned int debug = 0) { - HPX_ASSERT(this->get_id()); - typedef server::program::build_action action_type; - return hpx::async(this->get_id(), compilerFlags, - modulenames, debug); - } - - /** - * \brief Synchronous compilation of the source code - */ - - void build_sync(std::vector compilerFlags, - std::string modulename, unsigned int debug = 0) { - std::vector modulenames; - modulenames.push_back(modulename); - build(compilerFlags, modulenames, debug).get(); - } - - hpx::lcos::future build(std::vector compilerFlags, - std::string modulename, unsigned int debug = 0) { - HPX_ASSERT(this->get_id()); - std::vector modulenames; - modulenames.push_back(modulename); - typedef server::program::build_action action_type; - return hpx::async(this->get_id(), compilerFlags, - modulenames, debug); - } - - /** - * \brief Synchronous compilation of the source code - */ - - void build_sync(std::vector compilerFlags, - std::vector modulenames, unsigned int debug = 0) { - build(compilerFlags, modulenames, debug).get(); - } - - /** - * \brief Synchronous setting source code - */ - void set_source_sync(std::string source) { - HPX_ASSERT(this->get_id()); - typedef server::program::set_source_action action_type; - hpx::async(this->get_id(), source).get(); - } +class program : public hpx::components::client_base { + typedef hpx::components::client_base base_type; + + public: + program() {} + + program(hpx::future&& gid) + : base_type(std::move(gid)) {} + + /** \brief This method compiles the set source code + * + * This methods uses the nvrtc library to compile the CUDA source code. You + * should use this method only for compiling small CUDA kernels for testing + * or for only accelerating small parts of your existing code. + * + * \param compilerFlags A list with all compiler flags passed to + * the nvcc compiler + * \param modulename Name of the kernel, which to be compiled + * \param debug Compile with debug flags + * + * \note It is not possible to use included headers. + * \note Compiling a program in debug modus adds -G and + * -lineinfo to the nvcc compiler flags + * + */ + + hpx::lcos::future build(std::vector compilerFlags, + std::vector modulenames, + unsigned int debug = 0) { + HPX_ASSERT(this->get_id()); + typedef server::program::build_action action_type; + return hpx::async(this->get_id(), compilerFlags, modulenames, + debug); + } + + /** + * \brief Synchronous compilation of the source code + */ + + void build_sync(std::vector compilerFlags, + std::string modulename, unsigned int debug = 0) { + std::vector modulenames; + modulenames.push_back(modulename); + build(compilerFlags, modulenames, debug).get(); + } + + hpx::lcos::future build(std::vector compilerFlags, + std::string modulename, + unsigned int debug = 0) { + HPX_ASSERT(this->get_id()); + std::vector modulenames; + modulenames.push_back(modulename); + typedef server::program::build_action action_type; + return hpx::async(this->get_id(), compilerFlags, modulenames, + debug); + } + + /** + * \brief Synchronous compilation of the source code + */ + + void build_sync(std::vector compilerFlags, + std::vector modulenames, + unsigned int debug = 0) { + build(compilerFlags, modulenames, debug).get(); + } + + /** + * \brief Synchronous setting source code + */ + void set_source_sync(std::string source) { + HPX_ASSERT(this->get_id()); + typedef server::program::set_source_action action_type; + hpx::async(this->get_id(), source).get(); + } #ifdef HPXCL_CUDA_WITH_STREAMS - /** - * \brief This method executes the kernel, compiled or set to this program - * - * \param modulename The name of the kernel - * \param args The function arguments passed to the kernel - * \param grid The dimensions of the grid size - * \param block The dimensions of the block size - * \param stream The stream at which the kernel is attached to - * - * \note Each program has a default stream, which is not the same as the - * default stream of the CUDA API. Not setting the last parameter - * implies that the kernel is executed on the default stream - * of this program. - */ - - hpx::lcos::future run(std::vector args, - std::string modulename, hpx::cuda::server::program::Dim3 grid, - hpx::cuda::server::program::Dim3 block, size_t shared_memory, int stream = -1) { - - HPX_ASSERT(this->get_id()); - - std::vector args_id; - - for (unsigned int i = 0; i < args.size(); i++) { - - args_id.push_back(args[i].get_id()); - } - - std::vector dependencies; - - typedef server::program::run_action action_type; - return hpx::async(this->get_id(), args_id, modulename, - grid, block, dependencies, shared_memory, stream); - - } - - /** - * \brief This method executes the kernel, compiled or set to this program - * - * \param modulename The name of the kernel - * \param args The function arguments passed to the kernel - * \param grid The dimensions of the grid size - * \param block The dimensions of the block size - * \param dependencies The data, the kernel execution depends on - * \param stream The stream at which the kernel is attached to - * - * \note Each program has a default stream, which is not the same - * as the default stream of the CUDA API. Not setting the last parameter - * implies that the kernel is executed on the default stream. - */ - - hpx::lcos::future run(std::vector args, - std::string modulename, hpx::cuda::server::program::Dim3 grid, - hpx::cuda::server::program::Dim3 block, - std::vector dependencies, size_t shared_memory, int stream = -1) { - - HPX_ASSERT(this->get_id()); - - std::vector args_id; - - for (unsigned int i = 0; i < args.size(); i++) { - - args_id.push_back(args[i].get_id()); - } - - std::vector dependencies_id; - - for (unsigned int i = 0; i < dependencies.size(); i++) { - - dependencies_id.push_back(dependencies[i].get_id()); - } - - typedef server::program::run_action action_type; - return hpx::async(this->get_id(), args_id, modulename, - grid, block, dependencies_id, shared_memory, stream); - - } - - hpx::lcos::future run(std::vector args, - std::string modulename, hpx::cuda::server::program::Dim3 grid, - hpx::cuda::server::program::Dim3 block, - hpx::cuda::buffer dependency, size_t shared_memory, int stream = -1) { - - HPX_ASSERT(this->get_id()); - - std::vector args_id; - - for (unsigned int i = 0; i < args.size(); i++) { - - args_id.push_back(args[i].get_id()); - } - - std::vector dependencies_id; - dependencies_id.push_back(dependency.get_id()); - - typedef server::program::run_action action_type; - return hpx::async(this->get_id(), args_id, modulename, - grid, block, dependencies_id, shared_memory, stream); - - } - - /** - * \brief This method returns the number of streams at this device - * - * \return Future containing the number of streams created on the device - * - */ - - hpx::lcos::future get_streams_size() { - HPX_ASSERT(this->get_id()); - typedef server::program::get_streams_size_action action_type; - return hpx::async(this->get_id()); - - } - - /** - * \brief This method returns the id of this created stream at the device - * - * \return Future containing the streams id - * - */ - - hpx::lcos::future create_stream() { - HPX_ASSERT(this->get_id()); - typedef server::program::create_stream_action action_type; - return hpx::async(this->get_id()); - } + /** + * \brief This method executes the kernel, compiled or set to this program + * + * \param modulename The name of the kernel + * \param args The function arguments passed to the kernel + * \param grid The dimensions of the grid size + * \param block The dimensions of the block size + * \param stream The stream at which the kernel is attached to + * + * \note Each program has a default stream, which is not the same as the + * default stream of the CUDA API. Not setting the last parameter + * implies that the kernel is executed on the default stream + * of this program. + */ + + hpx::lcos::future run(std::vector args, + std::string modulename, + hpx::cuda::server::program::Dim3 grid, + hpx::cuda::server::program::Dim3 block, + size_t shared_memory, int stream = -1) { + HPX_ASSERT(this->get_id()); + + std::vector args_id; + + for (unsigned int i = 0; i < args.size(); i++) { + args_id.push_back(args[i].get_id()); + } + + std::vector dependencies; + + typedef server::program::run_action action_type; + return hpx::async(this->get_id(), args_id, modulename, grid, + block, dependencies, shared_memory, stream); + } + + /** + * \brief This method executes the kernel, compiled or set to this program + * + * \param modulename The name of the kernel + * \param args The function arguments passed to the kernel + * \param grid The dimensions of the grid size + * \param block The dimensions of the block size + * \param dependencies The data, the kernel execution depends on + * \param stream The stream at which the kernel is attached to + * + * \note Each program has a default stream, which is not the same + * as the default stream of the CUDA API. Not setting the last parameter + * implies that the kernel is executed on the default stream. + */ + + hpx::lcos::future run(std::vector args, + std::string modulename, + hpx::cuda::server::program::Dim3 grid, + hpx::cuda::server::program::Dim3 block, + std::vector dependencies, + size_t shared_memory, int stream = -1) { + HPX_ASSERT(this->get_id()); + + std::vector args_id; + + for (unsigned int i = 0; i < args.size(); i++) { + args_id.push_back(args[i].get_id()); + } + + std::vector dependencies_id; + + for (unsigned int i = 0; i < dependencies.size(); i++) { + dependencies_id.push_back(dependencies[i].get_id()); + } + + typedef server::program::run_action action_type; + return hpx::async(this->get_id(), args_id, modulename, grid, + block, dependencies_id, shared_memory, + stream); + } + + hpx::lcos::future run(std::vector args, + std::string modulename, + hpx::cuda::server::program::Dim3 grid, + hpx::cuda::server::program::Dim3 block, + hpx::cuda::buffer dependency, + size_t shared_memory, int stream = -1) { + HPX_ASSERT(this->get_id()); + + std::vector args_id; + + for (unsigned int i = 0; i < args.size(); i++) { + args_id.push_back(args[i].get_id()); + } + + std::vector dependencies_id; + dependencies_id.push_back(dependency.get_id()); + + typedef server::program::run_action action_type; + return hpx::async(this->get_id(), args_id, modulename, grid, + block, dependencies_id, shared_memory, + stream); + } + + /** + * \brief This method returns the number of streams at this device + * + * \return Future containing the number of streams created on the device + * + */ + + hpx::lcos::future get_streams_size() { + HPX_ASSERT(this->get_id()); + typedef server::program::get_streams_size_action action_type; + return hpx::async(this->get_id()); + } + + /** + * \brief This method returns the id of this created stream at the device + * + * \return Future containing the streams id + * + */ + + hpx::lcos::future create_stream() { + HPX_ASSERT(this->get_id()); + typedef server::program::create_stream_action action_type; + return hpx::async(this->get_id()); + } #else - /** - * - *\brief This method executes the kernel set to the program - * - *\param args List of arguments to the kernel - *\param modulename Name of the kernel program to be executed - *\param grid Grid dimension of the kernel - *\param block Block dimension of the kernel - * - *\note The kernel is executed the null stream provided by CUDA API - */ - hpx::lcos::future run(std::vector args, - std::string modulename, hpx::cuda::server::program::Dim3 grid, - hpx::cuda::server::program::Dim3 block) { - - HPX_ASSERT(this->get_id()); - - std::vector args_id; - - for (unsigned int i = 0; i < args.size(); i++) { - - args_id.push_back(args[i].get_id()); - } - - typedef server::program::run_action action_type; - return hpx::async(this->get_id(), args_id, modulename, - grid, block); - } + /** + * + *\brief This method executes the kernel set to the program + * + *\param args List of arguments to the kernel + *\param modulename Name of the kernel program to be executed + *\param grid Grid dimension of the kernel + *\param block Block dimension of the kernel + * + *\note The kernel is executed the null stream provided by CUDA API + */ + hpx::lcos::future run(std::vector args, + std::string modulename, + hpx::cuda::server::program::Dim3 grid, + hpx::cuda::server::program::Dim3 block) { + HPX_ASSERT(this->get_id()); + + std::vector args_id; + + for (unsigned int i = 0; i < args.size(); i++) { + args_id.push_back(args[i].get_id()); + } + + typedef server::program::run_action action_type; + return hpx::async(this->get_id(), args_id, modulename, grid, + block); + } #endif - }; -} -} -#endif //PROGRAM_1_HPP +} // namespace cuda +} // namespace hpx +#endif // PROGRAM_1_HPP diff --git a/cuda/server/buffer.hpp b/cuda/server/buffer.hpp index 3fdc2b0c..c866a554 100644 --- a/cuda/server/buffer.hpp +++ b/cuda/server/buffer.hpp @@ -16,104 +16,92 @@ #include "cuda/fwd_declarations.hpp" #include "cuda/export_definitions.hpp" #include "cuda/cuda_error_handling.hpp" -namespace hpx -{ - namespace cuda - { - namespace server - { - ////////////////////////////////////////////////////////// - ///This class represents a buffer of cuda kernel arguments - - class HPX_CUDA_EXPORT buffer - : public hpx::components::locking_hook< - hpx::components::managed_component_base - > - { - - private: - size_t arg_buffer_size; - int parent_device_num; - void* data_device; - - //New stream if defined - #ifdef HPXCL_CUDA_WITH_STREAMS - cudaStream_t stream; - #endif - - public: - buffer(); - - buffer(size_t size, int parent_device_num); - - size_t size(); - - void set_size(size_t size); - - ~buffer(); - - hpx::serialization::serialize_buffer - enqueue_read(size_t offset, size_t size); - - uintptr_t enqueue_read_local(size_t offset, size_t size); - - void enqueue_write(size_t offset, size_t size, hpx::serialization::serialize_buffer data); - - void enqueue_write_local(size_t offset, size_t size, uintptr_t data); - - uintptr_t get_device_pointer(); - - void* get_raw_pointer(); - - int get_device_id(); - - void p2p_copy(uintptr_t dst, size_t dst_parent_device_id, size_t count); - - #ifdef HPXCL_CUDA_WITH_STREAMS - cudaStream_t get_stream(); - #endif - - HPX_DEFINE_COMPONENT_ACTION(buffer, size); - HPX_DEFINE_COMPONENT_ACTION(buffer, set_size); - HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_read); - HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_read_local); - HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_write); - HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_write_local); - HPX_DEFINE_COMPONENT_ACTION(buffer, get_device_pointer); - HPX_DEFINE_COMPONENT_ACTION(buffer, get_device_id); - HPX_DEFINE_COMPONENT_ACTION(buffer, p2p_copy); - }; - } - } -} - - HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::buffer::size_action, - buffer_size_action); - HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::buffer::set_size_action, - buffer_set_size_action); - HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::buffer::enqueue_read_action, - buffer_enqueue_read_action); - HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::buffer::enqueue_write_action, - buffer_enqueue_write_action); - HPX_REGISTER_ACTION_DECLARATION( +namespace hpx { +namespace cuda { +namespace server { +////////////////////////////////////////////////////////// +/// This class represents a buffer of cuda kernel arguments + +class HPX_CUDA_EXPORT buffer + : public hpx::components::locking_hook< + hpx::components::managed_component_base > { + private: + size_t arg_buffer_size; + int parent_device_num; + void* data_device; + +// New stream if defined +#ifdef HPXCL_CUDA_WITH_STREAMS + cudaStream_t stream; +#endif + + public: + buffer(); + + buffer(size_t size, int parent_device_num); + + size_t size(); + + void set_size(size_t size); + + ~buffer(); + + hpx::serialization::serialize_buffer enqueue_read(size_t offset, + size_t size); + + uintptr_t enqueue_read_local(size_t offset, size_t size); + + void enqueue_write(size_t offset, size_t size, + hpx::serialization::serialize_buffer data); + + void enqueue_write_local(size_t offset, size_t size, uintptr_t data); + + uintptr_t get_device_pointer(); + + void* get_raw_pointer(); + + int get_device_id(); + + void p2p_copy(uintptr_t dst, size_t dst_parent_device_id, size_t count); + +#ifdef HPXCL_CUDA_WITH_STREAMS + cudaStream_t get_stream(); +#endif + + HPX_DEFINE_COMPONENT_ACTION(buffer, size); + HPX_DEFINE_COMPONENT_ACTION(buffer, set_size); + HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_read); + HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_read_local); + HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_write); + HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_write_local); + HPX_DEFINE_COMPONENT_ACTION(buffer, get_device_pointer); + HPX_DEFINE_COMPONENT_ACTION(buffer, get_device_id); + HPX_DEFINE_COMPONENT_ACTION(buffer, p2p_copy); +}; +} // namespace server +} // namespace cuda +} // namespace hpx + +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::buffer::size_action, + buffer_size_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::buffer::set_size_action, + buffer_set_size_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::buffer::enqueue_read_action, + buffer_enqueue_read_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::buffer::enqueue_write_action, + buffer_enqueue_write_action); +HPX_REGISTER_ACTION_DECLARATION( hpx::cuda::server::buffer::enqueue_write_local_action, buffer_enqueue_write_local_action); - HPX_REGISTER_ACTION_DECLARATION( +HPX_REGISTER_ACTION_DECLARATION( hpx::cuda::server::buffer::enqueue_read_local_action, buffer_enqueue_read_local_action); - HPX_REGISTER_ACTION_DECLARATION( +HPX_REGISTER_ACTION_DECLARATION( hpx::cuda::server::buffer::get_device_pointer_action, buffer_get_device_pointer_action); - HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::buffer::get_device_id_action, - buffer_get_device_id_action); - HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::buffer::p2p_copy_action, - buffer_p2p_copy_action); - +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::buffer::get_device_id_action, + buffer_get_device_id_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::buffer::p2p_copy_action, + buffer_p2p_copy_action); - #endif //BUFFER_2_HPP +#endif // BUFFER_2_HPP diff --git a/cuda/server/buffer_server.cpp b/cuda/server/buffer_server.cpp index dc2e158a..995a7bf2 100644 --- a/cuda/server/buffer_server.cpp +++ b/cuda/server/buffer_server.cpp @@ -19,238 +19,214 @@ namespace server { /** * Default constructor */ -buffer::buffer() { -} +buffer::buffer() {} buffer::buffer(size_t size, int parent_device_num) { - this->parent_device_num = parent_device_num; - this->arg_buffer_size = size; + this->parent_device_num = parent_device_num; + this->arg_buffer_size = size; - //Set the CUDA device - cudaSetDevice(this->parent_device_num); - checkCudaError("buffer:enqueue_read Set device"); + // Set the CUDA device + cudaSetDevice(this->parent_device_num); + checkCudaError("buffer:enqueue_read Set device"); #ifdef HPXCL_CUDA_WITH_STREAMS - cudaStreamCreate(&stream); - checkCudaError("buffer:buffer Create buffer's stream"); + cudaStreamCreate(&stream); + checkCudaError("buffer:buffer Create buffer's stream"); #endif - cudaMalloc((void**) &data_device, size); - checkCudaError( - "buffer::buffer Error during allocation of the device pointer"); + cudaMalloc((void**)&data_device, size); + checkCudaError( + "buffer::buffer Error during allocation of the device pointer"); } /** * Default destructor */ buffer::~buffer() { + cudaSetDevice(this->parent_device_num); + checkCudaError("buffer::~buffer Error in setting device"); - cudaSetDevice(this->parent_device_num); - checkCudaError("buffer::~buffer Error in setting device"); - - //Synchronize the stream so that all operations are completed before stream is destroyed + // Synchronize the stream so that all operations are completed before stream + // is destroyed #ifdef HPXCL_CUDA_WITH_STREAMS - cudaStreamSynchronize(this->stream); - checkCudaError("buffer::~buffer Error during synchronization of stream"); + cudaStreamSynchronize(this->stream); + checkCudaError("buffer::~buffer Error during synchronization of stream"); #endif - cudaFree (data_device); - checkCudaError("buffer::~buffer Error during free of the device pointer"); + cudaFree(data_device); + checkCudaError("buffer::~buffer Error during free of the device pointer"); - //Destroy the buffer stream created + // Destroy the buffer stream created #ifdef HPXCL_CUDA_WITH_STREAMS - cudaStreamDestroy(stream); - checkCudaError("buffer::~buffer Error during destroying of the stream"); + cudaStreamDestroy(stream); + checkCudaError("buffer::~buffer Error during destroying of the stream"); #endif } /** * Returns the size of the buffer */ -size_t buffer::size() { - return this->arg_buffer_size; -} +size_t buffer::size() { return this->arg_buffer_size; } /** * Set the size of the buffer */ -void buffer::set_size(size_t size) { - this->arg_buffer_size = size; -} +void buffer::set_size(size_t size) { this->arg_buffer_size = size; } hpx::serialization::serialize_buffer buffer::enqueue_read(size_t offset, - size_t size) { - - size_t localSize = this->arg_buffer_size - offset * size; - void* data_host; - cudaSetDevice(this->parent_device_num); - checkCudaError("buffer:enqueue_read Set device"); - cudaMallocHost((void**) &data_host, localSize); - checkCudaError("buffer:enqueue_read allocate host memory"); - char * slicedPointer = (char*) (this->data_device) + offset; - -//Asynchronous copy from device to Host call + size_t size) { + size_t localSize = this->arg_buffer_size - offset * size; + void* data_host; + cudaSetDevice(this->parent_device_num); + checkCudaError("buffer:enqueue_read Set device"); + cudaMallocHost((void**)&data_host, localSize); + checkCudaError("buffer:enqueue_read allocate host memory"); + char* slicedPointer = (char*)(this->data_device) + offset; + +// Asynchronous copy from device to Host call #ifdef HPXCL_CUDA_WITH_STREAMS - cudaMemcpyAsync(data_host, (void*) slicedPointer, localSize, - cudaMemcpyDeviceToHost, this->stream); - checkCudaError( - "buffer::enque_read Error during copy data from the device to the host"); - cudaStreamSynchronize(this->stream); - checkCudaError("buffer::enque_read Error during synchronization of stream"); + cudaMemcpyAsync(data_host, (void*)slicedPointer, localSize, + cudaMemcpyDeviceToHost, this->stream); + checkCudaError( + "buffer::enque_read Error during copy data from the device to the host"); + cudaStreamSynchronize(this->stream); + checkCudaError("buffer::enque_read Error during synchronization of stream"); #else - cudaMemcpyAsync(data_host, (void*) slicedPointer, localSize, - cudaMemcpyDeviceToHost); - checkCudaError( - "buffer::enque_read Error during copy data from the device to the host"); + cudaMemcpyAsync(data_host, (void*)slicedPointer, localSize, + cudaMemcpyDeviceToHost); + checkCudaError( + "buffer::enque_read Error during copy data from the device to the host"); #endif - hpx::serialization::serialize_buffer serializable_data( - (char*) reinterpret_cast(data_host), size, - hpx::serialization::serialize_buffer::init_mode::reference); - - return serializable_data; + hpx::serialization::serialize_buffer serializable_data( + (char*)reinterpret_cast(data_host), size, + hpx::serialization::serialize_buffer::init_mode::reference); + return serializable_data; } void buffer::enqueue_write(size_t offset, size_t size, - hpx::serialization::serialize_buffer data) { + hpx::serialization::serialize_buffer data) { + cudaSetDevice(this->parent_device_num); + checkCudaError("buffer:enqueue_read Set device"); + char* slicedPointer = (char*)(data.data()) + offset; - cudaSetDevice(this->parent_device_num); - checkCudaError("buffer:enqueue_read Set device"); - char * slicedPointer = (char*) (data.data()) + offset; - -//Asynchronous copy from Host to device call -- Non-blocking on host +// Asynchronous copy from Host to device call -- Non-blocking on host #ifdef HPXCL_CUDA_WITH_STREAMS - cudaMemcpyAsync(this->data_device, (void*) slicedPointer, size, - cudaMemcpyHostToDevice, this->stream); - checkCudaError( - "buffer::enque_write Error during copy data from the host to the device"); - cudaStreamSynchronize(this->stream); - checkCudaError("buffer::enque_read Error during synchronization of stream"); + cudaMemcpyAsync(this->data_device, (void*)slicedPointer, size, + cudaMemcpyHostToDevice, this->stream); + checkCudaError( + "buffer::enque_write Error during copy data from the host to the device"); + cudaStreamSynchronize(this->stream); + checkCudaError("buffer::enque_read Error during synchronization of stream"); #else - cudaMemcpyAsync(this->data_device, (void*) slicedPointer, size, - cudaMemcpyHostToDevice); - checkCudaError( - "buffer::enque_write Error during copy data from the host to the device"); + cudaMemcpyAsync(this->data_device, (void*)slicedPointer, size, + cudaMemcpyHostToDevice); + checkCudaError( + "buffer::enque_write Error during copy data from the host to the device"); #endif - } /** * Get the device pointer */ uintptr_t buffer::get_device_pointer() { - return reinterpret_cast(data_device); -} - -void* buffer::get_raw_pointer() -{ - return &data_device; + return reinterpret_cast(data_device); } +void* buffer::get_raw_pointer() { return &data_device; } /** * Get the device id */ -int buffer::get_device_id(){ - return this->parent_device_num; -} +int buffer::get_device_id() { return this->parent_device_num; } void buffer::enqueue_write_local(size_t offset, size_t size, uintptr_t data) { + cudaSetDevice(this->parent_device_num); + checkCudaError("buffer:enqueue_read Set device"); + char* slicedPointer = reinterpret_cast(data) + offset; - cudaSetDevice(this->parent_device_num); - checkCudaError("buffer:enqueue_read Set device"); - char * slicedPointer = reinterpret_cast(data) + offset; - -//Asynchronous copy from Host to device call -- Non-blocking on host +// Asynchronous copy from Host to device call -- Non-blocking on host #ifdef HPXCL_CUDA_WITH_STREAMS - cudaMemcpyAsync(this->data_device, (void*) slicedPointer, size, - cudaMemcpyHostToDevice, this->stream); - checkCudaError( - "buffer::enque_write Error during copy data from the host to the device"); - cudaStreamSynchronize(this->stream); - checkCudaError("buffer::enque_read Error during synchronization of stream"); + cudaMemcpyAsync(this->data_device, (void*)slicedPointer, size, + cudaMemcpyHostToDevice, this->stream); + checkCudaError( + "buffer::enque_write Error during copy data from the host to the device"); + cudaStreamSynchronize(this->stream); + checkCudaError("buffer::enque_read Error during synchronization of stream"); #else - cudaMemcpyAsync(this->data_device, (void*) slicedPointer, size, - cudaMemcpyHostToDevice); - checkCudaError( - "buffer::enque_write Error during copy data from the host to the device"); + cudaMemcpyAsync(this->data_device, (void*)slicedPointer, size, + cudaMemcpyHostToDevice); + checkCudaError( + "buffer::enque_write Error during copy data from the host to the device"); #endif } uintptr_t buffer::enqueue_read_local(size_t offset, size_t size) { - - size_t localSize = this->arg_buffer_size - offset * size; - void* data_host; - cudaSetDevice(this->parent_device_num); - checkCudaError("buffer:enqueue_read Set device"); - cudaMallocHost((void**) &data_host, localSize); - checkCudaError("buffer:enqueue_read allocate host memory"); - char * slicedPointer = (char*) (this->data_device) + offset; - -//Asynchronous copy from device to host + size_t localSize = this->arg_buffer_size - offset * size; + void* data_host; + cudaSetDevice(this->parent_device_num); + checkCudaError("buffer:enqueue_read Set device"); + cudaMallocHost((void**)&data_host, localSize); + checkCudaError("buffer:enqueue_read allocate host memory"); + char* slicedPointer = (char*)(this->data_device) + offset; + +// Asynchronous copy from device to host #ifdef HPXCL_CUDA_WITH_STREAMS - cudaMemcpyAsync(data_host, (void*) slicedPointer, localSize, - cudaMemcpyDeviceToHost, this->stream); - checkCudaError( - "buffer::enque_read Error during copy data from the device to the host"); - cudaStreamSynchronize(this->stream); - checkCudaError("buffer::enque_read Error during synchronization of stream"); + cudaMemcpyAsync(data_host, (void*)slicedPointer, localSize, + cudaMemcpyDeviceToHost, this->stream); + checkCudaError( + "buffer::enque_read Error during copy data from the device to the host"); + cudaStreamSynchronize(this->stream); + checkCudaError("buffer::enque_read Error during synchronization of stream"); #else - cudaMemcpyAsync(data_host, (void*) slicedPointer, localSize, - cudaMemcpyDeviceToHost); - checkCudaError( - "buffer::enque_read Error during copy data from the device to the host"); + cudaMemcpyAsync(data_host, (void*)slicedPointer, localSize, + cudaMemcpyDeviceToHost); + checkCudaError( + "buffer::enque_read Error during copy data from the device to the host"); #endif - return reinterpret_cast(data_host); - + return reinterpret_cast(data_host); } -void buffer::p2p_copy(uintptr_t dst, size_t dst_parent_device_id, size_t count){ - - int can_peer; - - cudaDeviceCanAccessPeer(&can_peer, this->parent_device_num, dst_parent_device_id); - checkCudaError( - "buffer::p2p Error checking peer access"); - - if ( can_peer == 0){ - - std::cerr << "Error: P2P copy between these devices is not possible!" << std::endl; - - } - else - { - cudaSetDevice(dst_parent_device_id); - checkCudaError("buffer::p2p_copy Set dest device"); - cudaDeviceEnablePeerAccess(this->parent_device_num, 0); - checkCudaError("buffer::p2p_copy Enable p2p on the parent device"); - cudaSetDevice(this->parent_device_num); - checkCudaError("buffer::p2p_copy Set source device"); - cudaDeviceEnablePeerAccess(dst_parent_device_id, 0); - checkCudaError("buffer::p2p_copy Enable p2p in the source device"); - - #ifdef HPXCL_CUDA_WITH_STREAMS - cudaMemcpyPeerAsync((void*)dst, dst_parent_device_id, this->data_device, this->parent_device_num, count, this->stream); - checkCudaError("buffer::p2p_copy Error during copy data between devices"); - cudaStreamSynchronize(this->stream); - checkCudaError("buffer::p2p_copy Error during synchronization of stream"); - - #else - cudaMemcpyPeerAsync((void*)dst, dst_parent_device_id, this->data_device, this->parent_device_num, count); - checkCudaError("buffer::p2p_copy Error during copy data between devices"); - #endif - - } -} +void buffer::p2p_copy(uintptr_t dst, size_t dst_parent_device_id, + size_t count) { + int can_peer; + + cudaDeviceCanAccessPeer(&can_peer, this->parent_device_num, + dst_parent_device_id); + checkCudaError("buffer::p2p Error checking peer access"); + if (can_peer == 0) { + std::cerr << "Error: P2P copy between these devices is not possible!" + << std::endl; + } else { + cudaSetDevice(dst_parent_device_id); + checkCudaError("buffer::p2p_copy Set dest device"); + cudaDeviceEnablePeerAccess(this->parent_device_num, 0); + checkCudaError("buffer::p2p_copy Enable p2p on the parent device"); + cudaSetDevice(this->parent_device_num); + checkCudaError("buffer::p2p_copy Set source device"); + cudaDeviceEnablePeerAccess(dst_parent_device_id, 0); + checkCudaError("buffer::p2p_copy Enable p2p in the source device"); #ifdef HPXCL_CUDA_WITH_STREAMS -cudaStream_t buffer::get_stream() { - return this->stream; -} -#endif + cudaMemcpyPeerAsync((void*)dst, dst_parent_device_id, this->data_device, + this->parent_device_num, count, this->stream); + checkCudaError("buffer::p2p_copy Error during copy data between devices"); + cudaStreamSynchronize(this->stream); + checkCudaError("buffer::p2p_copy Error during synchronization of stream"); +#else + cudaMemcpyPeerAsync((void*)dst, dst_parent_device_id, this->data_device, + this->parent_device_num, count); + checkCudaError("buffer::p2p_copy Error during copy data between devices"); +#endif + } } -} -} +#ifdef HPXCL_CUDA_WITH_STREAMS +cudaStream_t buffer::get_stream() { return this->stream; } +#endif + +} // namespace server +} // namespace cuda +} // namespace hpx diff --git a/cuda/server/device.hpp b/cuda/server/device.hpp index 0479b55e..982b4cce 100644 --- a/cuda/server/device.hpp +++ b/cuda/server/device.hpp @@ -26,146 +26,131 @@ namespace hpx { namespace cuda { namespace server { -struct device_ptr -{ - CUdeviceptr ptr; - size_t byte_count; +struct device_ptr { + CUdeviceptr ptr; + size_t byte_count; }; -template -struct host_ptr -{ - T *ptr; - size_t byte_count; +template +struct host_ptr { + T *ptr; + size_t byte_count; }; //// This class represents a cuda device ///////// -class HPX_CUDA_EXPORT device: public hpx::components::locking_hook< - hpx::components::managed_component_base > { +class HPX_CUDA_EXPORT device + : public hpx::components::locking_hook< + hpx::components::managed_component_base> { + public: + device(); -public: + device(int device_id); - device(); + ~device(); - device(int device_id); + void free(); - ~device(); + int get_device_count(); - void free(); + void set_device(int dev); - int get_device_count(); + void get_cuda_info(); - void set_device(int dev); + void get_extended_cuda_info(); - void get_cuda_info(); + int get_device_id(); - void get_extended_cuda_info(); + int get_context(); - int get_device_id(); + int get_device_architecture_major(); - int get_context(); + int get_device_architecture_minor(); - int get_device_architecture_major(); + int get_all_devices(); - int get_device_architecture_minor(); + static void do_wait(boost::shared_ptr> p); - int get_all_devices(); + static hpx::future wait(); - static void do_wait(boost::shared_ptr > p); + void create_device_ptr(size_t const byte_count); - static hpx::future wait(); + template + void create_host_ptr(T value, size_t const byte_count) { + host_ptr temp; + temp.ptr = (T *)malloc(byte_count); + (temp.ptr) = value; + temp.byte_count = byte_count; + host_ptrs.push_back(temp); + } - void create_device_ptr(size_t const byte_count); + hpx::cuda::program create_program_with_source(std::string source); - template - void create_host_ptr(T value, size_t const byte_count) { - host_ptr temp; - temp.ptr = (T*) malloc(byte_count); - (temp.ptr) = value; - temp.byte_count = byte_count; - host_ptrs.push_back(temp); - } + hpx::cuda::program create_program_with_file(std::string file); - hpx::cuda::program create_program_with_source(std::string source); + hpx::cuda::buffer create_buffer(size_t size); - hpx::cuda::program create_program_with_file(std::string file); + HPX_DEFINE_COMPONENT_ACTION(device, get_cuda_info); + HPX_DEFINE_COMPONENT_ACTION(device, get_extended_cuda_info); + HPX_DEFINE_COMPONENT_ACTION(device, get_device_architecture_major); + HPX_DEFINE_COMPONENT_ACTION(device, get_device_architecture_minor); + HPX_DEFINE_COMPONENT_ACTION(device, set_device); + HPX_DEFINE_COMPONENT_ACTION(device, get_all_devices); + HPX_DEFINE_COMPONENT_ACTION(device, get_device_id); + HPX_DEFINE_COMPONENT_ACTION(device, get_context); + HPX_DEFINE_COMPONENT_ACTION(device, wait); + HPX_DEFINE_COMPONENT_ACTION(device, create_program_with_source); + HPX_DEFINE_COMPONENT_ACTION(device, create_buffer); - hpx::cuda::buffer create_buffer(size_t size); + template + struct create_host_ptr_action + : hpx::actions::make_action, + create_host_ptr_action> {}; + private: + unsigned int device_id; + unsigned int context_id; + CUdevice cu_device; + CUcontext cu_context; + std::string device_name; + cudaDeviceProp props; + std::vector device_ptrs; + std::vector> host_ptrs; + int num_args; - HPX_DEFINE_COMPONENT_ACTION(device, get_cuda_info); - HPX_DEFINE_COMPONENT_ACTION(device, get_extended_cuda_info); - HPX_DEFINE_COMPONENT_ACTION(device, get_device_architecture_major); - HPX_DEFINE_COMPONENT_ACTION(device, get_device_architecture_minor); - HPX_DEFINE_COMPONENT_ACTION(device, set_device); - HPX_DEFINE_COMPONENT_ACTION(device, get_all_devices); - HPX_DEFINE_COMPONENT_ACTION(device, get_device_id); - HPX_DEFINE_COMPONENT_ACTION(device, get_context); - HPX_DEFINE_COMPONENT_ACTION(device, wait); - HPX_DEFINE_COMPONENT_ACTION(device, create_program_with_source); - HPX_DEFINE_COMPONENT_ACTION(device, create_buffer); - - - template - struct create_host_ptr_action - : hpx::actions::make_action< - void (device::*)(T), &device::template create_host_ptr, - create_host_ptr_action > - { - }; - -private: - unsigned int device_id; - unsigned int context_id; - CUdevice cu_device; - CUcontext cu_context; - std::string device_name; - cudaDeviceProp props; - std::vector device_ptrs; - std::vector> host_ptrs; - int num_args; - - void print2D(std::string name, int * array); - void print3D(std::string name, int * array); - + void print2D(std::string name, int *array); + void print3D(std::string name, int *array); }; -} -} -} +} // namespace server +} // namespace cuda +} // namespace hpx -HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::device::get_cuda_info_action, - device_get_cuda_info_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::device::get_cuda_info_action, + device_get_cuda_info_action); HPX_REGISTER_ACTION_DECLARATION( hpx::cuda::server::device::get_device_architecture_major_action, - get_device_architecture_major_action); + get_device_architecture_major_action); HPX_REGISTER_ACTION_DECLARATION( hpx::cuda::server::device::get_device_architecture_minor_action, - get_device_architecture_minor_action); + get_device_architecture_minor_action); HPX_REGISTER_ACTION_DECLARATION( hpx::cuda::server::device::get_extended_cuda_info_action, device_get_extended_cuda_info_action); -HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::device::set_device_action, - device_set_device_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::device::set_device_action, + device_set_device_action); HPX_REGISTER_ACTION_DECLARATION( hpx::cuda::server::device::get_all_devices_action, device_get_all_devices_action); -HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::device::get_device_id_action, - device_get_device_id_action); -HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::device::get_context_action, - device__get_context_action); -HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::device::wait_action, - device_wait_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::device::get_device_id_action, + device_get_device_id_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::device::get_context_action, + device__get_context_action); +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::device::wait_action, + device_wait_action); HPX_REGISTER_ACTION_DECLARATION( hpx::cuda::server::device::create_program_with_source_action, device_create_program_with_source_action); -HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::device::create_buffer_action, - device_create_buffer_action); - +HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::device::create_buffer_action, + device_create_buffer_action); -#endif //cuda_device_2_HPP +#endif // cuda_device_2_HPP diff --git a/cuda/server/device_server.cpp b/cuda/server/device_server.cpp index e6f973fd..e65647ae 100644 --- a/cuda/server/device_server.cpp +++ b/cuda/server/device_server.cpp @@ -4,7 +4,7 @@ // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt #include -#include +#include #include #include @@ -16,204 +16,183 @@ #include "cuda/server/device.hpp" - namespace hpx { namespace cuda { namespace server { device::device() { - cuInit(0); - cuDeviceGet(&cu_device, 0); - checkCudaError("device::device Init"); - //cuCtxCreate(&cu_context, 0, cu_device); - //checkCudaError("device::device Create context"); - device_name = props.name; + cuInit(0); + cuDeviceGet(&cu_device, 0); + checkCudaError("device::device Init"); + // cuCtxCreate(&cu_context, 0, cu_device); + // checkCudaError("device::device Create context"); + device_name = props.name; } device::device(int device_id) { - cuInit(0); - checkCudaError("device::device Init"); - cuDeviceGet(&cu_device, device_id); - checkCudaError("device::device Get device"); - //cuCtxCreate(&cu_context, 0, cu_device); - //checkCudaError("device::device Create context"); - this->set_device(device_id); - - cudaGetDeviceProperties(&props, device_id); - checkCudaError("device::device Get properties "); - - this->device_name = props.name; -} + cuInit(0); + checkCudaError("device::device Init"); + cuDeviceGet(&cu_device, device_id); + checkCudaError("device::device Get device"); + // cuCtxCreate(&cu_context, 0, cu_device); + // checkCudaError("device::device Create context"); + this->set_device(device_id); -device::~device() { + cudaGetDeviceProperties(&props, device_id); + checkCudaError("device::device Get properties "); + + this->device_name = props.name; } +device::~device() {} int device::get_device_count() { - int device_count = 0; - cuDeviceGetCount(&device_count); - checkCudaError("device::device::get_device_count Get device count"); - return device_count; + int device_count = 0; + cuDeviceGetCount(&device_count); + checkCudaError("device::device::get_device_count Get device count"); + return device_count; } void device::set_device(int dev) { - this->device_id = dev; - //cuCtxSetCurrent(cu_context); - checkCudaError("device::device::set_device Set context "); + this->device_id = dev; + // cuCtxSetCurrent(cu_context); + checkCudaError("device::device::set_device Set context "); } void device::get_cuda_info() { - const int kb = 1024; - const int mb = kb * kb; - - cudaDeviceProp props; - cudaError_t error = - cudaGetDeviceProperties(&props, this->device_id); - checkCudaError("device::device::get_cuda_info Get properties "); - if (error == cudaErrorInvalidDevice) { - std::cout << "Device does not exist" << std::endl; - } - - std::cout << props.name << std::endl; - std::cout << " Global memory: " << props.totalGlobalMem / mb << "mb" - << std::endl; - std::cout << " Shared memory: " << props.sharedMemPerBlock / kb << "kb" - << std::endl; - std::cout << " Constant memory: " << props.totalConstMem / kb << "kb" - << std::endl; - std::cout << " Block registers: " << props.regsPerBlock << std::endl - << std::endl; - std::cout << " Warp size: " << props.warpSize << std::endl; - std::cout << " Threads per block: " << props.maxThreadsPerBlock - << std::endl; - std::cout << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", " - << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" - << std::endl; - std::cout << " Max grid dimensions: [ " << props.maxGridSize[0] << ", " - << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" - << std::endl; - std::cout << " Multiprocessor Count: " << props.multiProcessorCount - << std::endl; - std::cout << std::endl; - std::cout << " Unified addressing: " << props.unifiedAddressing - << std::endl; - std::cout << " Concurrent kernels: " << props.concurrentKernels - << std::endl; - std::cout << " Diver Overlap: " << props.deviceOverlap << std::endl; - std::cout << " Memory Clock Rate: " << props.memoryClockRate << std::endl; - std::cout << " Memory Bus Width: " << props.memoryBusWidth << std::endl; - std::cout << " l2 Cache Size: " << props.l2CacheSize << std::endl; - std::cout << " Clock Rate: " << props.clockRate << std::endl; - std::cout << " Exec Time Out: " << props.kernelExecTimeoutEnabled - << std::endl << std::endl; - - std::cout << " Compute Capability: " << props.major << "." << props.minor - << std::endl; - std::cout << " Compute Modes: " << props.computeMode << std::endl - << std::endl; - - //} + const int kb = 1024; + const int mb = kb * kb; + + cudaDeviceProp props; + cudaError_t error = cudaGetDeviceProperties(&props, this->device_id); + checkCudaError("device::device::get_cuda_info Get properties "); + if (error == cudaErrorInvalidDevice) { + std::cout << "Device does not exist" << std::endl; + } + + std::cout << props.name << std::endl; + std::cout << " Global memory: " << props.totalGlobalMem / mb << "mb" + << std::endl; + std::cout << " Shared memory: " << props.sharedMemPerBlock / kb << "kb" + << std::endl; + std::cout << " Constant memory: " << props.totalConstMem / kb << "kb" + << std::endl; + std::cout << " Block registers: " << props.regsPerBlock << std::endl + << std::endl; + std::cout << " Warp size: " << props.warpSize << std::endl; + std::cout << " Threads per block: " << props.maxThreadsPerBlock + << std::endl; + std::cout << " Max block dimensions: [ " << props.maxThreadsDim[0] << ", " + << props.maxThreadsDim[1] << ", " << props.maxThreadsDim[2] << " ]" + << std::endl; + std::cout << " Max grid dimensions: [ " << props.maxGridSize[0] << ", " + << props.maxGridSize[1] << ", " << props.maxGridSize[2] << " ]" + << std::endl; + std::cout << " Multiprocessor Count: " << props.multiProcessorCount + << std::endl; + std::cout << std::endl; + std::cout << " Unified addressing: " << props.unifiedAddressing + << std::endl; + std::cout << " Concurrent kernels: " << props.concurrentKernels + << std::endl; + std::cout << " Diver Overlap: " << props.deviceOverlap << std::endl; + std::cout << " Memory Clock Rate: " << props.memoryClockRate << std::endl; + std::cout << " Memory Bus Width: " << props.memoryBusWidth << std::endl; + std::cout << " l2 Cache Size: " << props.l2CacheSize << std::endl; + std::cout << " Clock Rate: " << props.clockRate << std::endl; + std::cout << " Exec Time Out: " << props.kernelExecTimeoutEnabled + << std::endl + << std::endl; + + std::cout << " Compute Capability: " << props.major << "." << props.minor + << std::endl; + std::cout << " Compute Modes: " << props.computeMode << std::endl + << std::endl; + + //} } void device::get_extended_cuda_info() { + this->get_cuda_info(); - this->get_cuda_info(); - - std::cout << " Max Texture 1D: " << props.maxTexture1D << std::endl; - std::cout << " Max Texture 1D Linear: " << props.maxTexture1DLinear - << std::endl; - this->print2D("Max Texture 2D", props.maxTexture2D); - this->print3D("Max Texture 2D Linear", props.maxTexture2DLinear); - this->print2D("Max Texture 2D Gather", props.maxTexture2DGather); - this->print3D("Max Texture 3D", props.maxTexture3D); - std::cout << " Max Texture Cubemap: " << props.maxTextureCubemap - << std::endl; - this->print2D("Max Texture 1D Layered", props.maxTexture1DLayered); - this->print3D("Max Texture 2D Layered", props.maxTexture2DLayered); - this->print2D("Max Texture Cubemap Layered", - props.maxTextureCubemapLayered); - std::cout << " Max Surface 1D: " << props.maxSurface1D << std::endl; - this->print2D("Max Surface 2D", props.maxSurface2D); - this->print3D("Max Surface 3D", props.maxSurface3D); - this->print2D("Max Surface 1D Layered", props.maxSurface1DLayered); - this->print3D("Max Surface 2D layered", props.maxSurface2DLayered); - std::cout << " Max Surface Cubemap: " << props.maxSurfaceCubemap - << std::endl; - this->print2D("Max Surface Cubemap Layered", - props.maxSurfaceCubemapLayered); - std::cout << " Surface Alignment: " << props.surfaceAlignment - << std::endl; - + std::cout << " Max Texture 1D: " << props.maxTexture1D << std::endl; + std::cout << " Max Texture 1D Linear: " << props.maxTexture1DLinear + << std::endl; + this->print2D("Max Texture 2D", props.maxTexture2D); + this->print3D("Max Texture 2D Linear", props.maxTexture2DLinear); + this->print2D("Max Texture 2D Gather", props.maxTexture2DGather); + this->print3D("Max Texture 3D", props.maxTexture3D); + std::cout << " Max Texture Cubemap: " << props.maxTextureCubemap + << std::endl; + this->print2D("Max Texture 1D Layered", props.maxTexture1DLayered); + this->print3D("Max Texture 2D Layered", props.maxTexture2DLayered); + this->print2D("Max Texture Cubemap Layered", props.maxTextureCubemapLayered); + std::cout << " Max Surface 1D: " << props.maxSurface1D << std::endl; + this->print2D("Max Surface 2D", props.maxSurface2D); + this->print3D("Max Surface 3D", props.maxSurface3D); + this->print2D("Max Surface 1D Layered", props.maxSurface1DLayered); + this->print3D("Max Surface 2D layered", props.maxSurface2DLayered); + std::cout << " Max Surface Cubemap: " << props.maxSurfaceCubemap + << std::endl; + this->print2D("Max Surface Cubemap Layered", props.maxSurfaceCubemapLayered); + std::cout << " Surface Alignment: " << props.surfaceAlignment << std::endl; } -void device::print2D(std::string name, int * array) { - - std::cout << " " << name << ": [ " << array[0] << ", " << array[1] << " ]" - << std::endl; +void device::print2D(std::string name, int* array) { + std::cout << " " << name << ": [ " << array[0] << ", " << array[1] << " ]" + << std::endl; } -void device::print3D(std::string name, int * array) { - - std::cout << " " << name << ": [ " << array[0] << ", " << array[1] << ", " - << array[2] << " ]" << std::endl; +void device::print3D(std::string name, int* array) { + std::cout << " " << name << ": [ " << array[0] << ", " << array[1] << ", " + << array[2] << " ]" << std::endl; } -int device::get_device_id() { - return this->device_id; -} +int device::get_device_id() { return this->device_id; } -int device::get_context() { - return this->context_id; -} +int device::get_context() { return this->context_id; } int device::get_all_devices() { - int num_devices = get_device_count(); - return num_devices; + int num_devices = get_device_count(); + return num_devices; } void device::do_wait(boost::shared_ptr > p) { - p->set_value(0); + p->set_value(0); } hpx::lcos::future device::wait() { - boost::shared_ptr > p = boost::make_shared< - hpx::lcos::local::promise >(); + boost::shared_ptr > p = + boost::make_shared >(); - hpx::util::io_service_pool* pool = hpx::get_runtime().get_thread_pool( - "io_pool"); - pool->get_io_service().post(hpx::util::bind(&do_wait, p)); + hpx::util::io_service_pool* pool = + hpx::get_runtime().get_thread_pool("io_pool"); + pool->get_io_service().post(hpx::util::bind(&do_wait, p)); - return p->get_future(); + return p->get_future(); } - hpx::cuda::program device::create_program_with_source(std::string source) { - typedef hpx::cuda::server::program program_type; + typedef hpx::cuda::server::program program_type; - hpx::cuda::program cu_program( - hpx::components::new_ < program_type > (hpx::find_here(),this->device_id)); - cu_program.set_source_sync(source); - return cu_program; + hpx::cuda::program cu_program( + hpx::components::new_(hpx::find_here(), this->device_id)); + cu_program.set_source_sync(source); + return cu_program; } hpx::cuda::buffer device::create_buffer(size_t size) { - typedef hpx::cuda::server::buffer buffer_type; - - hpx::cuda::buffer cu_buffer( - hpx::components::new_ < buffer_type > (hpx::find_here(),size,this->device_id)); - - return cu_buffer; -} + typedef hpx::cuda::server::buffer buffer_type; -int device::get_device_architecture_major() { + hpx::cuda::buffer cu_buffer(hpx::components::new_( + hpx::find_here(), size, this->device_id)); - return this->props.major; + return cu_buffer; } -int device::get_device_architecture_minor() { +int device::get_device_architecture_major() { return this->props.major; } - return this->props.minor; -} -} -} -} +int device::get_device_architecture_minor() { return this->props.minor; } +} // namespace server +} // namespace cuda +} // namespace hpx diff --git a/cuda/server/get_devices.hpp b/cuda/server/get_devices.hpp index 52d41e52..a818bd6d 100644 --- a/cuda/server/get_devices.hpp +++ b/cuda/server/get_devices.hpp @@ -25,12 +25,13 @@ namespace server { // Global cuda functions // Returns the IDs of all devices on current host -HPX_CUDA_EXPORT std::vector get_devices(int major, int minor); +HPX_CUDA_EXPORT std::vector get_devices(int major, + int minor); HPX_DEFINE_PLAIN_ACTION(get_devices, get_devices_action); -} -} -} +} // namespace server +} // namespace cuda +} // namespace hpx #endif diff --git a/cuda/server/get_devices_server.cpp b/cuda/server/get_devices_server.cpp index ec5ad859..4132df23 100644 --- a/cuda/server/get_devices_server.cpp +++ b/cuda/server/get_devices_server.cpp @@ -17,33 +17,29 @@ namespace cuda { namespace server { -std::vector get_devices(int major, int minor) -{ - std::vector devices; +std::vector get_devices(int major, int minor) { + std::vector devices; - int count = 0; + int count = 0; - cudaGetDeviceCount(&count); - checkCudaError("get_devices"); - - for (int device_id = 0; device_id < count; ++device_id) - { - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, device_id); - checkCudaError("get_devices"); + cudaGetDeviceCount(&count); + checkCudaError("get_devices"); - if (prop.major >= major && prop.minor >= minor) - devices.push_back(hpx::cuda::device(find_here(), device_id)); - } + for (int device_id = 0; device_id < count; ++device_id) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, device_id); + checkCudaError("get_devices"); - return devices; -} + if (prop.major >= major && prop.minor >= minor) + devices.push_back(hpx::cuda::device(find_here(), device_id)); + } -} -} + return devices; } -HPX_PLAIN_ACTION( - hpx::cuda::server::get_devices, - hpx_cuda_server_get_devices_action); +} // namespace server +} // namespace cuda +} // namespace hpx +HPX_PLAIN_ACTION(hpx::cuda::server::get_devices, + hpx_cuda_server_get_devices_action); diff --git a/cuda/server/program.hpp b/cuda/server/program.hpp index 45649eae..05042b24 100644 --- a/cuda/server/program.hpp +++ b/cuda/server/program.hpp @@ -28,94 +28,96 @@ namespace hpx { namespace cuda { namespace server { -class HPX_CUDA_EXPORT program: public hpx::components::locking_hook< - hpx::components::managed_component_base > { -private: - - int parent_device_id; - std::string kernel_source; - std::string kernel_name; - nvrtcProgram prog; - std::map kernels; - - //Use no-default stream if defined +class HPX_CUDA_EXPORT program + : public hpx::components::locking_hook< + hpx::components::managed_component_base > { + private: + int parent_device_id; + std::string kernel_source; + std::string kernel_name; + nvrtcProgram prog; + std::map kernels; + + // Use no-default stream if defined #ifdef HPXCL_CUDA_WITH_STREAMS - std::vector streams; + std::vector streams; #endif - - CUmodule module; -public: - struct Dim3 { - unsigned int x, y, z; - template - void serialize(Archive &ar, unsigned int i) { - ar & x; - ar & y; - ar & z; - } - }; + CUmodule module; - program(); + public: + struct Dim3 { + unsigned int x, y, z; + template + void serialize(Archive &ar, unsigned int i) { + ar &x; + ar &y; + ar &z; + } + }; - program(int parent_device_id); + program(); - program(hpx::naming::id_type device_id, std::string code); + program(int parent_device_id); - program(hpx::naming::id_type device_id, - hpx::serialization::serialize_buffer binary); + program(hpx::naming::id_type device_id, std::string code); - ~program(); + program(hpx::naming::id_type device_id, + hpx::serialization::serialize_buffer binary); - void build(std::vector compilerFlags, - std::vector modulenames, unsigned int debug = 0); + ~program(); - void set_source(std::string source); + void build(std::vector compilerFlags, + std::vector modulenames, unsigned int debug = 0); + + void set_source(std::string source); #ifdef HPXCL_CUDA_WITH_STREAMS - void run(std::vector args, std::string modulename, - Dim3 grid, Dim3 block, std::vector dependencies, - size_t shared_memory, int stream = -1); + void run(std::vector args, std::string modulename, + Dim3 grid, Dim3 block, + std::vector dependencies, size_t shared_memory, + int stream = -1); - unsigned int get_streams_size(); - unsigned int create_stream(); + unsigned int get_streams_size(); + unsigned int create_stream(); #else - void run(std::vector args, std::string modulename, - Dim3 grid, Dim3 block,size_t shared_memory); + void run(std::vector args, std::string modulename, + Dim3 grid, Dim3 block, size_t shared_memory); #endif - HPX_DEFINE_COMPONENT_ACTION(program, build); - HPX_DEFINE_COMPONENT_ACTION(program, set_source); - HPX_DEFINE_COMPONENT_ACTION(program, run); + HPX_DEFINE_COMPONENT_ACTION(program, build); + HPX_DEFINE_COMPONENT_ACTION(program, set_source); + HPX_DEFINE_COMPONENT_ACTION(program, run); #ifdef HPXCL_CUDA_WITH_STREAMS - HPX_DEFINE_COMPONENT_ACTION(program, get_streams_size); - HPX_DEFINE_COMPONENT_ACTION(program, create_stream); + HPX_DEFINE_COMPONENT_ACTION(program, get_streams_size); + HPX_DEFINE_COMPONENT_ACTION(program, create_stream); #endif - }; -} -} -} +} // namespace server +} // namespace cuda +} // namespace hpx -HPX_DECLARE_ACTION(hpx::cuda::server::program::build_action, cuda_program_build_action); +HPX_DECLARE_ACTION(hpx::cuda::server::program::build_action, + cuda_program_build_action); HPX_ACTION_USES_MEDIUM_STACK(hpx::cuda::server::program::build_action); HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::program::build_action, - cuda_program_build_action); + cuda_program_build_action); HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::program::run_action, - cuda_program_run_action); + cuda_program_run_action); HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::program::set_source_action, - cuda_program_set_source_action); + cuda_program_set_source_action); #ifdef HPXCL_CUDA_WITH_STREAMS -HPX_REGISTER_ACTION_DECLARATION(hpx::cuda::server::program::get_streams_size_action, - cuda_get_streams_size_action); HPX_REGISTER_ACTION_DECLARATION( - hpx::cuda::server::program::create_stream_action, - cuda_create_stream_action); + hpx::cuda::server::program::get_streams_size_action, + cuda_get_streams_size_action); +HPX_REGISTER_ACTION_DECLARATION( + hpx::cuda::server::program::create_stream_action, + cuda_create_stream_action); #endif -#endif //PROGRAM_2_HPP +#endif // PROGRAM_2_HPP diff --git a/cuda/server/program_server.cpp b/cuda/server/program_server.cpp index 93b1bc63..43587eb3 100644 --- a/cuda/server/program_server.cpp +++ b/cuda/server/program_server.cpp @@ -21,286 +21,273 @@ namespace hpx { namespace cuda { namespace server { -program::program() { - -} +program::program() {} /** -* Default constructor -*/ + * Default constructor + */ program::program(int parent_device_id) { - this->parent_device_id = parent_device_id; - - //Sets the device on which program should be executed - cudaSetDevice(parent_device_id); - checkCudaError("program::program Error setting the device"); + this->parent_device_id = parent_device_id; + + // Sets the device on which program should be executed + cudaSetDevice(parent_device_id); + checkCudaError("program::program Error setting the device"); #ifdef HPXCL_CUDA_WITH_STREAMS - cudaStream_t stream; - cudaStreamCreate(&stream); - checkCudaError("program::program Error in creating default stream"); - this->streams.push_back(stream); + cudaStream_t stream; + cudaStreamCreate(&stream); + checkCudaError("program::program Error in creating default stream"); + this->streams.push_back(stream); #endif } -program::program(hpx::naming::id_type device_id, std::string code) { -} +program::program(hpx::naming::id_type device_id, std::string code) {} program::program(hpx::naming::id_type device_id, - hpx::serialization::serialize_buffer binary) { -} + hpx::serialization::serialize_buffer binary) {} /** -* Default destructor -*/ + * Default destructor + */ program::~program() { + cudaSetDevice(this->parent_device_id); + checkCudaError("program::~program set device"); - cudaSetDevice(this->parent_device_id); - checkCudaError("program::~program set device"); - - //Destroy the program in device - nvrtcDestroyProgram(&prog); - checkCudaError("program::~program Destroy Program"); + // Destroy the program in device + nvrtcDestroyProgram(&prog); + checkCudaError("program::~program Destroy Program"); #ifdef HPXCL_CUDA_WITH_STREAMS - for (auto stream : streams) { - cudaStreamDestroy(stream); - checkCudaError("program::~program Destroy stream"); - } + for (auto stream : streams) { + cudaStreamDestroy(stream); + checkCudaError("program::~program Destroy stream"); + } #endif - //Destroy the modules - cuModuleUnload(module); - checkCudaError("program::~program Destroy module"); + // Destroy the modules + cuModuleUnload(module); + checkCudaError("program::~program Destroy module"); } /** -* Set the kernels which are represented as string in source -*/ -void program::set_source(std::string source) { - this->kernel_source = source; -} + * Set the kernels which are represented as string in source + */ +void program::set_source(std::string source) { this->kernel_source = source; } /** -* Build the provided source kernel -*/ + * Build the provided source kernel + */ void program::build(std::vector compilerFlags, - std::vector modulenames, unsigned int debug) { - - // Set CUDA device to be used - cudaSetDevice(this->parent_device_id); - checkCudaError("program::build Set device"); - - // Convert the kernel provided into a .cu file to be used for building - boost::uuids::uuid uuid = boost::uuids::random_generator()(); - std::string filename = to_string(uuid); - filename.append(".cu"); - - //Provide flags associated with the debug mode - if (debug == 1) { - compilerFlags.push_back("-G"); - compilerFlags.push_back("-lineinfo"); - } - - //Create a CUDA program with the source - nvrtcCreateProgram(&prog, this->kernel_source.c_str(), filename.c_str(), 0, - NULL, NULL); - checkCudaError("program::build Create Program"); - - //convert compiler flags to string - std::vector opts(compilerFlags.size()); - unsigned int i = 0; - for (auto opt : compilerFlags) { - opts[i] = compilerFlags[i].c_str(); - i++; - } - - //Compile the program with flags - nvrtcResult compileResult = nvrtcCompileProgram(prog, (int)compilerFlags.size(), - opts.data()); - - //Log details in case of error - if (compileResult != NVRTC_SUCCESS) { - size_t logSize; - nvrtcGetProgramLogSize(prog, &logSize); - checkCudaError("program::build Create Log"); - char *log = new char[logSize]; - nvrtcGetProgramLog(prog, log); - checkCudaError("program::build Get Log"); - - std::cout << log << std::endl; - delete[] log; - exit(1); - } - - //Get size of NVIDIA PTX - size_t ptxSize; - nvrtcGetPTXSize(prog, &ptxSize); - checkCudaError("program::build Get ptx size"); - - char *ptx = new char[ptxSize]; - nvrtcGetPTX(prog, ptx); - checkCudaError("program::build Get ptx of Program"); - - //Load the module in ptx - cuModuleLoadDataEx(&module, ptx, 0, 0, 0); - checkCudaError("program::build Load Module"); - - //Build the kernel module from the vector - for (auto modulename : modulenames) { - CUfunction kernel; - cuModuleGetFunction(&kernel, module, modulename.c_str()); - checkCudaError("program::build Get Function"); - kernels.insert(std::pair(modulename, kernel)); - } - + std::vector modulenames, unsigned int debug) { + // Set CUDA device to be used + cudaSetDevice(this->parent_device_id); + checkCudaError("program::build Set device"); + + // Convert the kernel provided into a .cu file to be used for building + boost::uuids::uuid uuid = boost::uuids::random_generator()(); + std::string filename = to_string(uuid); + filename.append(".cu"); + + // Provide flags associated with the debug mode + if (debug == 1) { + compilerFlags.push_back("-G"); + compilerFlags.push_back("-lineinfo"); + } + + // Create a CUDA program with the source + nvrtcCreateProgram(&prog, this->kernel_source.c_str(), filename.c_str(), 0, + NULL, NULL); + checkCudaError("program::build Create Program"); + + // convert compiler flags to string + std::vector opts(compilerFlags.size()); + unsigned int i = 0; + for (auto opt : compilerFlags) { + opts[i] = compilerFlags[i].c_str(); + i++; + } + + // Compile the program with flags + nvrtcResult compileResult = + nvrtcCompileProgram(prog, (int)compilerFlags.size(), opts.data()); + + // Log details in case of error + if (compileResult != NVRTC_SUCCESS) { + size_t logSize; + nvrtcGetProgramLogSize(prog, &logSize); + checkCudaError("program::build Create Log"); + char *log = new char[logSize]; + nvrtcGetProgramLog(prog, log); + checkCudaError("program::build Get Log"); + + std::cout << log << std::endl; + delete[] log; + exit(1); + } + + // Get size of NVIDIA PTX + size_t ptxSize; + nvrtcGetPTXSize(prog, &ptxSize); + checkCudaError("program::build Get ptx size"); + + char *ptx = new char[ptxSize]; + nvrtcGetPTX(prog, ptx); + checkCudaError("program::build Get ptx of Program"); + + // Load the module in ptx + cuModuleLoadDataEx(&module, ptx, 0, 0, 0); + checkCudaError("program::build Load Module"); + + // Build the kernel module from the vector + for (auto modulename : modulenames) { + CUfunction kernel; + cuModuleGetFunction(&kernel, module, modulename.c_str()); + checkCudaError("program::build Get Function"); + kernels.insert(std::pair(modulename, kernel)); + } } #ifdef HPXCL_CUDA_WITH_STREAMS /** -* Run the provided source kernel -*/ + * Run the provided source kernel + */ void program::run(std::vector args, - std::string modulename, Dim3 grid, Dim3 block, - std::vector dependencies, size_t shared_memory, int stream) { - - std::vector args_pointer(args.size()); - - //void *args_pointer[args.size()]; - - unsigned int i = 0; - for (auto arg : args) { - auto buffer = hpx::get_ptr(arg).get(); - void* tmp = buffer->get_raw_pointer(); - args_pointer[i] = tmp; - i++; - } - - cudaSetDevice(this->parent_device_id); - checkCudaError("program::run Error setting the device"); - - //Run on the default stream - if (dependencies.size() == 0 and stream == -1) { - cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, - block.x, block.y, block.z, shared_memory, this->streams[0], args_pointer.data(), - 0); - checkCudaError("program::run Run kernel"); - cudaStreamSynchronize(this->streams[0]); - checkCudaError("program::run Error during synchronization of stream"); - } - //Run on the provided stream - else if (dependencies.size() == 0 and stream >= 0) { - cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, - block.x, block.y, block.z, shared_memory, this->streams[stream], // shared mem and stream - args_pointer.data(), 0); - checkCudaError("program::run Run kernel"); - cudaStreamSynchronize(this->streams[stream]); - checkCudaError("program::run Error during synchronization of stream"); - } - //Run on the one stream of the dependency - else if (dependencies.size() == 1 and stream == -1) { - auto buffer = - hpx::get_ptr(dependencies[0]).get(); - cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, - block.x, block.y, block.z, shared_memory, buffer->get_stream(), // shared mem and stream - args_pointer.data(), 0); - checkCudaError("program::run Run kernel"); - cudaStreamSynchronize(buffer->get_stream()); - checkCudaError("program::run Error during synchronization of stream"); - - } - //Run on the provided stream - else if (dependencies.size() > 1 and stream >= 0) { - for (auto dependency : dependencies) { - auto buffer = hpx::get_ptr( - dependencies[0]).get(); - cudaStreamSynchronize(buffer->get_stream()); - checkCudaError( - "program::run Error during synchronization of stream"); - } - - cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, - block.x, block.y, block.z, shared_memory, this->streams[stream], // shared mem and stream - args_pointer.data(), 0); - checkCudaError("program::run Run kernel"); - cudaStreamSynchronize(this->streams[stream]); - checkCudaError("program::run Error during synchronization of stream"); - } else { - for (auto dependency : dependencies) { - auto buffer = hpx::get_ptr( - dependencies[0]).get(); - cudaStreamSynchronize(buffer->get_stream()); - checkCudaError( - "program::run Error during synchronization of stream"); - } - - cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, - block.x, block.y, block.z, shared_memory, this->streams[0], // shared mem and stream - args_pointer.data(), 0); - checkCudaError("program::run Run kernel"); - cudaStreamSynchronize(this->streams[0]); - checkCudaError("program::run Error during synchronization of stream"); - } - + std::string modulename, Dim3 grid, Dim3 block, + std::vector dependencies, + size_t shared_memory, int stream) { + std::vector args_pointer(args.size()); + + // void *args_pointer[args.size()]; + + unsigned int i = 0; + for (auto arg : args) { + auto buffer = hpx::get_ptr(arg).get(); + void *tmp = buffer->get_raw_pointer(); + args_pointer[i] = tmp; + i++; + } + + cudaSetDevice(this->parent_device_id); + checkCudaError("program::run Error setting the device"); + + // Run on the default stream + if (dependencies.size() == 0 and stream == -1) { + cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, block.x, + block.y, block.z, shared_memory, this->streams[0], + args_pointer.data(), 0); + checkCudaError("program::run Run kernel"); + cudaStreamSynchronize(this->streams[0]); + checkCudaError("program::run Error during synchronization of stream"); + } + // Run on the provided stream + else if (dependencies.size() == 0 and stream >= 0) { + cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, block.x, + block.y, block.z, shared_memory, + this->streams[stream], // shared mem and stream + args_pointer.data(), 0); + checkCudaError("program::run Run kernel"); + cudaStreamSynchronize(this->streams[stream]); + checkCudaError("program::run Error during synchronization of stream"); + } + // Run on the one stream of the dependency + else if (dependencies.size() == 1 and stream == -1) { + auto buffer = + hpx::get_ptr(dependencies[0]).get(); + cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, block.x, + block.y, block.z, shared_memory, + buffer->get_stream(), // shared mem and stream + args_pointer.data(), 0); + checkCudaError("program::run Run kernel"); + cudaStreamSynchronize(buffer->get_stream()); + checkCudaError("program::run Error during synchronization of stream"); + + } + // Run on the provided stream + else if (dependencies.size() > 1 and stream >= 0) { + for (auto dependency : dependencies) { + auto buffer = + hpx::get_ptr(dependencies[0]).get(); + cudaStreamSynchronize(buffer->get_stream()); + checkCudaError("program::run Error during synchronization of stream"); + } + + cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, block.x, + block.y, block.z, shared_memory, + this->streams[stream], // shared mem and stream + args_pointer.data(), 0); + checkCudaError("program::run Run kernel"); + cudaStreamSynchronize(this->streams[stream]); + checkCudaError("program::run Error during synchronization of stream"); + } else { + for (auto dependency : dependencies) { + auto buffer = + hpx::get_ptr(dependencies[0]).get(); + cudaStreamSynchronize(buffer->get_stream()); + checkCudaError("program::run Error during synchronization of stream"); + } + + cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, block.x, + block.y, block.z, shared_memory, + this->streams[0], // shared mem and stream + args_pointer.data(), 0); + checkCudaError("program::run Run kernel"); + cudaStreamSynchronize(this->streams[0]); + checkCudaError("program::run Error during synchronization of stream"); + } } /** -* Get the size of streams vector -*/ -unsigned int program::get_streams_size() { - return (int)this->streams.size(); -} + * Get the size of streams vector + */ +unsigned int program::get_streams_size() { return (int)this->streams.size(); } /** -* Create a new stream -*/ + * Create a new stream + */ unsigned int program::create_stream() { - cudaSetDevice(parent_device_id); - checkCudaError("program::program Error setting the device"); - cudaStream_t stream; - cudaStreamCreate(&stream); - checkCudaError("program::program Error in creating a stream"); - this->streams.push_back(stream); - return (int)this->streams.size() - 1; + cudaSetDevice(parent_device_id); + checkCudaError("program::program Error setting the device"); + cudaStream_t stream; + cudaStreamCreate(&stream); + checkCudaError("program::program Error in creating a stream"); + this->streams.push_back(stream); + return (int)this->streams.size() - 1; } #else /** -* Run the compiled source kernel -*/ + * Run the compiled source kernel + */ void program::run(std::vector args, - std::string modulename, Dim3 grid, Dim3 block) { - - std::vector args_pointer(args.size()); - - //Retrieve the pointers of the data containing parameters for the kernel - unsigned int i = 0; - for (auto arg : args) { - auto buffer = hpx::get_ptr(arg).get(); - void* tmp = buffer->get_raw_pointer(); - args_pointer[i] = tmp; - i++; - } - - // Set CUDA device to be used - cudaSetDevice(this->parent_device_id); - checkCudaError("program::run Error setting the device"); - - //When 0 is passed as the attribute for CUStream, the default stream is used. - //This behavior is explained in http://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#axzz4lWmm2anH - cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, - block.x, block.y, block.z, 0, 0, args_pointer.data(), - 0); - checkCudaError("program::run launch kernel"); - cudaStreamSynchronize(0); - checkCudaError("program::run synchronize kernel"); - + std::string modulename, Dim3 grid, Dim3 block) { + std::vector args_pointer(args.size()); + + // Retrieve the pointers of the data containing parameters for the kernel + unsigned int i = 0; + for (auto arg : args) { + auto buffer = hpx::get_ptr(arg).get(); + void *tmp = buffer->get_raw_pointer(); + args_pointer[i] = tmp; + i++; + } + + // Set CUDA device to be used + cudaSetDevice(this->parent_device_id); + checkCudaError("program::run Error setting the device"); + + // When 0 is passed as the attribute for CUStream, the default stream is used. + // This behavior is explained in + // http://docs.nvidia.com/cuda/cuda-driver-api/stream-sync-behavior.html#axzz4lWmm2anH + cuLaunchKernel(this->kernels[modulename], grid.x, grid.y, grid.z, block.x, + block.y, block.z, 0, 0, args_pointer.data(), 0); + checkCudaError("program::run launch kernel"); + cudaStreamSynchronize(0); + checkCudaError("program::run synchronize kernel"); } #endif -} -} -} - +} // namespace server +} // namespace cuda +} // namespace hpx diff --git a/examples/cuda/build_kernel.cpp b/examples/cuda/build_kernel.cpp index f1430973..cadf5b47 100644 --- a/examples/cuda/build_kernel.cpp +++ b/examples/cuda/build_kernel.cpp @@ -15,133 +15,145 @@ using namespace hpx::cuda; static const char kernel_src[] = - " " - "extern \"C\" __global__ void sum(unsigned int* array, unsigned int* count, unsigned int* n){ \n" - " for (int i = blockDim.x * blockIdx.x + threadIdx.x; \n" - " i < n[0]; \n" - " i += gridDim.x * blockDim.x) \n" - " { \n" - " atomicAdd(&(count[0]), array[i]); \n" - " } \n" - "} \n"; + " " + " " + "extern \"C\" __global__ void sum(unsigned int* array, unsigned int* " + "count, unsigned int* n){ \n" + " for (int i = blockDim.x * blockIdx.x + threadIdx.x; " + " \n" + " i < n[0]; " + " " + " \n" + " i += gridDim.x * blockDim.x) " + " \n" + " { " + " " + " \n" + " atomicAdd(&(count[0]), array[i]); " + " \n" + " } " + " \n" + "} " + " \n"; // hpx_main, is the actual main called by hpx int main(int argc, char* argv[]) { + // Vector for all futures for the data management + std::vector> data_futures; - //Vector for all futures for the data management - std::vector> data_futures; + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } + // Generate Input data + unsigned int* inputData; + cudaMallocHost((void**)&inputData, sizeof(unsigned int) * SIZE); + checkCudaError("Malloc inputData"); - // Generate Input data - unsigned int* inputData; - cudaMallocHost((void**)&inputData, sizeof(unsigned int)*SIZE); - checkCudaError("Malloc inputData"); + // Create a device component from the first device found + device cudaDevice = devices[0]; - // Create a device component from the first device found - device cudaDevice = devices[0]; + for (unsigned int i = 0; i < SIZE; i++) inputData[i] = 1; - for (unsigned int i = 0; i < SIZE; i++) - inputData[i] = 1; + // Create a buffer + buffer outbuffer = + cudaDevice.create_buffer(SIZE * sizeof(unsigned int)).get(); - // Create a buffer - buffer outbuffer = cudaDevice.create_buffer(SIZE * sizeof(unsigned int)).get(); + // Copy input data to the buffer + data_futures.push_back( + outbuffer.enqueue_write(0, SIZE * sizeof(unsigned int), inputData)); - // Copy input data to the buffer - data_futures.push_back(outbuffer.enqueue_write(0, SIZE * sizeof(unsigned int), inputData)); + // Create the hello_world device program + program prog = cudaDevice.create_program_with_source(kernel_src).get(); - // Create the hello_world device program - program prog = cudaDevice.create_program_with_source(kernel_src).get(); + // Add compiler flags for compiling the kernel - // Add compiler flags for compiling the kernel + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); - std::vector flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(cudaDevice.get_device_architecture_major().get())); + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); - mode.append( - std::to_string(cudaDevice.get_device_architecture_minor().get())); + flags.push_back(mode); - flags.push_back(mode); - - // Compile the program + // Compile the program #ifdef DEBUG - data_futures.push_back(prog.build(flags, "sum", 1)); + data_futures.push_back(prog.build(flags, "sum", 1)); #else - data_futures.push_back(prog.build(flags , "sum")); + data_futures.push_back(prog.build(flags, "sum")); #endif - // Create the buffer for the result - unsigned int* result; - cudaMallocHost((void**)&result,sizeof(unsigned int)); - checkCudaError("Malloc result"); - result[0] = 0; - buffer resbuffer = cudaDevice.create_buffer(sizeof(unsigned int)).get(); - data_futures.push_back(resbuffer.enqueue_write(0,sizeof(unsigned int), result)); - - //Create the buffer for the length of the array - unsigned int* n; - cudaMallocHost((void**)&n,sizeof(unsigned int)); - checkCudaError("Malloc size n"); - result[0] = SIZE; - buffer lengthbuffer = cudaDevice.create_buffer(sizeof(unsigned int)).get(); - data_futures.push_back(lengthbuffer.enqueue_write(0,sizeof(unsigned int), n)); - - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; - - //Set the values for the grid dimension - grid.x = 1; - grid.y = 1; - grid.z = 1; - - //Set the values for the block dimension - block.x = 32; - block.y = 1; - block.z = 1; - - //Set the parameter for the kernel, have to be the same order as in the definition - std::vectorargs; - args.push_back(outbuffer); - args.push_back(resbuffer); - args.push_back(lengthbuffer); - - hpx::wait_all(data_futures); - - //Run the kernel at the default stream - auto kernel_future = prog.run(args,"sum",grid,block,0); - - hpx::wait_all(kernel_future); - - //Copy the result back - unsigned int* res = resbuffer.enqueue_read_sync(0,sizeof(unsigned int)); - - std::cout << "Result is " << res[0] << " and is "; - - //Check if result is correct - - if (res[0] != SIZE) - hpx::cout << "wrong" << hpx::endl; - else - hpx::cout << "correct" << hpx::endl; - - cudaFreeHost(n); - checkCudaError("Free n"); - cudaFreeHost(inputData); - checkCudaError("Free inputData"); - cudaFreeHost(result); - checkCudaError("Free result"); - - return EXIT_SUCCESS; + // Create the buffer for the result + unsigned int* result; + cudaMallocHost((void**)&result, sizeof(unsigned int)); + checkCudaError("Malloc result"); + result[0] = 0; + buffer resbuffer = cudaDevice.create_buffer(sizeof(unsigned int)).get(); + data_futures.push_back( + resbuffer.enqueue_write(0, sizeof(unsigned int), result)); + + // Create the buffer for the length of the array + unsigned int* n; + cudaMallocHost((void**)&n, sizeof(unsigned int)); + checkCudaError("Malloc size n"); + result[0] = SIZE; + buffer lengthbuffer = cudaDevice.create_buffer(sizeof(unsigned int)).get(); + data_futures.push_back( + lengthbuffer.enqueue_write(0, sizeof(unsigned int), n)); + + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + + // Set the values for the grid dimension + grid.x = 1; + grid.y = 1; + grid.z = 1; + + // Set the values for the block dimension + block.x = 32; + block.y = 1; + block.z = 1; + + // Set the parameter for the kernel, have to be the same order as in the + // definition + std::vector args; + args.push_back(outbuffer); + args.push_back(resbuffer); + args.push_back(lengthbuffer); + + hpx::wait_all(data_futures); + + // Run the kernel at the default stream + auto kernel_future = prog.run(args, "sum", grid, block, 0); + + hpx::wait_all(kernel_future); + + // Copy the result back + unsigned int* res = + resbuffer.enqueue_read_sync(0, sizeof(unsigned int)); + + std::cout << "Result is " << res[0] << " and is "; + + // Check if result is correct + + if (res[0] != SIZE) + hpx::cout << "wrong" << hpx::endl; + else + hpx::cout << "correct" << hpx::endl; + + cudaFreeHost(n); + checkCudaError("Free n"); + cudaFreeHost(inputData); + checkCudaError("Free inputData"); + cudaFreeHost(result); + checkCudaError("Free result"); + + return EXIT_SUCCESS; } - diff --git a/examples/cuda/build_kernel_from_file.cpp b/examples/cuda/build_kernel_from_file.cpp index f9a979b2..6fd156eb 100644 --- a/examples/cuda/build_kernel_from_file.cpp +++ b/examples/cuda/build_kernel_from_file.cpp @@ -15,116 +15,117 @@ using namespace hpx::cuda; // hpx_main, is the actual main called by hpx int main(int argc, char* argv[]) { + // Vector for all futures for the data management + std::vector> data_futures; - //Vector for all futures for the data management - std::vector> data_futures; + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } + // Generate Input data + unsigned int* inputData; + cudaMallocHost((void**)&inputData, sizeof(unsigned int) * SIZE); + checkCudaError("Malloc inputData"); - // Generate Input data - unsigned int* inputData; - cudaMallocHost((void**)&inputData, sizeof(unsigned int)*SIZE); - checkCudaError("Malloc inputData"); + // Create a device component from the first device found + device cudaDevice = devices[0]; - // Create a device component from the first device found - device cudaDevice = devices[0]; + for (unsigned int i = 0; i < SIZE; i++) inputData[i] = 1; - for (unsigned int i = 0; i < SIZE; i++) - inputData[i] = 1; + // Create a buffer + buffer outbuffer = + cudaDevice.create_buffer(SIZE * sizeof(unsigned int)).get(); - // Create a buffer - buffer outbuffer = cudaDevice.create_buffer(SIZE * sizeof(unsigned int)).get(); + // Copy input data to the buffer + data_futures.push_back( + outbuffer.enqueue_write(0, SIZE * sizeof(unsigned int), inputData)); - // Copy input data to the buffer - data_futures.push_back(outbuffer.enqueue_write(0, SIZE * sizeof(unsigned int), inputData)); + // Create the hello_world device program + program prog = cudaDevice.create_program_with_file("example_kernel.cu").get(); - // Create the hello_world device program - program prog = cudaDevice.create_program_with_file("example_kernel.cu").get(); + // Add compiler flags for compiling the kernel - // Add compiler flags for compiling the kernel + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); - std::vector flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(cudaDevice.get_device_architecture_major().get())); + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); - mode.append( - std::to_string(cudaDevice.get_device_architecture_minor().get())); + flags.push_back(mode); - flags.push_back(mode); + // Compile the program + prog.build_sync(flags, "sum2"); - // Compile the program - prog.build_sync(flags,"sum2"); + // Create the buffer for the result + unsigned int* result; + cudaMallocHost((void**)&result, sizeof(unsigned int)); + checkCudaError("Malloc result"); + result[0] = 0; + buffer resbuffer = cudaDevice.create_buffer(sizeof(unsigned int)).get(); + data_futures.push_back( + resbuffer.enqueue_write(0, sizeof(unsigned int), result)); - // Create the buffer for the result - unsigned int* result; - cudaMallocHost((void**)&result,sizeof(unsigned int)); - checkCudaError("Malloc result"); - result[0] = 0; - buffer resbuffer = cudaDevice.create_buffer(sizeof(unsigned int)).get(); - data_futures.push_back(resbuffer.enqueue_write(0,sizeof(unsigned int), result)); + // Create the buffer for the length of the array + unsigned int* n; + cudaMallocHost((void**)&n, sizeof(unsigned int)); + checkCudaError("Malloc size n"); + result[0] = SIZE; + buffer lengthbuffer = cudaDevice.create_buffer(sizeof(unsigned int)).get(); + data_futures.push_back( + lengthbuffer.enqueue_write(0, sizeof(unsigned int), n)); - //Create the buffer for the length of the array - unsigned int* n; - cudaMallocHost((void**)&n,sizeof(unsigned int)); - checkCudaError("Malloc size n"); - result[0] = SIZE; - buffer lengthbuffer = cudaDevice.create_buffer(sizeof(unsigned int)).get(); - data_futures.push_back(lengthbuffer.enqueue_write(0,sizeof(unsigned int), n)); + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; + // Set the values for the grid dimension + grid.x = 1; + grid.y = 1; + grid.z = 1; - //Set the values for the grid dimension - grid.x = 1; - grid.y = 1; - grid.z = 1; + // Set the values for the block dimension + block.x = 32; + block.y = 1; + block.z = 1; - //Set the values for the block dimension - block.x = 32; - block.y = 1; - block.z = 1; + // Set the parameter for the kernel, have to be the same order as in the + // definition + std::vector args; + args.push_back(outbuffer); + args.push_back(resbuffer); + args.push_back(lengthbuffer); - //Set the parameter for the kernel, have to be the same order as in the definition - std::vectorargs; - args.push_back(outbuffer); - args.push_back(resbuffer); - args.push_back(lengthbuffer); + hpx::wait_all(data_futures); - hpx::wait_all(data_futures); + // Run the kernel at the default stream + auto kernel_future = prog.run(args, "sum2", grid, block, 0); - //Run the kernel at the default stream - auto kernel_future = prog.run(args,"sum2",grid,block,0); + hpx::wait_all(kernel_future); - hpx::wait_all(kernel_future); + // Copy the result back + unsigned int* res = + resbuffer.enqueue_read_sync(0, sizeof(unsigned int)); - //Copy the result back - unsigned int* res = resbuffer.enqueue_read_sync(0,sizeof(unsigned int)); + hpx::cout << "Result is " << res[0] << " and is "; - hpx::cout << "Result is " << res[0] << " and is "; + // Check if result is correct - //Check if result is correct + if (res[0] != SIZE) + hpx::cout << "wrong" << hpx::endl; + else + hpx::cout << "correct" << hpx::endl; - if (res[0] != SIZE) - hpx::cout << "wrong" << hpx::endl; - else - hpx::cout << "correct" << hpx::endl; + cudaFreeHost(n); + checkCudaError("Free n"); + cudaFreeHost(inputData); + checkCudaError("Free inputData"); + cudaFreeHost(result); + checkCudaError("Free result"); - cudaFreeHost(n); - checkCudaError("Free n"); - cudaFreeHost(inputData); - checkCudaError("Free inputData"); - cudaFreeHost(result); - checkCudaError("Free result"); - - return EXIT_SUCCESS; + return EXIT_SUCCESS; } - diff --git a/examples/cuda/cuda_list_devices.cpp b/examples/cuda/cuda_list_devices.cpp index 0205f00d..e5a48cc6 100644 --- a/examples/cuda/cuda_list_devices.cpp +++ b/examples/cuda/cuda_list_devices.cpp @@ -12,30 +12,27 @@ // hpx_main, is the actual main called by hpx int main(int argc, char* argv[]) { - { - - //Get list of available CUDA Devices. - std::vector devices = hpx::cuda::get_all_devices(1, - 0).get(); - - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } - - hpx::cout << hpx::endl << "Devices:" << hpx::endl << hpx::endl; - - // print a lot of information about every device - for (auto &device : devices) { - - device.get_cuda_info(); - // add newline before starting a new device - hpx::cout << hpx::endl; - - } - } - -// End the program - return hpx::finalize(); + { + // Get list of available CUDA Devices. + std::vector devices = + hpx::cuda::get_all_devices(1, 0).get(); + + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } + + hpx::cout << hpx::endl << "Devices:" << hpx::endl << hpx::endl; + + // print a lot of information about every device + for (auto& device : devices) { + device.get_cuda_info(); + // add newline before starting a new device + hpx::cout << hpx::endl; + } + } + + // End the program + return hpx::finalize(); } diff --git a/examples/cuda/cuda_list_extended_devices.cpp b/examples/cuda/cuda_list_extended_devices.cpp index 3834f799..bb82927d 100644 --- a/examples/cuda/cuda_list_extended_devices.cpp +++ b/examples/cuda/cuda_list_extended_devices.cpp @@ -12,30 +12,27 @@ // hpx_main, is the actual main called by hpx int main(int argc, char* argv[]) { - { - - //Get list of available CUDA Devices. - std::vector devices = hpx::cuda::get_all_devices(1, - 0).get(); - - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } - - hpx::cout << hpx::endl << "Devices:" << hpx::endl << hpx::endl; - - // print a lot of information about every device - for (auto &device : devices) { - - device.get_extended_cuda_info(); - // add newline before starting a new device - hpx::cout << hpx::endl; - - } - } - -// End the program - return hpx::finalize(); + { + // Get list of available CUDA Devices. + std::vector devices = + hpx::cuda::get_all_devices(1, 0).get(); + + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } + + hpx::cout << hpx::endl << "Devices:" << hpx::endl << hpx::endl; + + // print a lot of information about every device + for (auto& device : devices) { + device.get_extended_cuda_info(); + // add newline before starting a new device + hpx::cout << hpx::endl; + } + } + + // End the program + return hpx::finalize(); } diff --git a/examples/cuda/get_device_id.cpp b/examples/cuda/get_device_id.cpp index f341e9e6..29bdc002 100644 --- a/examples/cuda/get_device_id.cpp +++ b/examples/cuda/get_device_id.cpp @@ -12,38 +12,35 @@ using namespace hpx::cuda; int main(int argc, char* argv[]) { + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); - - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } - - // Create a device component from the first device found - device cudaDevice_0 = devices[0]; - - // Create a buffer - buffer test_buffer_0 = cudaDevice_0.create_buffer(sizeof(int)).get(); - - // Get buffer parent id - int device_id_0 = test_buffer_0.get_device_id().get(); - std::cout << device_id_0 << std::endl; - - // Comment the following section in case there's only one device - // Create a device component from the second device found - device cudaDevice_1 = devices[1]; - - // Create a buffer - buffer test_buffer_1 = cudaDevice_1.create_buffer(sizeof(int)).get(); - - // Get buffer parent id - int device_id_1 = test_buffer_1.get_device_id().get(); - std::cout << device_id_1 << std::endl; - - return EXIT_SUCCESS; -} + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } + + // Create a device component from the first device found + device cudaDevice_0 = devices[0]; + + // Create a buffer + buffer test_buffer_0 = cudaDevice_0.create_buffer(sizeof(int)).get(); + + // Get buffer parent id + int device_id_0 = test_buffer_0.get_device_id().get(); + std::cout << device_id_0 << std::endl; + // Comment the following section in case there's only one device + // Create a device component from the second device found + device cudaDevice_1 = devices[1]; + // Create a buffer + buffer test_buffer_1 = cudaDevice_1.create_buffer(sizeof(int)).get(); + + // Get buffer parent id + int device_id_1 = test_buffer_1.get_device_id().get(); + std::cout << device_id_1 << std::endl; + + return EXIT_SUCCESS; +} diff --git a/examples/cuda/mandelbrot/mandelbrot.cpp b/examples/cuda/mandelbrot/mandelbrot.cpp index 8f733cfd..4cf25c3e 100644 --- a/examples/cuda/mandelbrot/mandelbrot.cpp +++ b/examples/cuda/mandelbrot/mandelbrot.cpp @@ -17,202 +17,188 @@ using std::atoi; using namespace hpx::cuda; //########################################################################### -//Main +// Main //########################################################################### int main(int argc, char* argv[]) { - - //Reading a list of available devices in hpx locality - std::vector devices = get_all_devices(2, 0).get(); - - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found! Terminating...." << hpx::endl; - return hpx::finalize(); - } - - if (argc != 5) { - std::cout << "Usage: " << argv[0] << " width height" << std::endl; - exit(1); - } - - int width = atoi(argv[1]); - int height = atoi(argv[2]); - int iterations = atoi(argv[3]); - int numDevices = atoi(argv[4]); - - //Vector for all futures for the data management - std::vector < std::vector>>data_futures(numDevices); - - char* image; - char* mainImage; - - std::vector> writeImages; - - for (size_t it = 0; it < iterations; it++) { - - timer_start(); - int currentWidth = width * std::pow(2,it + 1) /2; - int currentHeight = height * std::pow(2,it + 1) /2; - const int bytes = sizeof(char) * currentWidth * currentHeight * 3; - int n = currentWidth * currentHeight * 3; - - std::cout << bytes << ","; - //Malloc Host - cudaMallocHost((void**) &image, bytes); - checkCudaError("Malloc image"); - char* mainImage = (char*) malloc(bytes); - - std::vector args; - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; - - block.x = 16; - block.y = 16; - block.z = 1; - - grid.x = currentWidth / block.x; - grid.y = 1 + std::ceil(currentHeight / (block.y * numDevices)); - grid.z = 1; - - std::vector < hpx::future < program >> progBuildVector; - std::vector progVector; - - //creating vector of futures - std::vector> kernelFutures; - - for (int j = 0; j < numDevices; j++) { - progBuildVector.push_back( - devices[j].create_program_with_file( - "mandel_brot_kernel.cu")); - - } - - hpx::wait_all (progBuildVector); - - std::vector> progCompileVector; - - for (int j = 0; j < numDevices; j++) { - progVector.push_back(progBuildVector[j].get()); - - //Compile with the kernel - std::vector < std::string > flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string( - devices[j].get_device_architecture_major().get())); - mode.append( - std::to_string( - devices[j].get_device_architecture_minor().get())); - - flags.push_back(mode); - - progCompileVector.push_back(progVector[j].build(flags, "kernel")); - - } - - std::vector < std::vector>>bufferFutures( - numDevices); - //Buffer vectors - std::vector imageBufferVector; - std::vector widthBufferVector; - std::vector heightBufferVector; - std::vector yStartBufferVector; - std::vector nBufferVector; - - int yStart[numDevices]; - //creating buffers - for (int j = 0; j < numDevices; j++) { - //Image buffer - bufferFutures[j].push_back(devices[j].create_buffer(bytes)); - //Width buffer - bufferFutures[j].push_back(devices[j].create_buffer(sizeof(int))); - //Height buffer - bufferFutures[j].push_back(devices[j].create_buffer(sizeof(int))); - // yStart buffer - bufferFutures[j].push_back(devices[j].create_buffer(sizeof(int))); - // n buffer - bufferFutures[j].push_back(devices[j].create_buffer(sizeof(int))); - - hpx::wait_all (bufferFutures[j]); - - yStart[j] = j * currentHeight / numDevices; - - imageBufferVector.push_back(bufferFutures[j][0].get()); - data_futures[j].push_back( - imageBufferVector[j].enqueue_write(0, bytes, image)); - widthBufferVector.push_back(bufferFutures[j][1].get()); - data_futures[j].push_back( - widthBufferVector[j].enqueue_write(0, sizeof(int), - ¤tWidth)); - heightBufferVector.push_back(bufferFutures[j][2].get()); - data_futures[j].push_back( - heightBufferVector[j].enqueue_write(0, sizeof(int), - ¤tHeight)); - yStartBufferVector.push_back(bufferFutures[j][3].get()); - data_futures[j].push_back( - yStartBufferVector[j].enqueue_write(0, sizeof(int), - &yStart[j])); - - nBufferVector.push_back(bufferFutures[j][4].get()); - data_futures[j].push_back( - nBufferVector[j].enqueue_write(0, sizeof(int), &n)); - - //Synchronize copy to buffer - hpx::wait_all (data_futures[j]); - //wait for program to build on all devices - hpx::wait_all(progCompileVector); - - //Launch the kernel - args.push_back(imageBufferVector[j]); - args.push_back(widthBufferVector[j]); - args.push_back(heightBufferVector[j]); - args.push_back(yStartBufferVector[j]); - args.push_back(nBufferVector[j]); + // Reading a list of available devices in hpx locality + std::vector devices = get_all_devices(2, 0).get(); + + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found! Terminating...." << hpx::endl; + return hpx::finalize(); + } + + if (argc != 5) { + std::cout << "Usage: " << argv[0] << " width height" << std::endl; + exit(1); + } + + int width = atoi(argv[1]); + int height = atoi(argv[2]); + int iterations = atoi(argv[3]); + int numDevices = atoi(argv[4]); + + // Vector for all futures for the data management + std::vector>> data_futures(numDevices); + + char* image; + char* mainImage; + + std::vector> writeImages; + + for (size_t it = 0; it < iterations; it++) { + timer_start(); + int currentWidth = width * std::pow(2, it + 1) / 2; + int currentHeight = height * std::pow(2, it + 1) / 2; + const int bytes = sizeof(char) * currentWidth * currentHeight * 3; + int n = currentWidth * currentHeight * 3; + + std::cout << bytes << ","; + // Malloc Host + cudaMallocHost((void**)&image, bytes); + checkCudaError("Malloc image"); + char* mainImage = (char*)malloc(bytes); + + std::vector args; + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + + block.x = 16; + block.y = 16; + block.z = 1; + + grid.x = currentWidth / block.x; + grid.y = 1 + std::ceil(currentHeight / (block.y * numDevices)); + grid.z = 1; + + std::vector> progBuildVector; + std::vector progVector; + + // creating vector of futures + std::vector> kernelFutures; + + for (int j = 0; j < numDevices; j++) { + progBuildVector.push_back( + devices[j].create_program_with_file("mandel_brot_kernel.cu")); + } + + hpx::wait_all(progBuildVector); + + std::vector> progCompileVector; + + for (int j = 0; j < numDevices; j++) { + progVector.push_back(progBuildVector[j].get()); + + // Compile with the kernel + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append( + std::to_string(devices[j].get_device_architecture_major().get())); + mode.append( + std::to_string(devices[j].get_device_architecture_minor().get())); + + flags.push_back(mode); + + progCompileVector.push_back(progVector[j].build(flags, "kernel")); + } + + std::vector>> bufferFutures( + numDevices); + // Buffer vectors + std::vector imageBufferVector; + std::vector widthBufferVector; + std::vector heightBufferVector; + std::vector yStartBufferVector; + std::vector nBufferVector; + + int yStart[numDevices]; + // creating buffers + for (int j = 0; j < numDevices; j++) { + // Image buffer + bufferFutures[j].push_back(devices[j].create_buffer(bytes)); + // Width buffer + bufferFutures[j].push_back(devices[j].create_buffer(sizeof(int))); + // Height buffer + bufferFutures[j].push_back(devices[j].create_buffer(sizeof(int))); + // yStart buffer + bufferFutures[j].push_back(devices[j].create_buffer(sizeof(int))); + // n buffer + bufferFutures[j].push_back(devices[j].create_buffer(sizeof(int))); + + hpx::wait_all(bufferFutures[j]); + + yStart[j] = j * currentHeight / numDevices; + + imageBufferVector.push_back(bufferFutures[j][0].get()); + data_futures[j].push_back( + imageBufferVector[j].enqueue_write(0, bytes, image)); + widthBufferVector.push_back(bufferFutures[j][1].get()); + data_futures[j].push_back( + widthBufferVector[j].enqueue_write(0, sizeof(int), ¤tWidth)); + heightBufferVector.push_back(bufferFutures[j][2].get()); + data_futures[j].push_back( + heightBufferVector[j].enqueue_write(0, sizeof(int), ¤tHeight)); + yStartBufferVector.push_back(bufferFutures[j][3].get()); + data_futures[j].push_back( + yStartBufferVector[j].enqueue_write(0, sizeof(int), &yStart[j])); + + nBufferVector.push_back(bufferFutures[j][4].get()); + data_futures[j].push_back( + nBufferVector[j].enqueue_write(0, sizeof(int), &n)); + + // Synchronize copy to buffer + hpx::wait_all(data_futures[j]); + // wait for program to build on all devices + hpx::wait_all(progCompileVector); + + // Launch the kernel + args.push_back(imageBufferVector[j]); + args.push_back(widthBufferVector[j]); + args.push_back(heightBufferVector[j]); + args.push_back(yStartBufferVector[j]); + args.push_back(nBufferVector[j]); #ifdef HPXCL_CUDA_WITH_STREAMS - kernelFutures.push_back(progVector[j].run(args, "kernel", grid, block, args)); + kernelFutures.push_back( + progVector[j].run(args, "kernel", grid, block, args)); #else - kernelFutures.push_back( - progVector[j].run(args, "kernel", grid, block)); + kernelFutures.push_back(progVector[j].run(args, "kernel", grid, block)); #endif - //for multiple runs - args.clear(); - - } - - //wait for all the kernel futures to return - hpx::wait_all(kernelFutures); - - //write images to file - std::shared_ptr> img_data; - - //Stich multiple images - for (int j = 0; j < numDevices; j++) { - image = imageBufferVector.at(j).enqueue_read_sync(0, - bytes / numDevices); - std::copy(image, - image + currentWidth * (currentHeight / numDevices) * 3 - 1, - mainImage - + currentWidth * (currentHeight / numDevices) * 3 - * j); - } - img_data = std::make_shared < std::vector - > (mainImage, mainImage + bytes); - - writeImages.push_back( - hpx::async(save_png_it, img_data, currentWidth, currentHeight, - it)); - //save_png_it(img_data, currentWidth, currentHeight,i); - std::cout << timer_stop() << std::endl; - } - - hpx::wait_all(writeImages); - - //Free Memory - cudaFreeHost(image); - checkCudaError("Free image"); - cudaFreeHost(mainImage); - checkCudaError("Free mainImage"); - - return EXIT_SUCCESS; + // for multiple runs + args.clear(); + } + + // wait for all the kernel futures to return + hpx::wait_all(kernelFutures); + + // write images to file + std::shared_ptr> img_data; + + // Stich multiple images + for (int j = 0; j < numDevices; j++) { + image = imageBufferVector.at(j).enqueue_read_sync( + 0, bytes / numDevices); + std::copy( + image, image + currentWidth * (currentHeight / numDevices) * 3 - 1, + mainImage + currentWidth * (currentHeight / numDevices) * 3 * j); + } + img_data = + std::make_shared>(mainImage, mainImage + bytes); + + writeImages.push_back( + hpx::async(save_png_it, img_data, currentWidth, currentHeight, it)); + // save_png_it(img_data, currentWidth, currentHeight,i); + std::cout << timer_stop() << std::endl; + } + + hpx::wait_all(writeImages); + + // Free Memory + cudaFreeHost(image); + checkCudaError("Free image"); + cudaFreeHost(mainImage); + checkCudaError("Free mainImage"); + + return EXIT_SUCCESS; } diff --git a/examples/cuda/p2p_test.cpp b/examples/cuda/p2p_test.cpp index 9df15f21..8d68a3f9 100644 --- a/examples/cuda/p2p_test.cpp +++ b/examples/cuda/p2p_test.cpp @@ -11,178 +11,163 @@ using namespace hpx::cuda; #define SIZE 8 int main(int argc, char* argv[]) { + auto start = std::chrono::steady_clock::now(); - auto start = std::chrono::steady_clock::now(); + std::vector> data_futures; - std::vector> data_futures; + std::vector devices = get_all_devices(2, 0).get(); - std::vector devices = get_all_devices(2, 0).get(); + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } + int* input_1; + cudaMallocHost((void**)&input_1, sizeof(int) * SIZE); + checkCudaError("Malloc inputData"); - + for (int i = 0; i < SIZE; i++) { + input_1[i] = 1; + } - int* input_1; - cudaMallocHost((void**)&input_1, sizeof(int) * SIZE); - checkCudaError("Malloc inputData"); + for (int i = 0; i < SIZE - 1; i++) { + std::cout << input_1[i] << ", "; + } + std::cout << input_1[SIZE - 1] << std::endl; - for(int i = 0; i < SIZE; i++){ - input_1[i] = 1; - } + device cudaDevice_1 = devices[0]; - for (int i = 0; i < SIZE-1; i++){ - std::cout << input_1[i] << ", "; - } - std::cout << input_1[SIZE-1] << std::endl; + buffer inbuffer_1 = cudaDevice_1.create_buffer(sizeof(int) * SIZE).get(); - device cudaDevice_1 = devices[0]; + data_futures.push_back( + inbuffer_1.enqueue_write(0, sizeof(int) * SIZE, input_1)); - buffer inbuffer_1 = cudaDevice_1.create_buffer(sizeof(int) * SIZE).get(); + program prog_1 = + cudaDevice_1.create_program_with_file("p2p_test_kernel.cu").get(); - data_futures.push_back(inbuffer_1.enqueue_write(0, sizeof(int) * SIZE, input_1)); + device cudaDevice_2 = devices[1]; - program prog_1 = cudaDevice_1.create_program_with_file("p2p_test_kernel.cu").get(); + buffer inbuffer_2 = cudaDevice_2.create_buffer(sizeof(int) * SIZE).get(); + program prog_2 = + cudaDevice_2.create_program_with_file("p2p_test_kernel.cu").get(); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Add compiler flags for compiling the kernel + std::vector flags_1; + std::string mode_1 = "--gpu-architecture=compute_"; + mode_1.append( + std::to_string(cudaDevice_1.get_device_architecture_major().get())); + mode_1.append( + std::to_string(cudaDevice_1.get_device_architecture_minor().get())); + flags_1.push_back(mode_1); - device cudaDevice_2 = devices[1]; + std::vector flags_2; + std::string mode_2 = "--gpu-architecture=compute_"; + mode_2.append( + std::to_string(cudaDevice_2.get_device_architecture_major().get())); + mode_2.append( + std::to_string(cudaDevice_2.get_device_architecture_minor().get())); + flags_2.push_back(mode_2); - buffer inbuffer_2 = cudaDevice_2.create_buffer(sizeof(int) * SIZE).get(); + // Compile the program + prog_1.build_sync(flags_1, "multiply_1"); + prog_2.build_sync(flags_2, "multiply_2"); + //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - program prog_2 = cudaDevice_2.create_program_with_file("p2p_test_kernel.cu").get(); + int* output_1; + cudaMallocHost((void**)&output_1, sizeof(int) * SIZE); + checkCudaError("Malloc result"); + buffer outbuffer_1 = cudaDevice_1.create_buffer(sizeof(int) * SIZE).get(); + data_futures.push_back( + outbuffer_1.enqueue_write(0, sizeof(int) * SIZE, output_1)); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// - // Add compiler flags for compiling the kernel - std::vector flags_1; - std::string mode_1 = "--gpu-architecture=compute_"; - mode_1.append(std::to_string(cudaDevice_1.get_device_architecture_major().get())); - mode_1.append(std::to_string(cudaDevice_1.get_device_architecture_minor().get())); - flags_1.push_back(mode_1); + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; - std::vector flags_2; - std::string mode_2 = "--gpu-architecture=compute_"; - mode_2.append(std::to_string(cudaDevice_2.get_device_architecture_major().get())); - mode_2.append(std::to_string(cudaDevice_2.get_device_architecture_minor().get())); - flags_2.push_back(mode_2); + // Set the values for the grid dimension + grid.x = 1; + grid.y = 1; + grid.z = 1; - // Compile the program - prog_1.build_sync(flags_1, "multiply_1"); - prog_2.build_sync(flags_2, "multiply_2"); -//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// + // Set the values for the block dimension + block.x = 1; + block.y = 1; + block.z = 1; - int* output_1; - cudaMallocHost((void**)&output_1, sizeof(int) * SIZE); - checkCudaError("Malloc result"); - - - buffer outbuffer_1 = cudaDevice_1.create_buffer(sizeof(int) * SIZE).get(); - data_futures.push_back(outbuffer_1.enqueue_write(0, sizeof(int) * SIZE, output_1)); + int* n; + cudaMallocHost((void**)&n, sizeof(int)); + n[0] = SIZE; + buffer sizebuffer_1 = cudaDevice_1.create_buffer(sizeof(int)).get(); + data_futures.push_back(sizebuffer_1.enqueue_write(0, sizeof(int), n)); - // Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; + // Set the parameter for the kernel, have to be the same order as in the + // definition + std::vector args_1; + args_1.push_back(inbuffer_1); + args_1.push_back(outbuffer_1); + args_1.push_back(sizebuffer_1); - // Set the values for the grid dimension - grid.x = 1; - grid.y = 1; - grid.z = 1; + hpx::wait_all(data_futures); - // Set the values for the block dimension - block.x = 1; - block.y = 1; - block.z = 1; + // Run the kernel at the default stream + auto kernel_future_1 = prog_1.run(args_1, "multiply_1", grid, block, 0); + hpx::wait_all(kernel_future_1); + // Copy the result back + int* res_1 = outbuffer_1.enqueue_read_sync(0, sizeof(int) * SIZE); - int* n; - cudaMallocHost((void**)&n, sizeof(int)); - n[0] = SIZE; + for (int i = 0; i < SIZE - 1; i++) { + std::cout << res_1[i] << ", "; + } + std::cout << res_1[SIZE - 1] << std::endl; - buffer sizebuffer_1 = cudaDevice_1.create_buffer(sizeof(int)).get(); - data_futures.push_back(sizebuffer_1.enqueue_write(0, sizeof(int), n)); + auto f = outbuffer_1.p2p_copy(inbuffer_2.get_device_pointer().get(), + inbuffer_2.get_device_id().get(), + sizeof(int) * SIZE); + f.get(); + int* output_2; + cudaMallocHost((void**)&output_2, sizeof(int) * SIZE); + checkCudaError("Malloc result"); - // Set the parameter for the kernel, have to be the same order as in the definition - std::vector args_1; - args_1.push_back(inbuffer_1); - args_1.push_back(outbuffer_1); - args_1.push_back(sizebuffer_1); + buffer outbuffer_2 = cudaDevice_2.create_buffer(sizeof(int) * SIZE).get(); + data_futures.push_back( + outbuffer_2.enqueue_write(0, sizeof(int) * SIZE, output_2)); - hpx::wait_all(data_futures); + buffer sizebuffer_2 = cudaDevice_2.create_buffer(sizeof(int)).get(); + data_futures.push_back(sizebuffer_2.enqueue_write(0, sizeof(int), n)); - //Run the kernel at the default stream - auto kernel_future_1 = prog_1.run(args_1, "multiply_1", grid, block, 0); + // Set the parameter for the kernel, have to be the same order as in the + // definition + std::vector args_2; + args_2.push_back(inbuffer_2); + args_2.push_back(outbuffer_2); + args_2.push_back(sizebuffer_2); + + hpx::wait_all(data_futures); - hpx::wait_all(kernel_future_1); + // Run the kernel at the default stream + auto kernel_future_2 = prog_2.run(args_2, "multiply_2", grid, block, 0); + + hpx::wait_all(kernel_future_2); - - //Copy the result back - int* res_1 = outbuffer_1.enqueue_read_sync(0, sizeof(int) * SIZE); + // Copy the result back + int* res_2 = outbuffer_2.enqueue_read_sync(0, sizeof(int) * SIZE); - for (int i = 0; i < SIZE-1; i++){ - std::cout << res_1[i] << ", "; - } - std::cout << res_1[SIZE-1] << std::endl; + for (int i = 0; i < SIZE - 1; i++) { + std::cout << res_2[i] << ", "; + } + std::cout << res_2[SIZE - 1] << std::endl; + auto end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n"; - - auto f = outbuffer_1.p2p_copy(inbuffer_2.get_device_pointer().get(), inbuffer_2.get_device_id().get(), sizeof(int) * SIZE); - - f.get(); - - - - int* output_2; - cudaMallocHost((void**)&output_2, sizeof(int) * SIZE); - checkCudaError("Malloc result"); - - - buffer outbuffer_2 = cudaDevice_2.create_buffer(sizeof(int) * SIZE).get(); - data_futures.push_back(outbuffer_2.enqueue_write(0, sizeof(int) * SIZE, output_2)); - - buffer sizebuffer_2 = cudaDevice_2.create_buffer(sizeof(int)).get(); - data_futures.push_back(sizebuffer_2.enqueue_write(0, sizeof(int), n)); - - // Set the parameter for the kernel, have to be the same order as in the definition - std::vector args_2; - args_2.push_back(inbuffer_2); - args_2.push_back(outbuffer_2); - args_2.push_back(sizebuffer_2); - - hpx::wait_all(data_futures); - - //Run the kernel at the default stream - auto kernel_future_2 = prog_2.run(args_2, "multiply_2", grid, block, 0); - - hpx::wait_all(kernel_future_2); - - //Copy the result back - int* res_2 = outbuffer_2.enqueue_read_sync(0, sizeof(int) * SIZE); - - for (int i = 0; i < SIZE-1; i++){ - std::cout << res_2[i] << ", "; - } - std::cout << res_2[SIZE-1] << std::endl; - - - - - - - - - - - auto end = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_seconds = end-start; - std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n"; - - return EXIT_SUCCESS; + return EXIT_SUCCESS; } - diff --git a/examples/cuda/shared_memory.cpp b/examples/cuda/shared_memory.cpp index 0a9ca439..71469edc 100644 --- a/examples/cuda/shared_memory.cpp +++ b/examples/cuda/shared_memory.cpp @@ -1,5 +1,5 @@ // Copyright (c) 2021 Patrick Diehl -// 2021 Pedro Barbosa +// 2021 Pedro Barbosa // // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) @@ -16,121 +16,112 @@ using namespace hpx::cuda; #define SIZE 8 -void print_array(int* a){ - for(int i = 0; i < SIZE-1; i++){ - std::cout << a[i] << ", "; - } - std::cout << a[SIZE-1] << std::endl; +void print_array(int* a) { + for (int i = 0; i < SIZE - 1; i++) { + std::cout << a[i] << ", "; + } + std::cout << a[SIZE - 1] << std::endl; } - - int main(int argc, char* argv[]) { + auto start = std::chrono::steady_clock::now(); - auto start = std::chrono::steady_clock::now(); - - // Vector for all futures for the data management - std::vector> data_futures; - - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); - - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } + // Vector for all futures for the data management + std::vector> data_futures; + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); - // Generate Input data - int* input; - cudaMallocHost((void**)&input, sizeof(int) * SIZE); - checkCudaError("Malloc inputData"); + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } - for(int i = 0; i < SIZE; i++){ - input[i] = i; - } + // Generate Input data + int* input; + cudaMallocHost((void**)&input, sizeof(int) * SIZE); + checkCudaError("Malloc inputData"); - print_array(input); + for (int i = 0; i < SIZE; i++) { + input[i] = i; + } - // Create a device component from the first device found - device cudaDevice = devices[0]; + print_array(input); - // Create a buffer - buffer inbuffer = cudaDevice.create_buffer(sizeof(int) * SIZE).get(); + // Create a device component from the first device found + device cudaDevice = devices[0]; - // Copy input data to the buffer - data_futures.push_back(inbuffer.enqueue_write(0, sizeof(int) * SIZE, input)); + // Create a buffer + buffer inbuffer = cudaDevice.create_buffer(sizeof(int) * SIZE).get(); - // Create the example_shared_kernel device program - program prog = cudaDevice.create_program_with_file("example_shared_kernel.cu").get(); + // Copy input data to the buffer + data_futures.push_back(inbuffer.enqueue_write(0, sizeof(int) * SIZE, input)); + // Create the example_shared_kernel device program + program prog = + cudaDevice.create_program_with_file("example_shared_kernel.cu").get(); - // Add compiler flags for compiling the kernel - std::vector flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); - mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); - flags.push_back(mode); + // Add compiler flags for compiling the kernel + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); + flags.push_back(mode); - // Compile the program - prog.build_sync(flags, "dynamicReverse"); + // Compile the program + prog.build_sync(flags, "dynamicReverse"); + // Create the buffer for the length of the array + int* n; + cudaMallocHost((void**)&n, sizeof(int)); + checkCudaError("Malloc size n"); + n[0] = SIZE; + buffer sizebuffer = cudaDevice.create_buffer(sizeof(int)).get(); + data_futures.push_back(sizebuffer.enqueue_write(0, sizeof(int), n)); - // Create the buffer for the length of the array - int* n; - cudaMallocHost((void**)&n, sizeof(int)); - checkCudaError("Malloc size n"); - n[0] = SIZE; - buffer sizebuffer = cudaDevice.create_buffer(sizeof(int)).get(); - data_futures.push_back(sizebuffer.enqueue_write(0, sizeof(int), n)); + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + // Set the values for the grid dimension + grid.x = 1; + grid.y = 1; + grid.z = 1; - // Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; + // Set the values for the block dimension + block.x = SIZE; + block.y = 1; + block.z = 1; - // Set the values for the grid dimension - grid.x = 1; - grid.y = 1; - grid.z = 1; + // Set the parameter for the kernel, have to be the same order as in the + // definition + std::vector args; + args.push_back(inbuffer); + args.push_back(sizebuffer); - // Set the values for the block dimension - block.x = SIZE; - block.y = 1; - block.z = 1; + hpx::wait_all(data_futures); + // Run the kernel at the default stream + auto kernel_future = + prog.run(args, "dynamicReverse", grid, block, SIZE * sizeof(int)); - // Set the parameter for the kernel, have to be the same order as in the definition - std::vector args; - args.push_back(inbuffer); - args.push_back(sizebuffer); + hpx::wait_all(kernel_future); - hpx::wait_all(data_futures); + // Copy the result back + int* res = inbuffer.enqueue_read_sync(0, SIZE * sizeof(int)); - // Run the kernel at the default stream - auto kernel_future = prog.run(args, "dynamicReverse", grid, block, SIZE*sizeof(int)); + // Print the result + print_array(res); - hpx::wait_all(kernel_future); + cudaFreeHost(n); + checkCudaError("Free n"); + cudaFreeHost(input); + checkCudaError("Free input"); - - // Copy the result back - int* res = inbuffer.enqueue_read_sync(0, SIZE*sizeof(int)); - - // Print the result - print_array(res); + auto end = std::chrono::steady_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n"; - cudaFreeHost(n); - checkCudaError("Free n"); - cudaFreeHost(input); - checkCudaError("Free input"); - - - auto end = std::chrono::steady_clock::now(); - std::chrono::duration elapsed_seconds = end-start; - std::cout << "elapsed time: " << elapsed_seconds.count() << "s\n"; - - return EXIT_SUCCESS; + return EXIT_SUCCESS; } - - diff --git a/examples/cuda/streams.cpp b/examples/cuda/streams.cpp index 6cc393d8..fdccf182 100644 --- a/examples/cuda/streams.cpp +++ b/examples/cuda/streams.cpp @@ -14,127 +14,126 @@ using namespace hpx::cuda; //########################################################################### -//Kernels +// Kernels //########################################################################### static const char kernel_src[] = -"extern \"C\" __global__ void kernel(float* in) { \n" - " \n" - " size_t i = threadIdx.x + blockIdx.x * blockDim.x; \n" - " float x = (float) i; \n" - " float s = sinf(x); \n" - " float c = cosf(x); \n" - " in[i] = in[i] + sqrtf(s * s + c * c); \n" - " \n" - "} \n"; + "extern \"C\" __global__ void kernel(float* in) { " + " \n" + " " + " \n" + " size_t i = threadIdx.x + blockIdx.x * blockDim.x; " + " \n" + " float x = (float) i; " + " \n" + " float s = sinf(x); " + " \n" + " float c = cosf(x); " + " \n" + " in[i] = in[i] + sqrtf(s * s + c * c); " + " \n" + " " + " \n" + "} " + " \n"; //########################################################################### -//Main +// Main //########################################################################### -int main(int argc, char*argv[]) { +int main(int argc, char* argv[]) { + // Get list of available Cuda Devices. + std::vector devices = get_all_devices(2, 0).get(); - // Get list of available Cuda Devices. - std::vector devices = get_all_devices(2, 0).get(); + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No CUDA devices found!" << hpx::endl; + return hpx::finalize(); + } - // Check whether there are any devices - if (devices.size() < 1) { - hpx::cerr << "No CUDA devices found!" << hpx::endl; - return hpx::finalize(); - } + const int blockSize = 256, nStreams = 4; - const int blockSize = 256, nStreams = 4; + if (argc != 2) { + std::cout << "Usage: " << argv[0] << " n -> 2^n*1024*" << blockSize << "*" + << nStreams << " elements" << std::endl; + exit(1); + } - if (argc != 2) { - std::cout << "Usage: " << argv[0] << " n -> 2^n*1024*" << blockSize - << "*" << nStreams << " elements" << std::endl; - exit(1); - } + double time = 0; + size_t count = atoi(argv[1]); - double time = 0; - size_t count = atoi(argv[1]); + const int n = pow(2, count) * 1024 * blockSize * nStreams; + const int streamSize = n / nStreams; + const int streamBytes = streamSize * sizeof(float); + const int bytes = n * sizeof(float); - const int n = pow(2,count) * 1024 * blockSize * nStreams; - const int streamSize = n / nStreams; - const int streamBytes = streamSize * sizeof(float); - const int bytes = n * sizeof(float); + // Malloc Host + float* in; + cudaMallocHost((void**)&in, bytes); + checkCudaError("Malloc in"); - //Malloc Host - float* in; - cudaMallocHost((void**) &in, bytes); - checkCudaError("Malloc in"); + memset(in, 0, bytes); - memset(in, 0, bytes); + // Create a device component from the first device found + device cudaDevice = devices[0]; - // Create a device component from the first device found - device cudaDevice = devices[0]; + // Create the hello_world device program + program prog = cudaDevice.create_program_with_source(kernel_src).get(); - // Create the hello_world device program - program prog = cudaDevice.create_program_with_source(kernel_src).get(); + // Add compiler flags for compiling the kernel + std::vector flags; + std::string mode = "--gpu-architecture=compute_"; + mode.append(std::to_string(cudaDevice.get_device_architecture_major().get())); + mode.append(std::to_string(cudaDevice.get_device_architecture_minor().get())); - // Add compiler flags for compiling the kernel - std::vector flags; - std::string mode = "--gpu-architecture=compute_"; - mode.append( - std::to_string(cudaDevice.get_device_architecture_major().get())); - mode.append( - std::to_string(cudaDevice.get_device_architecture_minor().get())); + flags.push_back(mode); - flags.push_back(mode); + auto f = prog.build(flags, "kernel"); - auto f = prog.build(flags, "kernel"); + std::vector bufferIn; + for (size_t i = 0; i < nStreams; i++) { + bufferIn.push_back(cudaDevice.create_buffer(streamBytes).get()); + } - std::vector bufferIn; - for (size_t i = 0; i < nStreams; i++) - { - bufferIn.push_back(cudaDevice.create_buffer(streamBytes).get()); + for (size_t i = 0; i < nStreams; i++) { + bufferIn[i].enqueue_write(i * streamSize, streamBytes, in); + } - } + std::vector args; + // Generate the grid and block dim + hpx::cuda::server::program::Dim3 grid; + hpx::cuda::server::program::Dim3 block; + // Set the values for the grid dimension + grid.x = streamSize / blockSize; + grid.y = 1; + grid.z = 1; + // Set the values for the block dimension + block.x = blockSize; + block.y = 1; + block.z = 1; - for (size_t i = 0; i < nStreams; i++) - { + // hpx::wait_all(dependencies); - bufferIn[i].enqueue_write(i*streamSize,streamBytes,in); - } + std::vector> kernelFutures; + hpx::wait_all(f); + for (size_t i = 0; i < nStreams; i++) { + args.push_back(bufferIn[i]); +#ifdef HPXCL_CUDA_WITH_STREAMS + kernelFutures.push_back(prog.run(args, "kernel", grid, block, 0)); +#else + kernelFutures.push_back(prog.run(args, "kernel", grid, block, 0)); +#endif + args.clear(); + } - std::vector args; - //Generate the grid and block dim - hpx::cuda::server::program::Dim3 grid; - hpx::cuda::server::program::Dim3 block; + hpx::wait_all(kernelFutures); - //Set the values for the grid dimension - grid.x = streamSize / blockSize; - grid.y = 1; - grid.z = 1; + // Clean + cudaFreeHost(in); + checkCudaError("Free in"); - //Set the values for the block dimension - block.x = blockSize; - block.y = 1; - block.z = 1; - - //hpx::wait_all(dependencies); - - std::vector> kernelFutures; - hpx::wait_all(f); - for (size_t i = 0; i < nStreams; i++) - { - args.push_back(bufferIn[i]); - #ifdef HPXCL_CUDA_WITH_STREAMS - kernelFutures.push_back(prog.run(args, "kernel", grid, block,0)); - #else - kernelFutures.push_back(prog.run(args, "kernel", grid, block,0)); - #endif - args.clear(); - } - - hpx::wait_all(kernelFutures); - - //Clean - cudaFreeHost(in); - checkCudaError("Free in"); - - return EXIT_SUCCESS; + return EXIT_SUCCESS; } diff --git a/examples/opencl/benchmark_vector/directcl.hpp b/examples/opencl/benchmark_vector/directcl.hpp index 61268845..a60fe238 100644 --- a/examples/opencl/benchmark_vector/directcl.hpp +++ b/examples/opencl/benchmark_vector/directcl.hpp @@ -14,31 +14,31 @@ #include - -static cl_context directcl_context; -static cl_command_queue directcl_command_queue; -static cl_program directcl_program; -static cl_kernel directcl_exp_kernel; -static cl_kernel directcl_log_kernel; -static cl_kernel directcl_add_kernel; -static cl_kernel directcl_mul_kernel; -static cl_kernel directcl_dbl_kernel; -static cl_mem directcl_buffer_a; -static cl_mem directcl_buffer_b; -static cl_mem directcl_buffer_c; -static cl_mem directcl_buffer_m; -static cl_mem directcl_buffer_n; -static cl_mem directcl_buffer_o; -static cl_mem directcl_buffer_p; -static cl_mem directcl_buffer_z; - -#define directcl_check(ret) { \ - if((ret) != CL_SUCCESS){ \ - hpx::cout << "directcl.hpp:" << __LINE__ \ - << ": CL ERROR: " << (ret) << hpx::endl; \ - exit(1); \ - } \ - } +static cl_context directcl_context; +static cl_command_queue directcl_command_queue; +static cl_program directcl_program; +static cl_kernel directcl_exp_kernel; +static cl_kernel directcl_log_kernel; +static cl_kernel directcl_add_kernel; +static cl_kernel directcl_mul_kernel; +static cl_kernel directcl_dbl_kernel; +static cl_mem directcl_buffer_a; +static cl_mem directcl_buffer_b; +static cl_mem directcl_buffer_c; +static cl_mem directcl_buffer_m; +static cl_mem directcl_buffer_n; +static cl_mem directcl_buffer_o; +static cl_mem directcl_buffer_p; +static cl_mem directcl_buffer_z; + +#define directcl_check(ret) \ + { \ + if ((ret) != CL_SUCCESS) { \ + hpx::cout << "directcl.hpp:" << __LINE__ << ": CL ERROR: " << (ret) \ + << hpx::endl; \ + exit(1); \ + } \ + } /*static void directcl_check(cl_int ret) { @@ -50,377 +50,339 @@ static cl_mem directcl_buffer_z; }*/ -static cl_device_id directcl_choose_device() -{ - - cl_int ret; - - // get number of platform ids - cl_uint num_platforms; - ret = clGetPlatformIDs(0, NULL, &num_platforms); - directcl_check(ret); - - // get platform ids - std::vector platforms(num_platforms); - ret = clGetPlatformIDs(num_platforms, platforms.data(), NULL); +static cl_device_id directcl_choose_device() { + cl_int ret; + + // get number of platform ids + cl_uint num_platforms; + ret = clGetPlatformIDs(0, NULL, &num_platforms); + directcl_check(ret); + + // get platform ids + std::vector platforms(num_platforms); + ret = clGetPlatformIDs(num_platforms, platforms.data(), NULL); + directcl_check(ret); + + /* + // Print Platforms + hpx::cout << "Platforms:" << hpx::endl; + for(cl_uint i = 0; i < num_platforms; i++) + { + char platformName[100]; + char platformVendor[100]; + + ret = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 100, + platformName, NULL); + directcl_check(ret); + ret = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 100, + platformVendor, NULL); + directcl_check(ret); + + hpx::cout << i << ": " << platformName << " (" << platformVendor << + ")" + << hpx::endl; + } + + // Lets you choose a platform + cl_uint platform_num; + hpx::cout << "Choose platform: " << hpx::endl; + std::cin >> platform_num; + if(platform_num < 0 || platform_num >= num_platforms) + exit(0); + */ + + // Ensure that we found a platforms + if (num_platforms < 1) { + hpx::cout << "No OpenCL platforms found!" << hpx::endl; + exit(1); + } + + // Select the platform + cl_uint num_devices = 0; + cl_platform_id platform = 0; + for (auto& current_platform : platforms) { + // get number of device ids + ret = clGetDeviceIDs(current_platform, CL_DEVICE_TYPE_GPU, 0, NULL, + &num_devices); + if (ret == CL_DEVICE_NOT_FOUND) continue; directcl_check(ret); -/* - // Print Platforms - hpx::cout << "Platforms:" << hpx::endl; - for(cl_uint i = 0; i < num_platforms; i++) - { - char platformName[100]; - char platformVendor[100]; - - ret = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 100, - platformName, NULL); - directcl_check(ret); - ret = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 100, - platformVendor, NULL); - directcl_check(ret); - - hpx::cout << i << ": " << platformName << " (" << platformVendor << ")" - << hpx::endl; - } - - // Lets you choose a platform - cl_uint platform_num; - hpx::cout << "Choose platform: " << hpx::endl; - std::cin >> platform_num; - if(platform_num < 0 || platform_num >= num_platforms) - exit(0); -*/ - - // Ensure that we found a platforms - if(num_platforms < 1) + // Print platform name + hpx::cout << "Platform:" << hpx::endl; { - hpx::cout << "No OpenCL platforms found!" << hpx::endl; - exit(1); + char platformName[100]; + char platformVendor[100]; + + ret = clGetPlatformInfo(current_platform, CL_PLATFORM_NAME, 100, + platformName, NULL); + directcl_check(ret); + ret = clGetPlatformInfo(current_platform, CL_PLATFORM_VENDOR, 100, + platformVendor, NULL); + directcl_check(ret); + + hpx::cout << " " << platformName << " (" << platformVendor << ")" + << hpx::endl; } - // Select the platform - cl_uint num_devices = 0; - cl_platform_id platform = 0; - for(auto & current_platform : platforms){ + platform = current_platform; + break; + } - // get number of device ids - ret = clGetDeviceIDs( current_platform, CL_DEVICE_TYPE_GPU, 0, NULL, - &num_devices); - if(ret == CL_DEVICE_NOT_FOUND) continue; - directcl_check(ret); + // Ensure that we found a platforms + if (num_devices < 1) { + hpx::cout << "No OpenCL devices found!" << hpx::endl; + exit(1); + } + // get device ids + std::vector devices(num_devices); + ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, + devices.data(), NULL); - // Print platform name - hpx::cout << "Platform:" << hpx::endl; - { - char platformName[100]; - char platformVendor[100]; - - ret = clGetPlatformInfo( current_platform, CL_PLATFORM_NAME, 100, - platformName, NULL ); - directcl_check(ret); - ret = clGetPlatformInfo( current_platform, CL_PLATFORM_VENDOR, 100, - platformVendor, NULL ); - directcl_check(ret); - - hpx::cout << " " << platformName << " (" << platformVendor << ")" - << hpx::endl; - } - - platform = current_platform; - break; - - } - - // Ensure that we found a platforms - if(num_devices < 1) - { - hpx::cout << "No OpenCL devices found!" << hpx::endl; - exit(1); - } - - // get device ids - std::vector devices(num_devices); - ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, num_devices, devices.data(), - NULL); - - // Print devices - hpx::cout << "Device:" << hpx::endl; - { - char deviceName[100]; + // Print devices + hpx::cout << "Device:" << hpx::endl; + { + char deviceName[100]; - ret = clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 100, - deviceName, NULL); - directcl_check(ret); - - hpx::cout << " " << deviceName << hpx::endl; - } + ret = clGetDeviceInfo(devices[0], CL_DEVICE_NAME, 100, deviceName, NULL); + directcl_check(ret); - return devices[0]; + hpx::cout << " " << deviceName << hpx::endl; + } + return devices[0]; } -static void directcl_initialize(size_t vector_size) -{ - - cl_device_id device_id = directcl_choose_device(); - - cl_int err; - - // Create context - directcl_context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); - directcl_check(err); - - // Create command queue - directcl_command_queue = clCreateCommandQueue(directcl_context, device_id, - 0, &err); - directcl_check(err); - - // Create program - const char* gpu_code_ptr = gpu_code; - directcl_program = clCreateProgramWithSource( directcl_context, 1, - &gpu_code_ptr, - NULL, &err); - directcl_check(err); - - // Build program - err = clBuildProgram(directcl_program, 1, &device_id, NULL, NULL, NULL); - - // Create kernels - directcl_log_kernel = clCreateKernel(directcl_program, "logn", &err); - directcl_check(err); - directcl_exp_kernel = clCreateKernel(directcl_program, "expn", &err); - directcl_check(err); - directcl_mul_kernel = clCreateKernel(directcl_program, "mul", &err); - directcl_check(err); - directcl_add_kernel = clCreateKernel(directcl_program, "add", &err); - directcl_check(err); - directcl_dbl_kernel = clCreateKernel(directcl_program, "dbl", &err); - directcl_check(err); - - // Create buffers - directcl_buffer_a = clCreateBuffer(directcl_context, - CL_MEM_READ_ONLY, - vector_size * sizeof(float), NULL, &err); - directcl_check(err); - directcl_buffer_b = clCreateBuffer(directcl_context, - CL_MEM_READ_ONLY, - vector_size * sizeof(float), NULL, &err); - directcl_check(err); - directcl_buffer_c = clCreateBuffer(directcl_context, - CL_MEM_READ_ONLY, - vector_size * sizeof(float), NULL, &err); - directcl_check(err); - directcl_buffer_m = clCreateBuffer(directcl_context, - CL_MEM_READ_WRITE, - vector_size * sizeof(float), NULL, &err); - directcl_check(err); - directcl_buffer_n = clCreateBuffer(directcl_context, - CL_MEM_READ_WRITE, - vector_size * sizeof(float), NULL, &err); - directcl_check(err); - directcl_buffer_o = clCreateBuffer(directcl_context, - CL_MEM_READ_WRITE, - vector_size * sizeof(float), NULL, &err); - directcl_check(err); - directcl_buffer_p = clCreateBuffer(directcl_context, - CL_MEM_READ_WRITE, - vector_size * sizeof(float), NULL, &err); - directcl_check(err); - directcl_buffer_z = clCreateBuffer(directcl_context, - CL_MEM_WRITE_ONLY, - vector_size * sizeof(float), NULL, &err); - directcl_check(err); - - // set kernel args for exp - err = clSetKernelArg(directcl_exp_kernel, 0, - sizeof(cl_mem), &directcl_buffer_m); - directcl_check(err); - err = clSetKernelArg(directcl_exp_kernel, 1, - sizeof(cl_mem), &directcl_buffer_b); - directcl_check(err); - - // set kernel args for add - err = clSetKernelArg(directcl_add_kernel, 0, - sizeof(cl_mem), &directcl_buffer_n); - directcl_check(err); - err = clSetKernelArg(directcl_add_kernel, 1, - sizeof(cl_mem), &directcl_buffer_a); - directcl_check(err); - err = clSetKernelArg(directcl_add_kernel, 2, - sizeof(cl_mem), &directcl_buffer_m); - directcl_check(err); - - // set kernel args for dbl - err = clSetKernelArg(directcl_dbl_kernel, 0, - sizeof(cl_mem), &directcl_buffer_o); - directcl_check(err); - err = clSetKernelArg(directcl_dbl_kernel, 1, - sizeof(cl_mem), &directcl_buffer_c); - directcl_check(err); - - // set kernel args for mul - err = clSetKernelArg(directcl_mul_kernel, 0, - sizeof(cl_mem), &directcl_buffer_p); - directcl_check(err); - err = clSetKernelArg(directcl_mul_kernel, 1, - sizeof(cl_mem), &directcl_buffer_n); - directcl_check(err); - err = clSetKernelArg(directcl_mul_kernel, 2, - sizeof(cl_mem), &directcl_buffer_o); - directcl_check(err); - - // set kernel args for log - err = clSetKernelArg(directcl_log_kernel, 0, - sizeof(cl_mem), &directcl_buffer_z); - directcl_check(err); - err = clSetKernelArg(directcl_log_kernel, 1, - sizeof(cl_mem), &directcl_buffer_p); - directcl_check(err); - +static void directcl_initialize(size_t vector_size) { + cl_device_id device_id = directcl_choose_device(); + + cl_int err; + + // Create context + directcl_context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &err); + directcl_check(err); + + // Create command queue + directcl_command_queue = + clCreateCommandQueue(directcl_context, device_id, 0, &err); + directcl_check(err); + + // Create program + const char* gpu_code_ptr = gpu_code; + directcl_program = + clCreateProgramWithSource(directcl_context, 1, &gpu_code_ptr, NULL, &err); + directcl_check(err); + + // Build program + err = clBuildProgram(directcl_program, 1, &device_id, NULL, NULL, NULL); + + // Create kernels + directcl_log_kernel = clCreateKernel(directcl_program, "logn", &err); + directcl_check(err); + directcl_exp_kernel = clCreateKernel(directcl_program, "expn", &err); + directcl_check(err); + directcl_mul_kernel = clCreateKernel(directcl_program, "mul", &err); + directcl_check(err); + directcl_add_kernel = clCreateKernel(directcl_program, "add", &err); + directcl_check(err); + directcl_dbl_kernel = clCreateKernel(directcl_program, "dbl", &err); + directcl_check(err); + + // Create buffers + directcl_buffer_a = clCreateBuffer(directcl_context, CL_MEM_READ_ONLY, + vector_size * sizeof(float), NULL, &err); + directcl_check(err); + directcl_buffer_b = clCreateBuffer(directcl_context, CL_MEM_READ_ONLY, + vector_size * sizeof(float), NULL, &err); + directcl_check(err); + directcl_buffer_c = clCreateBuffer(directcl_context, CL_MEM_READ_ONLY, + vector_size * sizeof(float), NULL, &err); + directcl_check(err); + directcl_buffer_m = clCreateBuffer(directcl_context, CL_MEM_READ_WRITE, + vector_size * sizeof(float), NULL, &err); + directcl_check(err); + directcl_buffer_n = clCreateBuffer(directcl_context, CL_MEM_READ_WRITE, + vector_size * sizeof(float), NULL, &err); + directcl_check(err); + directcl_buffer_o = clCreateBuffer(directcl_context, CL_MEM_READ_WRITE, + vector_size * sizeof(float), NULL, &err); + directcl_check(err); + directcl_buffer_p = clCreateBuffer(directcl_context, CL_MEM_READ_WRITE, + vector_size * sizeof(float), NULL, &err); + directcl_check(err); + directcl_buffer_z = clCreateBuffer(directcl_context, CL_MEM_WRITE_ONLY, + vector_size * sizeof(float), NULL, &err); + directcl_check(err); + + // set kernel args for exp + err = clSetKernelArg(directcl_exp_kernel, 0, sizeof(cl_mem), + &directcl_buffer_m); + directcl_check(err); + err = clSetKernelArg(directcl_exp_kernel, 1, sizeof(cl_mem), + &directcl_buffer_b); + directcl_check(err); + + // set kernel args for add + err = clSetKernelArg(directcl_add_kernel, 0, sizeof(cl_mem), + &directcl_buffer_n); + directcl_check(err); + err = clSetKernelArg(directcl_add_kernel, 1, sizeof(cl_mem), + &directcl_buffer_a); + directcl_check(err); + err = clSetKernelArg(directcl_add_kernel, 2, sizeof(cl_mem), + &directcl_buffer_m); + directcl_check(err); + + // set kernel args for dbl + err = clSetKernelArg(directcl_dbl_kernel, 0, sizeof(cl_mem), + &directcl_buffer_o); + directcl_check(err); + err = clSetKernelArg(directcl_dbl_kernel, 1, sizeof(cl_mem), + &directcl_buffer_c); + directcl_check(err); + + // set kernel args for mul + err = clSetKernelArg(directcl_mul_kernel, 0, sizeof(cl_mem), + &directcl_buffer_p); + directcl_check(err); + err = clSetKernelArg(directcl_mul_kernel, 1, sizeof(cl_mem), + &directcl_buffer_n); + directcl_check(err); + err = clSetKernelArg(directcl_mul_kernel, 2, sizeof(cl_mem), + &directcl_buffer_o); + directcl_check(err); + + // set kernel args for log + err = clSetKernelArg(directcl_log_kernel, 0, sizeof(cl_mem), + &directcl_buffer_z); + directcl_check(err); + err = clSetKernelArg(directcl_log_kernel, 1, sizeof(cl_mem), + &directcl_buffer_p); + directcl_check(err); } - -static std::shared_ptr> -directcl_calculate(hpx::serialization::serialize_buffer a, - hpx::serialization::serialize_buffer b, - hpx::serialization::serialize_buffer c, - double* t_nonblock, - double* t_total) -{ - - // do nothing if matrices are wrong - if(a.size() != b.size() || b.size() != c.size()) - { - return std::shared_ptr>(); - } - - // initialize error test - cl_int err; - - // copy data to gpu - err = clEnqueueWriteBuffer(directcl_command_queue, directcl_buffer_a, - CL_FALSE, 0, a.size() * sizeof(float), - a.data(), 0, NULL, NULL); - directcl_check(err); - err = clEnqueueWriteBuffer(directcl_command_queue, directcl_buffer_b, - CL_FALSE, 0, a.size() * sizeof(float), - b.data(), 0, NULL, NULL); - directcl_check(err); - err = clEnqueueWriteBuffer(directcl_command_queue, directcl_buffer_c, - CL_FALSE, 0, a.size() * sizeof(float), - c.data(), 0, NULL, NULL); - directcl_check(err); - - - // wait for writes to finish - err = clFinish(directcl_command_queue); - directcl_check(err); - - // start timer - timer_start(); - - - // run kernels - size_t size = a.size(); - err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_exp_kernel, - 1, NULL, &size, NULL, 0, NULL, NULL); - directcl_check(err); - err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_add_kernel, - 1, NULL, &size, NULL, 0, NULL, NULL); - directcl_check(err); - err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_dbl_kernel, - 1, NULL, &size, NULL, 0, NULL, NULL); - directcl_check(err); - err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_mul_kernel, - 1, NULL, &size, NULL, 0, NULL, NULL); - directcl_check(err); - err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_log_kernel, - 1, NULL, &size, NULL, 0, NULL, NULL); - directcl_check(err); - - // get time of nonblocking calls - *t_nonblock = timer_stop(); - - // finish - err = clFinish(directcl_command_queue); - directcl_check(err); - - // get time of total calculation - *t_total = timer_stop(); - - // allocate the result buffer - std::shared_ptr> res(new std::vector(a.size())); - - // read into result buffer - err = clEnqueueReadBuffer(directcl_command_queue, directcl_buffer_z, - CL_FALSE, 0, a.size() * sizeof(float), - res->data(), 0, NULL, NULL); - directcl_check(err); - - // finish - err = clFinish(directcl_command_queue); - directcl_check(err); - - return res; +static std::shared_ptr> directcl_calculate( + hpx::serialization::serialize_buffer a, + hpx::serialization::serialize_buffer b, + hpx::serialization::serialize_buffer c, double* t_nonblock, + double* t_total) { + // do nothing if matrices are wrong + if (a.size() != b.size() || b.size() != c.size()) { + return std::shared_ptr>(); + } + + // initialize error test + cl_int err; + + // copy data to gpu + err = clEnqueueWriteBuffer(directcl_command_queue, directcl_buffer_a, + CL_FALSE, 0, a.size() * sizeof(float), a.data(), 0, + NULL, NULL); + directcl_check(err); + err = clEnqueueWriteBuffer(directcl_command_queue, directcl_buffer_b, + CL_FALSE, 0, a.size() * sizeof(float), b.data(), 0, + NULL, NULL); + directcl_check(err); + err = clEnqueueWriteBuffer(directcl_command_queue, directcl_buffer_c, + CL_FALSE, 0, a.size() * sizeof(float), c.data(), 0, + NULL, NULL); + directcl_check(err); + + // wait for writes to finish + err = clFinish(directcl_command_queue); + directcl_check(err); + + // start timer + timer_start(); + + // run kernels + size_t size = a.size(); + err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_exp_kernel, 1, + NULL, &size, NULL, 0, NULL, NULL); + directcl_check(err); + err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_add_kernel, 1, + NULL, &size, NULL, 0, NULL, NULL); + directcl_check(err); + err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_dbl_kernel, 1, + NULL, &size, NULL, 0, NULL, NULL); + directcl_check(err); + err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_mul_kernel, 1, + NULL, &size, NULL, 0, NULL, NULL); + directcl_check(err); + err = clEnqueueNDRangeKernel(directcl_command_queue, directcl_log_kernel, 1, + NULL, &size, NULL, 0, NULL, NULL); + directcl_check(err); + + // get time of nonblocking calls + *t_nonblock = timer_stop(); + + // finish + err = clFinish(directcl_command_queue); + directcl_check(err); + + // get time of total calculation + *t_total = timer_stop(); + + // allocate the result buffer + std::shared_ptr> res(new std::vector(a.size())); + + // read into result buffer + err = clEnqueueReadBuffer(directcl_command_queue, directcl_buffer_z, CL_FALSE, + 0, a.size() * sizeof(float), res->data(), 0, NULL, + NULL); + directcl_check(err); + + // finish + err = clFinish(directcl_command_queue); + directcl_check(err); + + return res; } - -static void directcl_shutdown() -{ - cl_int err; - - // release buffers - err = clReleaseMemObject(directcl_buffer_a); - directcl_check(err); - err = clReleaseMemObject(directcl_buffer_b); - directcl_check(err); - err = clReleaseMemObject(directcl_buffer_c); - directcl_check(err); - err = clReleaseMemObject(directcl_buffer_m); - directcl_check(err); - err = clReleaseMemObject(directcl_buffer_n); - directcl_check(err); - err = clReleaseMemObject(directcl_buffer_o); - directcl_check(err); - err = clReleaseMemObject(directcl_buffer_p); - directcl_check(err); - err = clReleaseMemObject(directcl_buffer_z); - directcl_check(err); - - // release kernels - err = clReleaseKernel(directcl_dbl_kernel); - directcl_check(err); - err = clReleaseKernel(directcl_add_kernel); - directcl_check(err); - err = clReleaseKernel(directcl_mul_kernel); - directcl_check(err); - err = clReleaseKernel(directcl_exp_kernel); - directcl_check(err); - err = clReleaseKernel(directcl_log_kernel); - directcl_check(err); - - // release program - err = clReleaseProgram(directcl_program); - directcl_check(err); - - // release command queue - err = clReleaseCommandQueue(directcl_command_queue); - directcl_check(err); - - // release context - err = clReleaseContext(directcl_context); - directcl_check(err); - +static void directcl_shutdown() { + cl_int err; + + // release buffers + err = clReleaseMemObject(directcl_buffer_a); + directcl_check(err); + err = clReleaseMemObject(directcl_buffer_b); + directcl_check(err); + err = clReleaseMemObject(directcl_buffer_c); + directcl_check(err); + err = clReleaseMemObject(directcl_buffer_m); + directcl_check(err); + err = clReleaseMemObject(directcl_buffer_n); + directcl_check(err); + err = clReleaseMemObject(directcl_buffer_o); + directcl_check(err); + err = clReleaseMemObject(directcl_buffer_p); + directcl_check(err); + err = clReleaseMemObject(directcl_buffer_z); + directcl_check(err); + + // release kernels + err = clReleaseKernel(directcl_dbl_kernel); + directcl_check(err); + err = clReleaseKernel(directcl_add_kernel); + directcl_check(err); + err = clReleaseKernel(directcl_mul_kernel); + directcl_check(err); + err = clReleaseKernel(directcl_exp_kernel); + directcl_check(err); + err = clReleaseKernel(directcl_log_kernel); + directcl_check(err); + + // release program + err = clReleaseProgram(directcl_program); + directcl_check(err); + + // release command queue + err = clReleaseCommandQueue(directcl_command_queue); + directcl_check(err); + + // release context + err = clReleaseContext(directcl_context); + directcl_check(err); } - - - - - - - -#endif //BENCHMARK_DIRECTCL_HPP_ - +#endif // BENCHMARK_DIRECTCL_HPP_ diff --git a/examples/opencl/benchmark_vector/gpu_code.hpp b/examples/opencl/benchmark_vector/gpu_code.hpp index 94965ad7..7b8d4b02 100644 --- a/examples/opencl/benchmark_vector/gpu_code.hpp +++ b/examples/opencl/benchmark_vector/gpu_code.hpp @@ -6,41 +6,39 @@ #ifndef BENCHMARK_GPU_CODE_H_ #define BENCHMARK_GPU_CODE_H_ - static const char gpu_code[] = -" \n" -" __kernel void logn(__global float* out,__global float* in) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = log(in[tid]); \n" -" } \n" -" \n" -" __kernel void expn(__global float* out,__global float* in) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = exp(in[tid]); \n" -" } \n" -" \n" -" __kernel void add(__global float* out,__global float* in1, \n" -" __global float* in2) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = in1[tid] + in2[tid]; \n" -" } \n" -" \n" -" __kernel void dbl(__global float* out,__global float* in) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = 2 * in[tid]; \n" -" } \n" -" \n" -" __kernel void mul(__global float* out,__global float* in1, \n" -" __global float* in2) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = in1[tid] * in2[tid]; \n" -" } \n" -" \n"; - + " \n" + " __kernel void logn(__global float* out,__global float* in) \n" + " { \n" + " size_t tid = get_global_id(0); \n" + " out[tid] = log(in[tid]); \n" + " } \n" + " \n" + " __kernel void expn(__global float* out,__global float* in) \n" + " { \n" + " size_t tid = get_global_id(0); \n" + " out[tid] = exp(in[tid]); \n" + " } \n" + " \n" + " __kernel void add(__global float* out,__global float* in1, \n" + " __global float* in2) \n" + " { \n" + " size_t tid = get_global_id(0); \n" + " out[tid] = in1[tid] + in2[tid]; \n" + " } \n" + " \n" + " __kernel void dbl(__global float* out,__global float* in) \n" + " { \n" + " size_t tid = get_global_id(0); \n" + " out[tid] = 2 * in[tid]; \n" + " } \n" + " \n" + " __kernel void mul(__global float* out,__global float* in1, \n" + " __global float* in2) \n" + " { \n" + " size_t tid = get_global_id(0); \n" + " out[tid] = in1[tid] * in2[tid]; \n" + " } \n" + " \n"; -#endif //BENCHMARK_GPU_CODE_H_ +#endif // BENCHMARK_GPU_CODE_H_ diff --git a/examples/opencl/benchmark_vector/hpx_helpers.hpp b/examples/opencl/benchmark_vector/hpx_helpers.hpp index 257288ad..11b70f7f 100644 --- a/examples/opencl/benchmark_vector/hpx_helpers.hpp +++ b/examples/opencl/benchmark_vector/hpx_helpers.hpp @@ -8,30 +8,16 @@ #include -static hpx::naming::id_type hpx_get_remote_node() -{ +static hpx::naming::id_type hpx_get_remote_node() { + // Get all HPX localities + std::vector localities = hpx::find_remote_localities(); - // Get all HPX localities - std::vector localities = - hpx::find_remote_localities(); - - if(localities.empty()){ - hpx::cout << "ERROR: No remote node found!" << hpx::endl; - return hpx::naming::id_type(); - } - - return localities[0]; + if (localities.empty()) { + hpx::cout << "ERROR: No remote node found!" << hpx::endl; + return hpx::naming::id_type(); + } + return localities[0]; } - - - - - - - - - -#endif //BENCHMARK_HPX_HELPERS_HPP_ - +#endif // BENCHMARK_HPX_HELPERS_HPP_ diff --git a/examples/opencl/benchmark_vector/hpxcl_single.hpp b/examples/opencl/benchmark_vector/hpxcl_single.hpp index 0b7e625b..8a0c5053 100644 --- a/examples/opencl/benchmark_vector/hpxcl_single.hpp +++ b/examples/opencl/benchmark_vector/hpxcl_single.hpp @@ -12,293 +12,244 @@ using namespace hpx::opencl; using hpx::lcos::shared_future; -static device hpxcl_single_device; -static buffer hpxcl_single_buffer_a; -static buffer hpxcl_single_buffer_b; -static buffer hpxcl_single_buffer_c; -static buffer hpxcl_single_buffer_m; -static buffer hpxcl_single_buffer_n; -static buffer hpxcl_single_buffer_o; -static buffer hpxcl_single_buffer_p; -static buffer hpxcl_single_buffer_z; -static program hpxcl_single_program; -static kernel hpxcl_single_log_kernel; -static kernel hpxcl_single_exp_kernel; -static kernel hpxcl_single_mul_kernel; -static kernel hpxcl_single_add_kernel; -static kernel hpxcl_single_dbl_kernel; - - -static void hpxcl_single_initialize( hpx::naming::id_type node_id, - size_t vector_size) -{ - - // Query all devices on local node - std::vector devices = create_devices( node_id, - CL_DEVICE_TYPE_GPU, - "OpenCL 1.1" ).get(); - -/* - // print devices - hpx::cout << "Devices:" << hpx::endl; - for(cl_uint i = 0; i < devices.size(); i++) - { - - device cldevice = devices[i]; - - // Query name - std::string device_name = - cldevice.get_device_info().get(); - std::string device_vendor = - cldevice.get_device_info().get(); - - hpx::cout << i << ": " << device_name << " (" << device_vendor << ")" - << hpx::endl; - - } - - // Lets you choose a device - size_t device_num; - hpx::cout << "Choose device: " << hpx::endl; - std::cin >> device_num; - if(device_num >= devices.size()) - exit(0); - - // Select a device - hpxcl_single_device = devices[device_num]; -*/ - - size_t device_id = 0; - // print device - hpx::cout << "Device:" << hpx::endl; - { - - device cldevice = devices[device_id]; - - // Query name - std::string device_name = - cldevice.get_device_info().get(); - std::string device_vendor = - cldevice.get_device_info().get(); - - hpx::cout << " " << device_name << " (" << device_vendor << ")" - << hpx::endl; - - } - - // Select a device - hpxcl_single_device = devices[device_id]; - - // Create program - typedef hpx::serialization::serialize_buffer buffer_type; - buffer_type gpu_code_buffer( gpu_code, sizeof(gpu_code), - buffer_type::init_mode::reference ); - hpxcl_single_program = - hpxcl_single_device.create_program_with_source( gpu_code_buffer ); - - // Build program - hpxcl_single_program.build(); - - // Create kernels - hpxcl_single_log_kernel = hpxcl_single_program.create_kernel("logn"); - hpxcl_single_exp_kernel = hpxcl_single_program.create_kernel("expn"); - hpxcl_single_mul_kernel = hpxcl_single_program.create_kernel("mul"); - hpxcl_single_add_kernel = hpxcl_single_program.create_kernel("add"); - hpxcl_single_dbl_kernel = hpxcl_single_program.create_kernel("dbl"); - - // Generate buffers - hpxcl_single_buffer_a = hpxcl_single_device.create_buffer( - CL_MEM_READ_ONLY, - vector_size * sizeof(float)); - hpxcl_single_buffer_b = hpxcl_single_device.create_buffer( - CL_MEM_READ_ONLY, - vector_size * sizeof(float)); - hpxcl_single_buffer_c = hpxcl_single_device.create_buffer( - CL_MEM_READ_ONLY, - vector_size * sizeof(float)); - hpxcl_single_buffer_m = hpxcl_single_device.create_buffer( - CL_MEM_READ_WRITE, - vector_size * sizeof(float)); - hpxcl_single_buffer_n = hpxcl_single_device.create_buffer( - CL_MEM_READ_WRITE, - vector_size * sizeof(float)); - hpxcl_single_buffer_o = hpxcl_single_device.create_buffer( - CL_MEM_READ_WRITE, - vector_size * sizeof(float)); - hpxcl_single_buffer_p = hpxcl_single_device.create_buffer( - CL_MEM_READ_WRITE, - vector_size * sizeof(float)); - hpxcl_single_buffer_z = hpxcl_single_device.create_buffer( - CL_MEM_WRITE_ONLY, - vector_size * sizeof(float)); - - // Initialize a list of future events for asynchronous set_arg calls - std::vector> set_arg_futures; - - // set kernel args for exp - set_arg_futures.push_back( - hpxcl_single_exp_kernel.set_arg_async(0, hpxcl_single_buffer_m)); - set_arg_futures.push_back( - hpxcl_single_exp_kernel.set_arg_async(1, hpxcl_single_buffer_b)); - - // set kernel args for add - set_arg_futures.push_back( - hpxcl_single_add_kernel.set_arg_async(0, hpxcl_single_buffer_n)); - set_arg_futures.push_back( - hpxcl_single_add_kernel.set_arg_async(1, hpxcl_single_buffer_a)); - set_arg_futures.push_back( - hpxcl_single_add_kernel.set_arg_async(2, hpxcl_single_buffer_m)); - - // set kernel args for dbl - set_arg_futures.push_back( - hpxcl_single_dbl_kernel.set_arg_async(0, hpxcl_single_buffer_o)); - set_arg_futures.push_back( - hpxcl_single_dbl_kernel.set_arg_async(1, hpxcl_single_buffer_c)); - - // set kernel args for mul - set_arg_futures.push_back( - hpxcl_single_mul_kernel.set_arg_async(0, hpxcl_single_buffer_p)); - set_arg_futures.push_back( - hpxcl_single_mul_kernel.set_arg_async(1, hpxcl_single_buffer_n)); - set_arg_futures.push_back( - hpxcl_single_mul_kernel.set_arg_async(2, hpxcl_single_buffer_o)); - - // set kernel args for log - set_arg_futures.push_back( - hpxcl_single_log_kernel.set_arg_async(0, hpxcl_single_buffer_z)); - set_arg_futures.push_back( - hpxcl_single_log_kernel.set_arg_async(1, hpxcl_single_buffer_p)); - - // wait for function calls to trigger - hpx::wait_all( set_arg_futures ); - - -} - -static hpx::serialization::serialize_buffer -hpxcl_single_calculate(hpx::serialization::serialize_buffer a, - hpx::serialization::serialize_buffer b, - hpx::serialization::serialize_buffer c, - double* t_nonblock, - double* t_finish) -{ - // do nothing if matrices are wrong - if(a.size() != b.size() || b.size() != c.size()) - { - return hpx::serialization::serialize_buffer(); - } - - size_t size = a.size(); - - // copy data to gpu - auto write_a_event = hpxcl_single_buffer_a.enqueue_write( 0, a ); - auto write_b_event = hpxcl_single_buffer_b.enqueue_write( 0, b ); - auto write_c_event = hpxcl_single_buffer_c.enqueue_write( 0, c ); - - // wait for write to finish - write_a_event.wait(); - write_b_event.wait(); - write_c_event.wait(); - - // start time measurement - timer_start(); - - // set work dimensions - work_size<1> dim; - dim[0].offset = 0; - dim[0].size = size; - - // run exp kernel - auto kernel_exp_event = hpxcl_single_exp_kernel.enqueue(dim, write_b_event); - - // run add kernel - auto kernel_add_event = hpxcl_single_add_kernel.enqueue( dim, - kernel_exp_event, - write_a_event ); - - // run dbl kernel - auto kernel_dbl_event = hpxcl_single_dbl_kernel.enqueue( dim, - write_c_event ); - - // run mul kernel - auto kernel_mul_event = hpxcl_single_mul_kernel.enqueue( dim, - kernel_add_event, - kernel_dbl_event ); - - // run log kernel - auto kernel_log_event = hpxcl_single_log_kernel.enqueue( dim, - kernel_mul_event); - - ////////// UNTIL HERE ALL CALLS WERE NON-BLOCKING ///////////////////////// - - // get time of non-blocking calls - *t_nonblock = timer_stop(); - - // wait for the end of the execution - kernel_log_event.wait(); - - // get total time of execution - *t_finish = timer_stop(); - - // enqueue result read - typedef hpx::serialization::serialize_buffer buffer_type; - buffer_type result_buffer ( size ); - auto read_event = - hpxcl_single_buffer_z.enqueue_read( 0, result_buffer, - kernel_log_event ); - - // wait for calculation to complete and return data - return read_event.get(); - +static device hpxcl_single_device; +static buffer hpxcl_single_buffer_a; +static buffer hpxcl_single_buffer_b; +static buffer hpxcl_single_buffer_c; +static buffer hpxcl_single_buffer_m; +static buffer hpxcl_single_buffer_n; +static buffer hpxcl_single_buffer_o; +static buffer hpxcl_single_buffer_p; +static buffer hpxcl_single_buffer_z; +static program hpxcl_single_program; +static kernel hpxcl_single_log_kernel; +static kernel hpxcl_single_exp_kernel; +static kernel hpxcl_single_mul_kernel; +static kernel hpxcl_single_add_kernel; +static kernel hpxcl_single_dbl_kernel; + +static void hpxcl_single_initialize(hpx::naming::id_type node_id, + size_t vector_size) { + // Query all devices on local node + std::vector devices = + create_devices(node_id, CL_DEVICE_TYPE_GPU, "OpenCL 1.1").get(); + + /* + // print devices + hpx::cout << "Devices:" << hpx::endl; + for(cl_uint i = 0; i < devices.size(); i++) + { + + device cldevice = devices[i]; + + // Query name + std::string device_name = + cldevice.get_device_info().get(); + std::string device_vendor = + cldevice.get_device_info().get(); + + hpx::cout << i << ": " << device_name << " (" << device_vendor << ")" + << hpx::endl; + + } + + // Lets you choose a device + size_t device_num; + hpx::cout << "Choose device: " << hpx::endl; + std::cin >> device_num; + if(device_num >= devices.size()) + exit(0); + + // Select a device + hpxcl_single_device = devices[device_num]; + */ + + size_t device_id = 0; + // print device + hpx::cout << "Device:" << hpx::endl; + { + device cldevice = devices[device_id]; + + // Query name + std::string device_name = cldevice.get_device_info().get(); + std::string device_vendor = + cldevice.get_device_info().get(); + + hpx::cout << " " << device_name << " (" << device_vendor << ")" + << hpx::endl; + } + + // Select a device + hpxcl_single_device = devices[device_id]; + + // Create program + typedef hpx::serialization::serialize_buffer buffer_type; + buffer_type gpu_code_buffer(gpu_code, sizeof(gpu_code), + buffer_type::init_mode::reference); + hpxcl_single_program = + hpxcl_single_device.create_program_with_source(gpu_code_buffer); + + // Build program + hpxcl_single_program.build(); + + // Create kernels + hpxcl_single_log_kernel = hpxcl_single_program.create_kernel("logn"); + hpxcl_single_exp_kernel = hpxcl_single_program.create_kernel("expn"); + hpxcl_single_mul_kernel = hpxcl_single_program.create_kernel("mul"); + hpxcl_single_add_kernel = hpxcl_single_program.create_kernel("add"); + hpxcl_single_dbl_kernel = hpxcl_single_program.create_kernel("dbl"); + + // Generate buffers + hpxcl_single_buffer_a = hpxcl_single_device.create_buffer( + CL_MEM_READ_ONLY, vector_size * sizeof(float)); + hpxcl_single_buffer_b = hpxcl_single_device.create_buffer( + CL_MEM_READ_ONLY, vector_size * sizeof(float)); + hpxcl_single_buffer_c = hpxcl_single_device.create_buffer( + CL_MEM_READ_ONLY, vector_size * sizeof(float)); + hpxcl_single_buffer_m = hpxcl_single_device.create_buffer( + CL_MEM_READ_WRITE, vector_size * sizeof(float)); + hpxcl_single_buffer_n = hpxcl_single_device.create_buffer( + CL_MEM_READ_WRITE, vector_size * sizeof(float)); + hpxcl_single_buffer_o = hpxcl_single_device.create_buffer( + CL_MEM_READ_WRITE, vector_size * sizeof(float)); + hpxcl_single_buffer_p = hpxcl_single_device.create_buffer( + CL_MEM_READ_WRITE, vector_size * sizeof(float)); + hpxcl_single_buffer_z = hpxcl_single_device.create_buffer( + CL_MEM_WRITE_ONLY, vector_size * sizeof(float)); + + // Initialize a list of future events for asynchronous set_arg calls + std::vector> set_arg_futures; + + // set kernel args for exp + set_arg_futures.push_back( + hpxcl_single_exp_kernel.set_arg_async(0, hpxcl_single_buffer_m)); + set_arg_futures.push_back( + hpxcl_single_exp_kernel.set_arg_async(1, hpxcl_single_buffer_b)); + + // set kernel args for add + set_arg_futures.push_back( + hpxcl_single_add_kernel.set_arg_async(0, hpxcl_single_buffer_n)); + set_arg_futures.push_back( + hpxcl_single_add_kernel.set_arg_async(1, hpxcl_single_buffer_a)); + set_arg_futures.push_back( + hpxcl_single_add_kernel.set_arg_async(2, hpxcl_single_buffer_m)); + + // set kernel args for dbl + set_arg_futures.push_back( + hpxcl_single_dbl_kernel.set_arg_async(0, hpxcl_single_buffer_o)); + set_arg_futures.push_back( + hpxcl_single_dbl_kernel.set_arg_async(1, hpxcl_single_buffer_c)); + + // set kernel args for mul + set_arg_futures.push_back( + hpxcl_single_mul_kernel.set_arg_async(0, hpxcl_single_buffer_p)); + set_arg_futures.push_back( + hpxcl_single_mul_kernel.set_arg_async(1, hpxcl_single_buffer_n)); + set_arg_futures.push_back( + hpxcl_single_mul_kernel.set_arg_async(2, hpxcl_single_buffer_o)); + + // set kernel args for log + set_arg_futures.push_back( + hpxcl_single_log_kernel.set_arg_async(0, hpxcl_single_buffer_z)); + set_arg_futures.push_back( + hpxcl_single_log_kernel.set_arg_async(1, hpxcl_single_buffer_p)); + + // wait for function calls to trigger + hpx::wait_all(set_arg_futures); } -static void hpxcl_single_shutdown() -{ - - // release buffers - hpxcl_single_buffer_a = buffer(); - hpxcl_single_buffer_b = buffer(); - hpxcl_single_buffer_c = buffer(); - hpxcl_single_buffer_m = buffer(); - hpxcl_single_buffer_n = buffer(); - hpxcl_single_buffer_o = buffer(); - hpxcl_single_buffer_p = buffer(); - hpxcl_single_buffer_z = buffer(); - - // release kernels - hpxcl_single_dbl_kernel = kernel(); - hpxcl_single_add_kernel = kernel(); - hpxcl_single_mul_kernel = kernel(); - hpxcl_single_exp_kernel = kernel(); - hpxcl_single_log_kernel = kernel(); - - // release program - hpxcl_single_program = program(); - - // delete device - hpxcl_single_device = device(); - -} - - - - +static hpx::serialization::serialize_buffer hpxcl_single_calculate( + hpx::serialization::serialize_buffer a, + hpx::serialization::serialize_buffer b, + hpx::serialization::serialize_buffer c, double* t_nonblock, + double* t_finish) { + // do nothing if matrices are wrong + if (a.size() != b.size() || b.size() != c.size()) { + return hpx::serialization::serialize_buffer(); + } + size_t size = a.size(); + // copy data to gpu + auto write_a_event = hpxcl_single_buffer_a.enqueue_write(0, a); + auto write_b_event = hpxcl_single_buffer_b.enqueue_write(0, b); + auto write_c_event = hpxcl_single_buffer_c.enqueue_write(0, c); + // wait for write to finish + write_a_event.wait(); + write_b_event.wait(); + write_c_event.wait(); + // start time measurement + timer_start(); + // set work dimensions + work_size<1> dim; + dim[0].offset = 0; + dim[0].size = size; + // run exp kernel + auto kernel_exp_event = hpxcl_single_exp_kernel.enqueue(dim, write_b_event); + // run add kernel + auto kernel_add_event = + hpxcl_single_add_kernel.enqueue(dim, kernel_exp_event, write_a_event); + // run dbl kernel + auto kernel_dbl_event = hpxcl_single_dbl_kernel.enqueue(dim, write_c_event); + // run mul kernel + auto kernel_mul_event = + hpxcl_single_mul_kernel.enqueue(dim, kernel_add_event, kernel_dbl_event); + // run log kernel + auto kernel_log_event = + hpxcl_single_log_kernel.enqueue(dim, kernel_mul_event); + ////////// UNTIL HERE ALL CALLS WERE NON-BLOCKING ///////////////////////// + // get time of non-blocking calls + *t_nonblock = timer_stop(); + // wait for the end of the execution + kernel_log_event.wait(); + // get total time of execution + *t_finish = timer_stop(); + // enqueue result read + typedef hpx::serialization::serialize_buffer buffer_type; + buffer_type result_buffer(size); + auto read_event = + hpxcl_single_buffer_z.enqueue_read(0, result_buffer, kernel_log_event); + // wait for calculation to complete and return data + return read_event.get(); +} -#endif //BENCHMARK_HPXCL_SINGLE_HPP_ +static void hpxcl_single_shutdown() { + // release buffers + hpxcl_single_buffer_a = buffer(); + hpxcl_single_buffer_b = buffer(); + hpxcl_single_buffer_c = buffer(); + hpxcl_single_buffer_m = buffer(); + hpxcl_single_buffer_n = buffer(); + hpxcl_single_buffer_o = buffer(); + hpxcl_single_buffer_p = buffer(); + hpxcl_single_buffer_z = buffer(); + + // release kernels + hpxcl_single_dbl_kernel = kernel(); + hpxcl_single_add_kernel = kernel(); + hpxcl_single_mul_kernel = kernel(); + hpxcl_single_exp_kernel = kernel(); + hpxcl_single_log_kernel = kernel(); + + // release program + hpxcl_single_program = program(); + + // delete device + hpxcl_single_device = device(); +} +#endif // BENCHMARK_HPXCL_SINGLE_HPP_ diff --git a/examples/opencl/benchmark_vector/main.cpp b/examples/opencl/benchmark_vector/main.cpp index 7f4b2109..6f0b67ff 100644 --- a/examples/opencl/benchmark_vector/main.cpp +++ b/examples/opencl/benchmark_vector/main.cpp @@ -14,178 +14,157 @@ #include -int main(int argc, char* argv[]) -{ - - // Print help message on wrong argument count - if(argc < 2) - { - hpx::cerr << "Usage: " << argv[0] << " matrixsize" << hpx::endl; - return hpx::finalize(); - } - - - { - - //////////////////////////////////////////// - // Initializes all matrices - // - size_t vector_size = std::stoul(argv[1]); - hpx::cout << "Vector size: " << vector_size << std::endl; - - hpx::cout << "Generating matrix A ..." << hpx::endl; - auto a = generate_input_matrix(vector_size); - hpx::cout << "Generating matrix B ..." << hpx::endl; - auto b = generate_input_matrix(vector_size); - hpx::cout << "Generating matrix C ..." << hpx::endl; - auto c = generate_input_matrix(vector_size); - - hpx::cout << "Calculating reference result on CPU ..." << hpx::endl; - double time_cpu; - auto z = calculate_result(a,b,c,&time_cpu); - hpx::cout << " ... " << time_cpu << " ms" << hpx::endl; - - - //////////////////////////////////////////// - // Direct OpenCL calculation - // - hpx::cout << hpx::endl; - hpx::cout << "///////////////////////////////////////" << hpx::endl; - hpx::cout << "// Direct OpenCL" << hpx::endl; - hpx::cout << "//" << hpx::endl; - - // initializes - hpx::cout << "Initializing ..." << hpx::endl; - directcl_initialize(vector_size); - - // main calculation with benchmark - double time_directcl_nonblock; - double time_directcl_total; - hpx::cout << "Running calculation ..." << hpx::endl; - std::shared_ptr> z_directcl = - directcl_calculate(a, b, c, - &time_directcl_nonblock, - &time_directcl_total); - - // shuts down - hpx::cout << "Shutting down ..." << hpx::endl; - directcl_shutdown(); - - // checks for correct result - check_for_correct_result(z_directcl->data(), (*z_directcl).size(), - z.data(), z.size()); - - // Prints the benchmark statistics - hpx::cout << hpx::endl; - hpx::cout << " Nonblocking calls: " << time_directcl_nonblock - << " ms" << hpx::endl; - hpx::cout << " Total Calculation Time: " << time_directcl_total - << " ms" << hpx::endl; - hpx::cout << hpx::endl; - - - //////////////////////////////////////////// - // HPXCL local calculation - // - hpx::cout << hpx::endl; - hpx::cout << "///////////////////////////////////////" << hpx::endl; - hpx::cout << "// HPXCL local" << hpx::endl; - hpx::cout << "//" << hpx::endl; - - // initializes - hpx::cout << "Initializing ..." << hpx::endl; - hpxcl_single_initialize(hpx::find_here(), vector_size); - - // main calculation with benchmark - hpx::cout << "Running calculation ..." << hpx::endl; - double time_hpxcl_local_nonblock; - double time_hpxcl_local_total; - auto z_hpxcl_local = hpxcl_single_calculate(a, b, c, - &time_hpxcl_local_nonblock, - &time_hpxcl_local_total); - - // shuts down - hpx::cout << "Shutting down ..." << hpx::endl; - hpxcl_single_shutdown(); - - // checks for correct result - check_for_correct_result( z_hpxcl_local.data(), - z_hpxcl_local.size(), - z.data(), z.size()); - - // Prints the benchmark statistics - hpx::cout << hpx::endl; - hpx::cout << " Nonblocking calls: " << time_hpxcl_local_nonblock - << " ms" << hpx::endl; - hpx::cout << " Total Calculation Time: " << time_hpxcl_local_total - << " ms" << hpx::endl; - hpx::cout << hpx::endl; - - - - //////////////////////////////////////////// - // HPXCL remote calculation - // - hpx::cout << hpx::endl; - hpx::cout << "///////////////////////////////////////" << hpx::endl; - hpx::cout << "// HPXCL remote" << hpx::endl; - hpx::cout << "//" << hpx::endl; - - // initializes - hpx::cout << "Initializing ..." << hpx::endl; - hpx::id_type remote_node = hpx_get_remote_node(); - if(remote_node){ - hpxcl_single_initialize(remote_node, vector_size); - - // main calculation with benchmark - hpx::cout << "Running calculation ..." << hpx::endl; - double time_hpxcl_remote_nonblock; - double time_hpxcl_remote_total; - auto z_hpxcl_remote = hpxcl_single_calculate(a, b, c, - &time_hpxcl_remote_nonblock, - &time_hpxcl_remote_total); - - // shuts down - hpx::cout << "Shutting down ..." << hpx::endl; - hpxcl_single_shutdown(); - - // checks for correct result - check_for_correct_result( z_hpxcl_remote.data(), - z_hpxcl_remote.size(), - z.data(), z.size()); - - // Prints the benchmark statistics - hpx::cout << hpx::endl; - hpx::cout << " Nonblocking calls: " << time_hpxcl_remote_nonblock - << " ms" << hpx::endl; - hpx::cout << " Total Calculation Time: " << time_hpxcl_remote_total - << " ms" << hpx::endl; - hpx::cout << hpx::endl; - } - - - //////////////////////////////////////////// - // HPXCL distributed calculation - // - hpx::cout << hpx::endl; - //hpx::cout << "///////////////////////////////////////" << hpx::endl; - //hpx::cout << "// HPXCL distributed" << hpx::endl; - //hpx::cout << "//" << hpx::endl; - - - - /////////////////////////////////////////// - // Shutdown - // - hpx::cout << hpx::endl; - hpx::cout << "Shutting down hpx ... " << hpx::endl; - - } - - hpx::cout << "Program finished." << hpx::endl; - - // End the program +int main(int argc, char* argv[]) { + // Print help message on wrong argument count + if (argc < 2) { + hpx::cerr << "Usage: " << argv[0] << " matrixsize" << hpx::endl; return hpx::finalize(); + } + + { + //////////////////////////////////////////// + // Initializes all matrices + // + size_t vector_size = std::stoul(argv[1]); + hpx::cout << "Vector size: " << vector_size << std::endl; + + hpx::cout << "Generating matrix A ..." << hpx::endl; + auto a = generate_input_matrix(vector_size); + hpx::cout << "Generating matrix B ..." << hpx::endl; + auto b = generate_input_matrix(vector_size); + hpx::cout << "Generating matrix C ..." << hpx::endl; + auto c = generate_input_matrix(vector_size); + + hpx::cout << "Calculating reference result on CPU ..." << hpx::endl; + double time_cpu; + auto z = calculate_result(a, b, c, &time_cpu); + hpx::cout << " ... " << time_cpu << " ms" << hpx::endl; + + //////////////////////////////////////////// + // Direct OpenCL calculation + // + hpx::cout << hpx::endl; + hpx::cout << "///////////////////////////////////////" << hpx::endl; + hpx::cout << "// Direct OpenCL" << hpx::endl; + hpx::cout << "//" << hpx::endl; + + // initializes + hpx::cout << "Initializing ..." << hpx::endl; + directcl_initialize(vector_size); + + // main calculation with benchmark + double time_directcl_nonblock; + double time_directcl_total; + hpx::cout << "Running calculation ..." << hpx::endl; + std::shared_ptr> z_directcl = directcl_calculate( + a, b, c, &time_directcl_nonblock, &time_directcl_total); + + // shuts down + hpx::cout << "Shutting down ..." << hpx::endl; + directcl_shutdown(); + + // checks for correct result + check_for_correct_result(z_directcl->data(), (*z_directcl).size(), z.data(), + z.size()); + + // Prints the benchmark statistics + hpx::cout << hpx::endl; + hpx::cout << " Nonblocking calls: " << time_directcl_nonblock + << " ms" << hpx::endl; + hpx::cout << " Total Calculation Time: " << time_directcl_total << " ms" + << hpx::endl; + hpx::cout << hpx::endl; + + //////////////////////////////////////////// + // HPXCL local calculation + // + hpx::cout << hpx::endl; + hpx::cout << "///////////////////////////////////////" << hpx::endl; + hpx::cout << "// HPXCL local" << hpx::endl; + hpx::cout << "//" << hpx::endl; + + // initializes + hpx::cout << "Initializing ..." << hpx::endl; + hpxcl_single_initialize(hpx::find_here(), vector_size); + + // main calculation with benchmark + hpx::cout << "Running calculation ..." << hpx::endl; + double time_hpxcl_local_nonblock; + double time_hpxcl_local_total; + auto z_hpxcl_local = hpxcl_single_calculate( + a, b, c, &time_hpxcl_local_nonblock, &time_hpxcl_local_total); + + // shuts down + hpx::cout << "Shutting down ..." << hpx::endl; + hpxcl_single_shutdown(); + + // checks for correct result + check_for_correct_result(z_hpxcl_local.data(), z_hpxcl_local.size(), + z.data(), z.size()); + + // Prints the benchmark statistics + hpx::cout << hpx::endl; + hpx::cout << " Nonblocking calls: " << time_hpxcl_local_nonblock + << " ms" << hpx::endl; + hpx::cout << " Total Calculation Time: " << time_hpxcl_local_total + << " ms" << hpx::endl; + hpx::cout << hpx::endl; + + //////////////////////////////////////////// + // HPXCL remote calculation + // + hpx::cout << hpx::endl; + hpx::cout << "///////////////////////////////////////" << hpx::endl; + hpx::cout << "// HPXCL remote" << hpx::endl; + hpx::cout << "//" << hpx::endl; + + // initializes + hpx::cout << "Initializing ..." << hpx::endl; + hpx::id_type remote_node = hpx_get_remote_node(); + if (remote_node) { + hpxcl_single_initialize(remote_node, vector_size); + + // main calculation with benchmark + hpx::cout << "Running calculation ..." << hpx::endl; + double time_hpxcl_remote_nonblock; + double time_hpxcl_remote_total; + auto z_hpxcl_remote = hpxcl_single_calculate( + a, b, c, &time_hpxcl_remote_nonblock, &time_hpxcl_remote_total); + + // shuts down + hpx::cout << "Shutting down ..." << hpx::endl; + hpxcl_single_shutdown(); + + // checks for correct result + check_for_correct_result(z_hpxcl_remote.data(), z_hpxcl_remote.size(), + z.data(), z.size()); + + // Prints the benchmark statistics + hpx::cout << hpx::endl; + hpx::cout << " Nonblocking calls: " << time_hpxcl_remote_nonblock + << " ms" << hpx::endl; + hpx::cout << " Total Calculation Time: " << time_hpxcl_remote_total + << " ms" << hpx::endl; + hpx::cout << hpx::endl; + } + //////////////////////////////////////////// + // HPXCL distributed calculation + // + hpx::cout << hpx::endl; + // hpx::cout << "///////////////////////////////////////" << hpx::endl; + // hpx::cout << "// HPXCL distributed" << hpx::endl; + // hpx::cout << "//" << hpx::endl; + + /////////////////////////////////////////// + // Shutdown + // + hpx::cout << hpx::endl; + hpx::cout << "Shutting down hpx ... " << hpx::endl; + } + + hpx::cout << "Program finished." << hpx::endl; + + // End the program + return hpx::finalize(); } - diff --git a/examples/opencl/benchmark_vector/matrix_generators.hpp b/examples/opencl/benchmark_vector/matrix_generators.hpp index 204e7bde..e87d7444 100644 --- a/examples/opencl/benchmark_vector/matrix_generators.hpp +++ b/examples/opencl/benchmark_vector/matrix_generators.hpp @@ -17,106 +17,83 @@ static bool random_initialized = false; // generates a random float array of given size. // generated values will be >= 0.5f and <= 1.5f -static -hpx::serialization::serialize_buffer -generate_input_matrix(size_t size) -{ - - // initialize random generator if necessary - if(!random_initialized) - { - std::srand((unsigned)std::time(nullptr)); - random_initialized = true; - } - - // allocate output matrix - typedef hpx::serialization::serialize_buffer buffer_type; - buffer_type ret( size ); - - // fill output matrix - for(size_t i = 0; i < size; i++) - { - ret[i] = 0.5f * ((float) rand()) / (float) RAND_MAX; - } - - // return the generated matrix - return ret; - +static hpx::serialization::serialize_buffer generate_input_matrix( + size_t size) { + // initialize random generator if necessary + if (!random_initialized) { + std::srand((unsigned)std::time(nullptr)); + random_initialized = true; + } + + // allocate output matrix + typedef hpx::serialization::serialize_buffer buffer_type; + buffer_type ret(size); + + // fill output matrix + for (size_t i = 0; i < size; i++) { + ret[i] = 0.5f * ((float)rand()) / (float)RAND_MAX; + } + + // return the generated matrix + return ret; } // calculates the result for verification -static -hpx::serialization::serialize_buffer -calculate_result( hpx::serialization::serialize_buffer a, - hpx::serialization::serialize_buffer b, - hpx::serialization::serialize_buffer c, - double* time) -{ - - - // check for identical vector size - if(a.size() != b.size() || b.size() != c.size()) - exit(1); - - // get vector size - size_t size = a.size(); - - // allocate output matrix - typedef hpx::serialization::serialize_buffer buffer_type; - buffer_type res( size ); - for(size_t i = 0; i < size; i++) - { - res[i] = 0.0f; - } - - // start time measurement - timer_start(); - - // calculate output matrix - for(size_t i = 0; i < size; i++) - { - //res[i] = (a[i] + b[i]) * (2 * c[i]); - res[i] = log((a[i] + exp(b[i])) * (2.0f * c[i])); - } - - // stop time measurement - *time = timer_stop(); - - // return the calculated matrix - return res; - +static hpx::serialization::serialize_buffer calculate_result( + hpx::serialization::serialize_buffer a, + hpx::serialization::serialize_buffer b, + hpx::serialization::serialize_buffer c, double* time) { + // check for identical vector size + if (a.size() != b.size() || b.size() != c.size()) exit(1); + + // get vector size + size_t size = a.size(); + + // allocate output matrix + typedef hpx::serialization::serialize_buffer buffer_type; + buffer_type res(size); + for (size_t i = 0; i < size; i++) { + res[i] = 0.0f; + } + + // start time measurement + timer_start(); + + // calculate output matrix + for (size_t i = 0; i < size; i++) { + // res[i] = (a[i] + b[i]) * (2 * c[i]); + res[i] = log((a[i] + exp(b[i])) * (2.0f * c[i])); + } + + // stop time measurement + *time = timer_stop(); + + // return the calculated matrix + return res; } // verifies the result -static bool check_for_correct_result( float* res, size_t res_size, - float* comp, size_t comp_size) -{ - - hpx::cout << "Verifying result ... " << hpx::endl; - - // Checks for matching array sizes - if(res_size != comp_size) - { - hpx::cout << "Result is incorrect. Sizes don't even match." << hpx::endl; - return false; +static bool check_for_correct_result(float* res, size_t res_size, float* comp, + size_t comp_size) { + hpx::cout << "Verifying result ... " << hpx::endl; + + // Checks for matching array sizes + if (res_size != comp_size) { + hpx::cout << "Result is incorrect. Sizes don't even match." << hpx::endl; + return false; + } + + // Compares every value + for (size_t i = 0; i < res_size; i++) { + if ((res[i] - comp[i]) > 0.00001f) { + hpx::cout << "Result is incorrect. (at pos " << i << ")" << hpx::endl; + return false; } + } - // Compares every value - for(size_t i = 0; i < res_size; i++) - { - if((res[i] - comp[i]) > 0.00001f) - { - hpx::cout << "Result is incorrect. (at pos " << i << ")" << hpx::endl; - return false; - } - } - - // Returns true, arrays are identical. - hpx::cout << "Result is correct." << hpx::endl; - return true; - + // Returns true, arrays are identical. + hpx::cout << "Result is correct." << hpx::endl; + return true; } - - #endif diff --git a/examples/opencl/benchmark_vector/timer.hpp b/examples/opencl/benchmark_vector/timer.hpp index 8e63bbab..1bc928b8 100644 --- a/examples/opencl/benchmark_vector/timer.hpp +++ b/examples/opencl/benchmark_vector/timer.hpp @@ -10,32 +10,20 @@ static boost::posix_time::ptime start_time; -static void timer_start() -{ - - // Measure start time - start_time = boost::posix_time::microsec_clock::local_time(); - +static void timer_start() { + // Measure start time + start_time = boost::posix_time::microsec_clock::local_time(); } -static double timer_stop() -{ - - // Measure stop time - boost::posix_time::ptime stop_time = - boost::posix_time::microsec_clock::local_time(); - - // Calculate difference - boost::posix_time::time_duration diff = stop_time - start_time; +static double timer_stop() { + // Measure stop time + boost::posix_time::ptime stop_time = + boost::posix_time::microsec_clock::local_time(); - return diff.total_microseconds() / 1000.0; + // Calculate difference + boost::posix_time::time_duration diff = stop_time - start_time; + return diff.total_microseconds() / 1000.0; } - - - - -#endif // BENCHMARK_TIMER_H_ - - +#endif // BENCHMARK_TIMER_H_ diff --git a/examples/opencl/hello_world.cpp b/examples/opencl/hello_world.cpp index 61d9802c..e88d215d 100644 --- a/examples/opencl/hello_world.cpp +++ b/examples/opencl/hello_world.cpp @@ -11,72 +11,75 @@ using namespace hpx::opencl; -static const char hello_world_src_str[] = -" \n" -" __kernel void hello_world(__global char * out) \n" -" { \n" -" char in [] = \"Hello World!\"; \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = in[tid]; \n" -" } \n" -" \n"; +static const char hello_world_src_str[] = + " " + "\n" + " __kernel void hello_world(__global char * out) " + "\n" + " { " + "\n" + " char in [] = \"Hello World!\"; " + "\n" + " size_t tid = get_global_id(0); " + "\n" + " out[tid] = in[tid]; " + "\n" + " } " + "\n" + " " + "\n"; typedef hpx::serialization::serialize_buffer buffer_type; -static buffer_type hello_world_src( hello_world_src_str, - sizeof(hello_world_src_str), - buffer_type::init_mode::reference ); - +static buffer_type hello_world_src(hello_world_src_str, + sizeof(hello_world_src_str), + buffer_type::init_mode::reference); // hpx_main, is the actual main called by hpx -int main(int argc, char* argv[]) -{ +int main(int argc, char* argv[]) { + // Get list of available OpenCL Devices. + std::vector devices = + create_all_devices(CL_DEVICE_TYPE_ALL, "OpenCL 1.1").get(); - // Get list of available OpenCL Devices. - std::vector devices = create_all_devices(CL_DEVICE_TYPE_ALL, - "OpenCL 1.1" ).get(); + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No OpenCL devices found!" << hpx::endl; + return hpx::finalize(); + } - // Check whether there are any devices - if(devices.size() < 1) - { - hpx::cerr << "No OpenCL devices found!" << hpx::endl; - return hpx::finalize(); - } + // Create a device component from the first device found + device cldevice = devices[0]; - // Create a device component from the first device found - device cldevice = devices[0]; + // Create a buffer + buffer outbuffer = cldevice.create_buffer(CL_MEM_WRITE_ONLY, 13); - // Create a buffer - buffer outbuffer = cldevice.create_buffer(CL_MEM_WRITE_ONLY, 13); + // Create the hello_world device program + program prog = cldevice.create_program_with_source(hello_world_src); - // Create the hello_world device program - program prog = cldevice.create_program_with_source(hello_world_src); + // Compile the program + prog.build(); - // Compile the program - prog.build(); + // Create hello_world kernel + kernel hello_world_kernel = prog.create_kernel("hello_world"); - // Create hello_world kernel - kernel hello_world_kernel = prog.create_kernel("hello_world"); + // Set our buffer as argument + hello_world_kernel.set_arg(0, outbuffer); - // Set our buffer as argument - hello_world_kernel.set_arg(0, outbuffer); + // Run the kernel + hpx::opencl::work_size<1> dim; + dim[0].offset = 0; + dim[0].size = 13; + hpx::future kernel_future = hello_world_kernel.enqueue(dim); - // Run the kernel - hpx::opencl::work_size<1> dim; - dim[0].offset = 0; - dim[0].size = 13; - hpx::future kernel_future = hello_world_kernel.enqueue(dim); + // Start reading the buffer ( With kernel_future as dependency. + // All hpxcl enqueue calls are nonblocking. ) + auto read_future = outbuffer.enqueue_read(0, 13, kernel_future); - // Start reading the buffer ( With kernel_future as dependency. - // All hpxcl enqueue calls are nonblocking. ) - auto read_future = outbuffer.enqueue_read(0, 13, kernel_future); + // Wait for the data to arrive + auto data = read_future.get(); - // Wait for the data to arrive - auto data = read_future.get(); + // Write the data to hpx::cout + hpx::cout << data.data() << hpx::endl; - // Write the data to hpx::cout - hpx::cout << data.data() << hpx::endl; - - return 0; + return 0; } - diff --git a/examples/opencl/list_devices.cpp b/examples/opencl/list_devices.cpp index 1da7f262..d1c9d4cc 100644 --- a/examples/opencl/list_devices.cpp +++ b/examples/opencl/list_devices.cpp @@ -14,149 +14,119 @@ using namespace hpx::opencl; static void printinfo(size_t i, size_t j, std::string info_type, - std::string info) -{ - - hpx::cout << " " << i << "." << j << ". " - << info_type << ": " << info << hpx::endl; - -} - -static std::string -device_uint_to_string(cl_uint res) -{ - std::stringstream ss; - ss << res; - return ss.str(); + std::string info) { + hpx::cout << " " << i << "." << j << ". " << info_type << ": " << info + << hpx::endl; } -static std::string -device_type_to_string(cl_device_type type) -{ - std::vector typelist; +static std::string device_uint_to_string(cl_uint res) { + std::stringstream ss; + ss << res; + return ss.str(); +} +static std::string device_type_to_string(cl_device_type type) { + std::vector typelist; - if(type & CL_DEVICE_TYPE_CPU) - typelist.push_back("cpu"); + if (type & CL_DEVICE_TYPE_CPU) typelist.push_back("cpu"); - if(type & CL_DEVICE_TYPE_GPU) - typelist.push_back("gpu"); + if (type & CL_DEVICE_TYPE_GPU) typelist.push_back("gpu"); - if(type & CL_DEVICE_TYPE_ACCELERATOR) - typelist.push_back("accelerator"); + if (type & CL_DEVICE_TYPE_ACCELERATOR) typelist.push_back("accelerator"); - if(type & CL_DEVICE_TYPE_DEFAULT) - typelist.push_back("default"); + if (type & CL_DEVICE_TYPE_DEFAULT) typelist.push_back("default"); #ifdef CL_VERSION_1_2 - if(type & CL_DEVICE_TYPE_CUSTOM) - typelist.push_back("custom"); + if (type & CL_DEVICE_TYPE_CUSTOM) typelist.push_back("custom"); #endif - std::string result = ""; + std::string result = ""; - for(size_t i = 0 ; i < typelist.size(); i++) - { + for (size_t i = 0; i < typelist.size(); i++) { + if (i > 0) result += ", "; - if(i > 0) - result += ", "; + result += typelist[i]; + } - result += typelist[i]; + return result; +} +// hpx_main, is the actual main called by hpx +int hpx_main(int argc, char* argv[]) { + { + // Get list of available OpenCL Devices. + std::vector devices = + create_all_devices(CL_DEVICE_TYPE_ALL, "OpenCL 1.0").get(); + + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No OpenCL devices found!" << hpx::endl; + return hpx::finalize(); } - return result; + hpx::cout << hpx::endl << "Devices:" << hpx::endl << hpx::endl; -} + // print a lot of information about every device + size_t i = 1; + for (auto& cldevice : devices) { + size_t j = 1; -// hpx_main, is the actual main called by hpx -int hpx_main(int argc, char* argv[]) -{ - { - - // Get list of available OpenCL Devices. - std::vector devices = create_all_devices(CL_DEVICE_TYPE_ALL, - "OpenCL 1.0" ).get(); - - // Check whether there are any devices - if(devices.size() < 1) - { - hpx::cerr << "No OpenCL devices found!" << hpx::endl; - return hpx::finalize(); - } - - hpx::cout << hpx::endl << "Devices:" << hpx::endl << hpx::endl; - - // print a lot of information about every device - size_t i = 1; - for(auto &cldevice : devices) - { - - size_t j = 1; - - // generate string - std::string str; - - // print name - str = cldevice.get_device_info().get(); - hpx::cout << i << ". " << str << hpx::endl; - - // print platform name - str = cldevice.get_platform_info().get(); - printinfo(i, j++, "Platform", str); - - // print supported opencl version - str = cldevice.get_device_info().get(); - printinfo(i, j++, "OpenCL Version", str); - - // print device type - str = device_type_to_string( - cldevice.get_device_info().get()); - printinfo(i, j++, "Type", str); - - // print driver version - str = cldevice.get_device_info().get(); - printinfo(i, j++, "Driver Version", str); - - // print vendor - str = device_uint_to_string( - cldevice.get_device_info().get()); - str += " - "; - str += cldevice.get_device_info().get(); - printinfo(i, j++, "Vendor", str); - - // print profile - str = cldevice.get_device_info().get(); - printinfo(i, j++, "Profile", str); - - // print compiler c version - str = cldevice.get_device_info().get(); - printinfo(i, j++, "Compiler Version", str); - - /*** TO BE CONTINUED ***/ - - - // add newline before starting a new device - hpx::cout << hpx::endl; - - i++; - } - - + // generate string + std::string str; - } - - // End the program - return hpx::finalize(); -} + // print name + str = cldevice.get_device_info().get(); + hpx::cout << i << ". " << str << hpx::endl; -// Main, initializes HPX -int main(int argc, char* argv[]){ + // print platform name + str = cldevice.get_platform_info().get(); + printinfo(i, j++, "Platform", str); - // initialize HPX, run hpx_main - hpx::start(argc, argv); + // print supported opencl version + str = cldevice.get_device_info().get(); + printinfo(i, j++, "OpenCL Version", str); - // wait for hpx::finalize being called - return hpx::stop(); + // print device type + str = device_type_to_string( + cldevice.get_device_info().get()); + printinfo(i, j++, "Type", str); + + // print driver version + str = cldevice.get_device_info().get(); + printinfo(i, j++, "Driver Version", str); + + // print vendor + str = device_uint_to_string( + cldevice.get_device_info().get()); + str += " - "; + str += cldevice.get_device_info().get(); + printinfo(i, j++, "Vendor", str); + + // print profile + str = cldevice.get_device_info().get(); + printinfo(i, j++, "Profile", str); + + // print compiler c version + str = cldevice.get_device_info().get(); + printinfo(i, j++, "Compiler Version", str); + + /*** TO BE CONTINUED ***/ + + // add newline before starting a new device + hpx::cout << hpx::endl; + + i++; + } + } + + // End the program + return hpx::finalize(); } +// Main, initializes HPX +int main(int argc, char* argv[]) { + // initialize HPX, run hpx_main + hpx::start(argc, argv); + // wait for hpx::finalize being called + return hpx::stop(); +} diff --git a/examples/opencl/mandelbrot/fifo.hpp b/examples/opencl/mandelbrot/fifo.hpp index 330804b9..10022667 100644 --- a/examples/opencl/mandelbrot/fifo.hpp +++ b/examples/opencl/mandelbrot/fifo.hpp @@ -13,112 +13,91 @@ #include template -class fifo -{ - - typedef hpx::lcos::local::spinlock lock_type; - typedef hpx::lcos::local::condition_variable_any cond_type; - -public: - // push an item to the queue - void push(const T &); - // take an item from the queue, will return false on end-of-program. - // blocks. - bool pop(T*); - // signal end of program - void finish(); - -public: - fifo(); - ~fifo(); - -private: - std::queue queue; - lock_type lock; - cond_type cond_var; - - volatile bool finished; - - +class fifo { + typedef hpx::lcos::local::spinlock lock_type; + typedef hpx::lcos::local::condition_variable_any cond_type; + + public: + // push an item to the queue + void push(const T &); + // take an item from the queue, will return false on end-of-program. + // blocks. + bool pop(T *); + // signal end of program + void finish(); + + public: + fifo(); + ~fifo(); + + private: + std::queue queue; + lock_type lock; + cond_type cond_var; + + volatile bool finished; }; -template -fifo::fifo() -{ - finished = false; +template +fifo::fifo() { + finished = false; } -template -fifo::~fifo() -{ - finish(); +template +fifo::~fifo() { + finish(); } -template -void fifo::push(const T &item) -{ - - // lock class - std::unique_lock l(lock); - - // check whether fifo is already in finished state - if(finished) - { - HPX_THROW_EXCEPTION(hpx::invalid_status, "fifo::push()", - "fifo::finish() already called!"); - } +template +void fifo::push(const T &item) { + // lock class + std::unique_lock l(lock); - // push item - queue.push(item); + // check whether fifo is already in finished state + if (finished) { + HPX_THROW_EXCEPTION(hpx::invalid_status, "fifo::push()", + "fifo::finish() already called!"); + } - // signal waiting threads that new item is available - cond_var.notify_one(); + // push item + queue.push(item); + // signal waiting threads that new item is available + cond_var.notify_one(); } -template -bool fifo::pop(T* item) -{ - - // lock class - std::unique_lock l(lock); - - // wait for queue to not be empty - while(queue.empty()) - { - - // check whether fifo is already in finished state - if(finished) - return false; - - // wait for something to change - cond_var.wait(l); +template +bool fifo::pop(T *item) { + // lock class + std::unique_lock l(lock); - } + // wait for queue to not be empty + while (queue.empty()) { + // check whether fifo is already in finished state + if (finished) return false; - // Retrieve element from queue - *item = queue.front(); + // wait for something to change + cond_var.wait(l); + } - // Remove element from queue - queue.pop(); + // Retrieve element from queue + *item = queue.front(); - // Return success - return true; + // Remove element from queue + queue.pop(); + // Return success + return true; } -template -void fifo::finish() -{ - - // lock class - std::unique_lock l(lock); - - finished = true; +template +void fifo::finish() { + // lock class + std::unique_lock l(lock); - cond_var.notify_all(); + finished = true; + cond_var.notify_all(); } #endif - diff --git a/examples/opencl/mandelbrot/image_generator.cpp b/examples/opencl/mandelbrot/image_generator.cpp index 2c50a047..e37bc8e7 100644 --- a/examples/opencl/mandelbrot/image_generator.cpp +++ b/examples/opencl/mandelbrot/image_generator.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "image_generator.hpp" #include @@ -14,396 +13,307 @@ #include #include -image_generator:: -image_generator(size_t img_size_hint_x_, - size_t img_size_hint_y_, - size_t num_parallel_kernels, - bool verbose_, - std::vector devices) - : next_image_id(0), verbose(verbose_), - img_size_hint_x(img_size_hint_x_), - img_size_hint_y(img_size_hint_y_) -{ - - // one retrieve worker for every os thread - size_t num_retrieve_workers = hpx::get_os_thread_count(); - - // create workqueue - workqueue = std::make_shared - > >(); - - // initialize worker list - workers = std::make_shared - > >(); - - // starting workers - for( auto& device : devices) - { - - // add a worker - add_worker(device, num_parallel_kernels); - - } - - // starting retrievers - std::vector> retriever_futures; - for(size_t i = 0; i < num_retrieve_workers; i++) - { - - hpx::lcos::future retriever_future = - hpx::async(retrieve_worker_main, - (intptr_t) this, - verbose); - - retriever_futures.push_back(std::move(retriever_future)); - - } - - // combining all retrievers into one future - retrievers_finished = hpx::when_all(retriever_futures).share(); - +image_generator::image_generator(size_t img_size_hint_x_, + size_t img_size_hint_y_, + size_t num_parallel_kernels, bool verbose_, + std::vector devices) + : next_image_id(0), + verbose(verbose_), + img_size_hint_x(img_size_hint_x_), + img_size_hint_y(img_size_hint_y_) { + // one retrieve worker for every os thread + size_t num_retrieve_workers = hpx::get_os_thread_count(); + + // create workqueue + workqueue = std::make_shared>>(); + + // initialize worker list + workers = std::make_shared>>(); + + // starting workers + for (auto& device : devices) { + // add a worker + add_worker(device, num_parallel_kernels); + } + + // starting retrievers + std::vector> retriever_futures; + for (size_t i = 0; i < num_retrieve_workers; i++) { + hpx::lcos::future retriever_future = + hpx::async(retrieve_worker_main, (intptr_t)this, verbose); + + retriever_futures.push_back(std::move(retriever_future)); + } + + // combining all retrievers into one future + retrievers_finished = hpx::when_all(retriever_futures).share(); } - -image_generator:: -~image_generator() -{ - - // wait for work to get finished - shutdown(); - +image_generator::~image_generator() { + // wait for work to get finished + shutdown(); } -void -image_generator:: -add_worker(hpx::opencl::device & device, size_t num_parallel_kernels) -{ - - // create request callback function for worker - boost::function*)> request_new_work = - boost::bind(&work_queue>::request, - &(*workqueue), - _1); - - // create deliver callback function for worker - boost::function&)> deliver_done_work = - boost::bind(&work_queue>::deliver, - &(*workqueue), - _1); - - - // create worker - std::shared_ptr worker = - std::make_shared - (device, - num_parallel_kernels, - request_new_work, - deliver_done_work, - verbose, - img_size_hint_x, - img_size_hint_y); - - // add worker to workerlist - workers->push_back(worker); - +void image_generator::add_worker(hpx::opencl::device& device, + size_t num_parallel_kernels) { + // create request callback function for worker + boost::function*)> request_new_work = + boost::bind(&work_queue>::request, + &(*workqueue), _1); + + // create deliver callback function for worker + boost::function&)> deliver_done_work = + boost::bind(&work_queue>::deliver, + &(*workqueue), _1); + + // create worker + std::shared_ptr worker = std::make_shared( + device, num_parallel_kernels, request_new_work, deliver_done_work, + verbose, img_size_hint_x, img_size_hint_y); + + // add worker to workerlist + workers->push_back(worker); } -void -image_generator:: -wait_for_startup_finished() -{ - - // wait for all workers to finish startup - for( auto& worker : *workers) - { - - worker->wait_for_startup_finished(); - - } - - +void image_generator::wait_for_startup_finished() { + // wait for all workers to finish startup + for (auto& worker : *workers) { + worker->wait_for_startup_finished(); + } } -void -image_generator:: -shutdown() -{ - - // end workqueue - workqueue->finish(); - - // wait for retrievers to finish - retrievers_finished.wait(); +void image_generator::shutdown() { + // end workqueue + workqueue->finish(); + // wait for retrievers to finish + retrievers_finished.wait(); } -void -image_generator:: -retrieve_worker_main(intptr_t parent_, bool verbose) -{ +void image_generator::retrieve_worker_main(intptr_t parent_, bool verbose) { + // get parent pointer + image_generator* parent = (image_generator*)parent_; - // get parent pointer - image_generator* parent = (image_generator*) parent_; - - // represents done workload - std::shared_ptr done_workload; - - // main loop - if(verbose) hpx::cout << "entering retrieve worker main loop ..." << hpx::endl; - while(parent->workqueue->retrieve_finished_work(&done_workload)) - { + // represents done workload + std::shared_ptr done_workload; - // retrieve id of associated image - size_t img_id = done_workload->img_id; + // main loop + if (verbose) + hpx::cout << "entering retrieve worker main loop ..." << hpx::endl; + while (parent->workqueue->retrieve_finished_work(&done_workload)) { + // retrieve id of associated image + size_t img_id = done_workload->img_id; - // image data - std::shared_ptr> img_data; - - // image countdown - std::shared_ptr> img_countdown; - - // image event lock - std::shared_ptr img_ready; - - // retrieve image pointers - { - // lock - std::unique_lock - lock(parent->images_lock); - - // retrieve image data - image_data_map::iterator data_iterator = parent->images.find(img_id); - // leave as null pointer if no data exists. - // this indicates benchmark mode. - if(data_iterator != parent->images.end()) - img_data = data_iterator->second; + // image data + std::shared_ptr> img_data; - // retrieve image countdown - BOOST_ASSERT(parent->images_countdown.find(img_id) - != parent->images_countdown.end()); - img_countdown = parent->images_countdown[img_id]; + // image countdown + std::shared_ptr> img_countdown; - // retrieve image event lock - BOOST_ASSERT(parent->images_ready.find(img_id) - != parent->images_ready.end()); - img_ready = parent->images_ready[img_id]; - } + // image event lock + std::shared_ptr img_ready; - // copy data to img_data - if(img_data) - { - size_t start_x = done_workload->pos_in_img_x; - size_t start_y = done_workload->pos_in_img_y; - size_t size_x = done_workload->num_pixels_x; - size_t size_y = done_workload->num_pixels_y; - size_t line_offset = done_workload->line_offset; - for(size_t y = 0; y < size_y; y++) - { - for(size_t x = 0; x < size_x; x++) - { - (*img_data)[((y + start_y) * line_offset + (x + start_x)) * 3 + 0] = - done_workload->pixeldata[(y * size_x + x) * 3 + 0]; - (*img_data)[((y + start_y) * line_offset + (x + start_x)) * 3 + 1] = - done_workload->pixeldata[(y * size_x + x) * 3 + 1]; - (*img_data)[((y + start_y) * line_offset + (x + start_x)) * 3 + 2] = - done_workload->pixeldata[(y * size_x + x) * 3 + 2]; - } - } - } + // retrieve image pointers + { + // lock + std::unique_lock lock(parent->images_lock); + + // retrieve image data + image_data_map::iterator data_iterator = parent->images.find(img_id); + // leave as null pointer if no data exists. + // this indicates benchmark mode. + if (data_iterator != parent->images.end()) + img_data = data_iterator->second; + + // retrieve image countdown + BOOST_ASSERT(parent->images_countdown.find(img_id) != + parent->images_countdown.end()); + img_countdown = parent->images_countdown[img_id]; + + // retrieve image event lock + BOOST_ASSERT(parent->images_ready.find(img_id) != + parent->images_ready.end()); + img_ready = parent->images_ready[img_id]; + } - // decrease the number of work packets left - size_t current_img_countdown = --(*img_countdown); - if(verbose) hpx::cout << "retrieved workload " << current_img_countdown << ": " - << done_workload->pos_in_img_x - << ":" - << done_workload->pos_in_img_y - << hpx::endl; - - - // if no work packet left (img finished), then: - if(current_img_countdown == 0) - { - // set the image ready event lock - img_ready->set(); - - // lock the data lists - std::unique_lock - lock(parent->images_lock); - - // remove image data. - // data will still be available for waiting image thread, - // as it is a shared_ptr. - image_data_map::iterator data_it = parent->images.find(img_id); - if(data_it != parent->images.end()) - parent->images.erase(data_it); - - // remove countdown variable - image_countdown_map::iterator countdown_it = - parent->images_countdown.find(img_id); - parent->images_countdown.erase(countdown_it); - - // remove event lock - image_ready_map::iterator ready_it = - parent->images_ready.find(img_id); - parent->images_ready.erase(ready_it); + // copy data to img_data + if (img_data) { + size_t start_x = done_workload->pos_in_img_x; + size_t start_y = done_workload->pos_in_img_y; + size_t size_x = done_workload->num_pixels_x; + size_t size_y = done_workload->num_pixels_y; + size_t line_offset = done_workload->line_offset; + for (size_t y = 0; y < size_y; y++) { + for (size_t x = 0; x < size_x; x++) { + (*img_data)[((y + start_y) * line_offset + (x + start_x)) * 3 + 0] = + done_workload->pixeldata[(y * size_x + x) * 3 + 0]; + (*img_data)[((y + start_y) * line_offset + (x + start_x)) * 3 + 1] = + done_workload->pixeldata[(y * size_x + x) * 3 + 1]; + (*img_data)[((y + start_y) * line_offset + (x + start_x)) * 3 + 2] = + done_workload->pixeldata[(y * size_x + x) * 3 + 2]; } - + } } + // decrease the number of work packets left + size_t current_img_countdown = --(*img_countdown); + if (verbose) + hpx::cout << "retrieved workload " << current_img_countdown << ": " + << done_workload->pos_in_img_x << ":" + << done_workload->pos_in_img_y << hpx::endl; + + // if no work packet left (img finished), then: + if (current_img_countdown == 0) { + // set the image ready event lock + img_ready->set(); + + // lock the data lists + std::unique_lock lock(parent->images_lock); + + // remove image data. + // data will still be available for waiting image thread, + // as it is a shared_ptr. + image_data_map::iterator data_it = parent->images.find(img_id); + if (data_it != parent->images.end()) parent->images.erase(data_it); + + // remove countdown variable + image_countdown_map::iterator countdown_it = + parent->images_countdown.find(img_id); + parent->images_countdown.erase(countdown_it); + + // remove event lock + image_ready_map::iterator ready_it = parent->images_ready.find(img_id); + parent->images_ready.erase(ready_it); + } + } } // waits until event lock triggered, then returns data -std::shared_ptr> -wait_for_image_finished(std::shared_ptr img_ready, - std::shared_ptr> img_data) -{ - - // wait for the event lock to trigger - img_ready->wait(); - - // return the image data - return img_data; - +std::shared_ptr> wait_for_image_finished( + std::shared_ptr img_ready, + std::shared_ptr> img_data) { + // wait for the event lock to trigger + img_ready->wait(); + + // return the image data + return img_data; } hpx::lcos::future>> -image_generator:: -compute_image(double posx, - double posy, - double zoom, - double rotation, - size_t img_width, - size_t img_height) -{ - - return compute_image(posx, posy, zoom, rotation, - img_width, img_height, - false, img_width, 1); - +image_generator::compute_image(double posx, double posy, double zoom, + double rotation, size_t img_width, + size_t img_height) { + return compute_image(posx, posy, zoom, rotation, img_width, img_height, false, + img_width, 1); } hpx::lcos::future>> -image_generator:: -compute_image(double posx, - double posy, - double zoom, - double rotation, - size_t img_width, - size_t img_height, - bool benchmark, - size_t tile_width, - size_t tile_height) -{ - - // calculate image id - size_t img_id = next_image_id++; - - // calculate aspect ratio - double aspect_ratio = (double) img_width / (double) img_height; - - // calculate size of diagonale - //double size_diag = exp2(-zoom) * 4.0; - double size_diag = 4.0 / zoom; - - // calculate width and height - double size_y = size_diag / sqrt( 1 + aspect_ratio * aspect_ratio ); - double size_x = aspect_ratio * size_y; - - // calculate horizontal stepwidth - double hor_pixdist_nonrot = size_x / (img_width - 1); - double hor_pixdist_x = cos(rotation) * hor_pixdist_nonrot; - double hor_pixdist_y = sin(rotation) * hor_pixdist_nonrot; - - // calculate vertical stepwidth - double vert_pixdist_nonrot = - size_y / (img_height - 1); - double vert_pixdist_x = - sin(rotation) * vert_pixdist_nonrot; - double vert_pixdist_y = cos(rotation) * vert_pixdist_nonrot; - - - // calculate top left coords - double topleft_x = posx - hor_pixdist_x * ( img_width / 2.0 + 0.5 ) - - vert_pixdist_x * ( img_height / 2.0 + 0.5 ); - double topleft_y = posy - hor_pixdist_y * ( img_width / 2.0 + 0.5 ) - - vert_pixdist_y * ( img_height / 2.0 + 0.5 ); - - // calculate number of tiles - BOOST_ASSERT(img_width % tile_width == 0 && img_height % tile_height == 0); - size_t num_tiles_x = img_width / tile_width; - size_t num_tiles_y = img_height / tile_height; - - if(verbose){ - hpx::cout << "image data" << hpx::endl +image_generator::compute_image(double posx, double posy, double zoom, + double rotation, size_t img_width, + size_t img_height, bool benchmark, + size_t tile_width, size_t tile_height) { + // calculate image id + size_t img_id = next_image_id++; + + // calculate aspect ratio + double aspect_ratio = (double)img_width / (double)img_height; + + // calculate size of diagonale + // double size_diag = exp2(-zoom) * 4.0; + double size_diag = 4.0 / zoom; + + // calculate width and height + double size_y = size_diag / sqrt(1 + aspect_ratio * aspect_ratio); + double size_x = aspect_ratio * size_y; + + // calculate horizontal stepwidth + double hor_pixdist_nonrot = size_x / (img_width - 1); + double hor_pixdist_x = cos(rotation) * hor_pixdist_nonrot; + double hor_pixdist_y = sin(rotation) * hor_pixdist_nonrot; + + // calculate vertical stepwidth + double vert_pixdist_nonrot = -size_y / (img_height - 1); + double vert_pixdist_x = -sin(rotation) * vert_pixdist_nonrot; + double vert_pixdist_y = cos(rotation) * vert_pixdist_nonrot; + + // calculate top left coords + double topleft_x = posx - hor_pixdist_x * (img_width / 2.0 + 0.5) - + vert_pixdist_x * (img_height / 2.0 + 0.5); + double topleft_y = posy - hor_pixdist_y * (img_width / 2.0 + 0.5) - + vert_pixdist_y * (img_height / 2.0 + 0.5); + + // calculate number of tiles + BOOST_ASSERT(img_width % tile_width == 0 && img_height % tile_height == 0); + size_t num_tiles_x = img_width / tile_width; + size_t num_tiles_y = img_height / tile_height; + + if (verbose) { + hpx::cout << "image data" << hpx::endl << "topleft: " << topleft_x << ":" << topleft_y << hpx::endl - << "img_dims: " << img_width << ":" << img_height << hpx::endl + << "img_dims: " << img_width << ":" << img_height + << hpx::endl << "pos: " << posx << ":" << posy << hpx::endl << "size: " << size_x << ":" << size_y << hpx::endl - << "hor_pixdist: " << hor_pixdist_x << ":" << hor_pixdist_y << hpx::endl - << "vert_pixdist: " << vert_pixdist_x << ":" << vert_pixdist_y << hpx::endl - << "num_tiles: " << num_tiles_x << ":" << num_tiles_y << hpx::endl; - } - - // create data array to hold finished image, if we are not in benchmark mode - std::shared_ptr> img_data; - if(!benchmark) - img_data = std::make_shared > - (img_width * img_height * 3 * sizeof(char)); - - // create a new countdown variable - std::shared_ptr> img_countdown = - std::make_shared>(num_tiles_x * num_tiles_y); - - // create a new ready event lock - std::shared_ptr img_ready = - std::make_shared(); - - // add the created variables to their lists - { - std::unique_lock - lock(images_lock); - - // do not add data in benchmark mode - if(!benchmark) - images.insert(std::pair>> - (img_id, img_data)); - images_countdown.insert(std::pair>> - (img_id, img_countdown)); - images_ready.insert(std::pair> - (img_id, img_ready)); + << "hor_pixdist: " << hor_pixdist_x << ":" << hor_pixdist_y + << hpx::endl + << "vert_pixdist: " << vert_pixdist_x << ":" << vert_pixdist_y + << hpx::endl + << "num_tiles: " << num_tiles_x << ":" << num_tiles_y + << hpx::endl; + } + + // create data array to hold finished image, if we are not in benchmark mode + std::shared_ptr> img_data; + if (!benchmark) + img_data = std::make_shared>(img_width * img_height * 3 * + sizeof(char)); + + // create a new countdown variable + std::shared_ptr> img_countdown = + std::make_shared>(num_tiles_x * num_tiles_y); + + // create a new ready event lock + std::shared_ptr img_ready = + std::make_shared(); + + // add the created variables to their lists + { + std::unique_lock lock(images_lock); + + // do not add data in benchmark mode + if (!benchmark) + images.insert(std::pair>>( + img_id, img_data)); + images_countdown.insert( + std::pair>>(img_id, + img_countdown)); + images_ready.insert( + std::pair>(img_id, + img_ready)); + } + + // add the workloads to queue + if (verbose) hpx::cout << "Adding workloads to queue ..." << hpx::endl; + for (size_t y = 0; y < img_height; y += tile_height) { + for (size_t x = 0; x < img_width; x += tile_width) { + if (verbose) + hpx::cout << "\tAdding workload " << x << ":" << y << " ..." + << hpx::endl; + + // calculate position of current work packet + double workpacket_pos_x = + topleft_x + vert_pixdist_x * y + hor_pixdist_x * x; + double workpacket_pos_y = + topleft_y + vert_pixdist_y * y + hor_pixdist_y * x; + // add workload + std::shared_ptr row = std::make_shared( + tile_width, tile_height, workpacket_pos_x, workpacket_pos_y, + hor_pixdist_x, hor_pixdist_y, vert_pixdist_x, vert_pixdist_y, img_id, + x, y, img_width); + workqueue->add_work(row); } + } - - // add the workloads to queue - if (verbose) hpx::cout << "Adding workloads to queue ..." << hpx::endl; - for(size_t y = 0; y < img_height; y += tile_height) - { - for(size_t x = 0; x < img_width; x += tile_width) - { - if (verbose) hpx::cout << "\tAdding workload " << x << ":" << y << " ..." << hpx::endl; - - // calculate position of current work packet - double workpacket_pos_x = topleft_x + vert_pixdist_x * y + hor_pixdist_x * x; - double workpacket_pos_y = topleft_y + vert_pixdist_y * y + hor_pixdist_y * x; - // add workload - std::shared_ptr row = - std::make_shared(tile_width, - tile_height, - workpacket_pos_x, - workpacket_pos_y, - hor_pixdist_x, - hor_pixdist_y, - vert_pixdist_x, - vert_pixdist_y, - img_id, - x, - y, - img_width); - workqueue->add_work(row); - } - } - - // return the future to the finished image - return hpx::async(wait_for_image_finished, img_ready, img_data); - + // return the future to the finished image + return hpx::async(wait_for_image_finished, img_ready, img_data); } diff --git a/examples/opencl/mandelbrot/image_generator.hpp b/examples/opencl/mandelbrot/image_generator.hpp index 41a69ba6..22bf59cd 100644 --- a/examples/opencl/mandelbrot/image_generator.hpp +++ b/examples/opencl/mandelbrot/image_generator.hpp @@ -24,85 +24,66 @@ * this class is the main observer of the image generation. * it gets image queries, which it then splits into subimages and sends it to * the calculation queue. - * it then collects the calculated data and puts it together, to have one finished image. + * it then collects the calculated data and puts it together, to have one + * finished image. */ -class image_generator -{ - - public: - // initializes the image generator - image_generator(size_t img_size_hint_x, - size_t img_size_hint_y, - size_t num_parallel_kernels, - bool verbose, - std::vector devices - = std::vector()); - - // destructor - ~image_generator(); - - // waits for the worker to finish - void shutdown(); - - // adds a worker - void add_worker(hpx::opencl::device & device, - size_t num_parallel_kernels); - - // waits for the worker to finish initialization - void wait_for_startup_finished(); - - // computes an image - hpx::lcos::future>> - compute_image(double pos_x, - double pos_y, - double zoom, - double rotation, - size_t img_width, - size_t img_height); - - // computes an image, enhanced version - hpx::lcos::future>> - compute_image(double pos_x, - double pos_y, - double zoom, - double rotation, - size_t img_width, - size_t img_height, - bool benchmark, // purges output - size_t tile_width, - size_t tile_height); - - private: - // the main worker function, runs the main work loop - static void retrieve_worker_main( - intptr_t parent_, - bool verbose); - - - // private attributes - private: - hpx::lcos::shared_future retrievers_finished; - std::shared_ptr>> workqueue; - std::shared_ptr>> workers; - hpx::lcos::local::spinlock images_lock; - - typedef std::map>> - image_data_map; - typedef std::map>> - image_countdown_map; - typedef std::map> - image_ready_map; - image_data_map images; - image_countdown_map images_countdown; - image_ready_map images_ready; - - std::atomic next_image_id; - bool verbose; - - size_t img_size_hint_x; - size_t img_size_hint_y; - +class image_generator { + public: + // initializes the image generator + image_generator(size_t img_size_hint_x, size_t img_size_hint_y, + size_t num_parallel_kernels, bool verbose, + std::vector devices = + std::vector()); + + // destructor + ~image_generator(); + + // waits for the worker to finish + void shutdown(); + + // adds a worker + void add_worker(hpx::opencl::device& device, size_t num_parallel_kernels); + + // waits for the worker to finish initialization + void wait_for_startup_finished(); + + // computes an image + hpx::lcos::future>> compute_image( + double pos_x, double pos_y, double zoom, double rotation, + size_t img_width, size_t img_height); + + // computes an image, enhanced version + hpx::lcos::future>> compute_image( + double pos_x, double pos_y, double zoom, double rotation, + size_t img_width, size_t img_height, + bool benchmark, // purges output + size_t tile_width, size_t tile_height); + + private: + // the main worker function, runs the main work loop + static void retrieve_worker_main(intptr_t parent_, bool verbose); + + // private attributes + private: + hpx::lcos::shared_future retrievers_finished; + std::shared_ptr>> workqueue; + std::shared_ptr>> workers; + hpx::lcos::local::spinlock images_lock; + + typedef std::map>> image_data_map; + typedef std::map>> + image_countdown_map; + typedef std::map> + image_ready_map; + image_data_map images; + image_countdown_map images_countdown; + image_ready_map images_ready; + + std::atomic next_image_id; + bool verbose; + + size_t img_size_hint_x; + size_t img_size_hint_y; }; #endif - diff --git a/examples/opencl/mandelbrot/main.cpp b/examples/opencl/mandelbrot/main.cpp index 6f0bcc13..1e8d916f 100644 --- a/examples/opencl/mandelbrot/main.cpp +++ b/examples/opencl/mandelbrot/main.cpp @@ -18,311 +18,236 @@ #include #include +void grainsize_bench(std::vector devices, size_t img_x, + size_t img_y, size_t num_iterations, size_t num_kernels, + bool verbose) { + // default benchmark image + double posx = -0.743643887037151; + double posy = 0.131825904205330; + double zoom = 6.2426215349789484160e10; + + hpx::cerr << "Starting in benchmark mode." << hpx::endl; + + // create image generator without gpus + image_generator img_gen(1, 1, num_kernels, verbose, devices); + + img_gen.wait_for_startup_finished(); + + // iterate through all configurations + for (size_t grainsize = 256; grainsize <= img_x * img_y; grainsize *= 2) { + hpx::cerr << "Starting test with grainsize " << grainsize << " ..." + << hpx::endl; + + // calculate tile size + size_t tilesize_x, tilesize_y; + if (grainsize < img_x) { + tilesize_y = 1; + tilesize_x = grainsize; + } else { + tilesize_x = img_x; + tilesize_y = grainsize / img_x; + } + hpx::cerr << "Using tilesizes: " << tilesize_x << "x" << tilesize_y + << hpx::endl; + + // initialize timer + double total_time = 0.0; + + // main benchmark loop + for (size_t i = 0; i < num_iterations + 1; i++) { + if (i == 0) { + hpx::cerr << "Warmup iteration ..." << hpx::endl; + } + if (i >= 1) { + hpx::cerr << "Starting benchmark iteration " << i << "/" + << num_iterations << " ..." << hpx::endl; + + // start time measurement + timer_start(); + } + + img_gen + .compute_image(posx, posy, zoom, 0.0, img_x, img_y, true, tilesize_x, + tilesize_y) + .get(); + + if (i >= 1) { + // measure time + total_time += timer_stop(); + } + } -void grainsize_bench(std::vector devices, - size_t img_x, size_t img_y, - size_t num_iterations, size_t num_kernels, bool verbose) -{ - - // default benchmark image - double posx = -0.743643887037151; - double posy = 0.131825904205330; - double zoom = 6.2426215349789484160e10; - - hpx::cerr << "Starting in benchmark mode." << hpx::endl; - - // create image generator without gpus - image_generator img_gen(1, 1, num_kernels, verbose, devices); + // calculate average time + double time = total_time / (double)num_iterations; - img_gen.wait_for_startup_finished(); + hpx::cerr << "Time: " << time << " ms" << hpx::endl; + hpx::cout << grainsize << "\t" << time << hpx::endl; + } - // iterate through all configurations - for(size_t grainsize = 256; grainsize <= img_x*img_y; grainsize *= 2) - { - hpx::cerr << "Starting test with grainsize " << grainsize << " ..." - << hpx::endl; + hpx::cerr << "Done." << hpx::endl; + img_gen.shutdown(); +} - // calculate tile size - size_t tilesize_x, tilesize_y; - if(grainsize < img_x) - { - tilesize_y = 1; - tilesize_x = grainsize; - } - else - { - tilesize_x = img_x; - tilesize_y = grainsize/img_x; - } +void speedup_bench(std::vector devices, size_t tilesize_x, + size_t tilesize_y, size_t img_x, size_t img_y, + size_t num_iterations, size_t num_kernels, bool verbose) { + // default benchmark image + double posx = -0.743643887037151; + double posy = 0.131825904205330; + double zoom = 6.2426215349789484160e10; - hpx::cerr << "Using tilesizes: " << tilesize_x << "x" - << tilesize_y << hpx::endl; + hpx::cerr << "Starting in benchmark mode." << hpx::endl; - // initialize timer - double total_time = 0.0; + // create image generator without gpus + image_generator img_gen(tilesize_x, tilesize_y, num_kernels, verbose); - // main benchmark loop - for(size_t i = 0; i < num_iterations + 1; i++) - { - if(i == 0) - { - hpx::cerr << "Warmup iteration ..." << hpx::endl; - } - if(i >= 1) - { - hpx::cerr << "Starting benchmark iteration " << i << "/" - << num_iterations << " ..." << hpx::endl; + // save the time for single-gpu + double single_gpu_time = 1.0f; - // start time measurement - timer_start(); - } + for (size_t num_gpus = 1; num_gpus <= devices.size(); num_gpus++) { + hpx::cerr << "Starting test with " << num_gpus << " gpus ..." << hpx::endl; - img_gen.compute_image(posx, - posy, - zoom, - 0.0, - img_x, - img_y, - true, - tilesize_x, - tilesize_y).get(); + // Add another worker + if (verbose) { + hpx::cerr << "adding worker ..." << hpx::endl; + } + img_gen.add_worker(devices[devices.size() - num_gpus], 4); - if(i >= 1) - { + // Wait for the worker to initialize + if (verbose) { + hpx::cerr << "waiting for worker to finish startup ..." << hpx::endl; + } + img_gen.wait_for_startup_finished(); + + // main benchmark loop + for (size_t i = 0; i < num_iterations + 1; i++) { + // start timer after first iteration (warmup iteration) + if (i == 1) timer_start(); + img_gen + .compute_image(posx, posy, zoom, 0.0, img_x, img_y, true, tilesize_x, + tilesize_y) + .get(); + } - // measure time - total_time += timer_stop(); + // stop timer + double time = timer_stop(); + time = time / (double)num_iterations - 1; - } - } + // save time if we only have one gpu + if (num_gpus == 1) single_gpu_time = time; - // calculate average time - double time = total_time / (double)num_iterations; + hpx::cerr << "Time: " << time << " ms" << hpx::endl; + hpx::cout << num_gpus << "\t" << time << "\t" << (single_gpu_time / time) + << "\t" << ((single_gpu_time / time) / (double)num_gpus) + << hpx::endl; + } - hpx::cerr << "Time: " << time << " ms" << hpx::endl; - hpx::cout << grainsize - << "\t" << time - << hpx::endl; + hpx::cerr << "Done." << hpx::endl; + img_gen.shutdown(); +} - } +int hpx_main(boost::program_options::variables_map& vm) { + std::size_t num_kernels = 0; + bool verbose = false; + bool benchmark = false; + + // Print help message on wrong argument count + if (vm.count("num-parallel-kernels")) + num_kernels = vm["num-parallel-kernels"].as(); + if (vm.count("v")) verbose = true; + if (vm.count("bench")) benchmark = true; + + // The main scope + { + // get all devices + std::vector devices = + hpx::opencl::create_all_devices( + CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, "OpenCL 1.1") + .get(); + + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No OpenCL devices found!" << hpx::endl; + return hpx::finalize(); + } else { + hpx::cerr << devices.size() << " OpenCL devices found!" << hpx::endl; + } - hpx::cerr << "Done." << hpx::endl; - img_gen.shutdown(); + // double posx = -0.7; + // double posy = 0.0; + // double zoom = 1.04; + ////double zoom = 0.05658352842407526628; + double posx = -0.743643887037151; + double posy = 0.131825904205330; + double zoom = 6.2426215349789484160e10; + // double zoom = 35.8603219463046942295; + size_t img_x = 3840; + size_t img_y = 2160; + if (!benchmark) { + // create image_generator + image_generator img_gen(img_x, 8, num_kernels, verbose, devices); -} + // wait for workers to finish initialization + if (verbose) + hpx::cout << "waiting for workers to finish startup ..." << hpx::endl; + img_gen.wait_for_startup_finished(); -void speedup_bench(std::vector devices, - size_t tilesize_x, size_t tilesize_y, size_t img_x, size_t img_y, - size_t num_iterations, size_t num_kernels, bool verbose) -{ - - // default benchmark image - double posx = -0.743643887037151; - double posy = 0.131825904205330; - double zoom = 6.2426215349789484160e10; - - hpx::cerr << "Starting in benchmark mode." << hpx::endl; - - // create image generator without gpus - image_generator img_gen(tilesize_x, tilesize_y, num_kernels, verbose); - - // save the time for single-gpu - double single_gpu_time = 1.0f; - - for(size_t num_gpus = 1; num_gpus <= devices.size(); num_gpus++) - { - hpx::cerr << "Starting test with " << num_gpus << " gpus ..." - << hpx::endl; - - // Add another worker - if(verbose){ - hpx::cerr << "adding worker ..." - << hpx::endl; - } - img_gen.add_worker(devices[devices.size() - num_gpus], 4); - - // Wait for the worker to initialize - if(verbose){ - hpx::cerr << "waiting for worker to finish startup ..." - << hpx::endl; - } - img_gen.wait_for_startup_finished(); - - // main benchmark loop - for(size_t i = 0; i < num_iterations + 1; i++) - { - // start timer after first iteration (warmup iteration) - if(i == 1) timer_start(); - img_gen.compute_image(posx, - posy, - zoom, - 0.0, - img_x, - img_y, - true, - tilesize_x, - tilesize_y).get(); - } - - // stop timer - double time = timer_stop(); - time = time / (double)num_iterations - 1; - - // save time if we only have one gpu - if(num_gpus == 1) - single_gpu_time = time; - - hpx::cerr << "Time: " << time << " ms" << hpx::endl; - hpx::cout << num_gpus - << "\t" << time - << "\t" << (single_gpu_time / time) - << "\t" << ((single_gpu_time / time) / (double)num_gpus) - << hpx::endl; - - } - - hpx::cerr << "Done." << hpx::endl; - img_gen.shutdown(); + // start timer + timer_start(); + // queue image + std::shared_ptr> img_data = + img_gen + .compute_image(posx, posy, zoom, 0.0, img_x, img_y, false, img_x, + 4) + .get(); + // stop timer + double time = timer_stop(); + hpx::cout << "time: " << time << " ms" << hpx::endl; -} + // end the image generator + img_gen.shutdown(); + // save the png + save_png(img_data, img_x, img_y, "test.png"); -int hpx_main(boost::program_options::variables_map & vm) -{ - - std::size_t num_kernels = 0; - bool verbose = false; - bool benchmark = false; - - // Print help message on wrong argument count - if (vm.count("num-parallel-kernels")) - num_kernels = vm["num-parallel-kernels"].as(); - if (vm.count("v")) - verbose = true; - if (vm.count("bench")) - benchmark = true; - - // The main scope - { - - // get all devices - std::vector devices = - hpx::opencl::create_all_devices( - CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, - "OpenCL 1.1").get(); - - // Check whether there are any devices - if(devices.size() < 1) - { - hpx::cerr << "No OpenCL devices found!" << hpx::endl; - return hpx::finalize(); - } - else - { - hpx::cerr << devices.size() << " OpenCL devices found!" << hpx::endl; - } - - - //double posx = -0.7; - //double posy = 0.0; - //double zoom = 1.04; - ////double zoom = 0.05658352842407526628; - - double posx = -0.743643887037151; - double posy = 0.131825904205330; - double zoom = 6.2426215349789484160e10; - //double zoom = 35.8603219463046942295; - - size_t img_x = 3840; - size_t img_y = 2160; - - if(!benchmark) - { - // create image_generator - image_generator img_gen(img_x, 8, num_kernels, verbose, devices); - - // wait for workers to finish initialization - if(verbose) hpx::cout << "waiting for workers to finish startup ..." << hpx::endl; - img_gen.wait_for_startup_finished(); - - // start timer - timer_start(); - - // queue image - std::shared_ptr> img_data = - img_gen.compute_image(posx, - posy, - zoom, - 0.0, - img_x, - img_y, - false, - img_x, - 4).get(); - - // stop timer - double time = timer_stop(); - - hpx::cout << "time: " << time << " ms" << hpx::endl; - - // end the image generator - img_gen.shutdown(); - - // save the png - save_png(img_data, img_x, img_y, "test.png"); - - } else { - - /*speedup_bench(devices, - 3840, 8, - 3840, 2160, - 10, - 4, - verbose); + } else { + /*speedup_bench(devices, + 3840, 8, + 3840, 2160, + 10, + 4, + verbose); */ - grainsize_bench(devices, - 2048, 1024, - 10, - 4, - verbose); - } - + grainsize_bench(devices, 2048, 1024, 10, 4, verbose); } + } - if(verbose) hpx::cout << "Program finished." << hpx::endl; - - // End the program - return hpx::finalize(); + if (verbose) hpx::cout << "Program finished." << hpx::endl; + // End the program + return hpx::finalize(); } +////////////////////////////////////////////////////////////////////////////// +int main(int argc, char* argv[]) { + // Configure application-specific options + boost::program_options::options_description cmdline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + cmdline.add_options()( + "num-parallel-kernels", + boost::program_options::value()->default_value(4), + "the number of parallel kernel invocations per gpu"); + + cmdline.add_options()("v", "verbose output"); + cmdline.add_options()("bench", "runs benchmark"); -////////////////////////////////////////////////////////////////////////////// -int main(int argc, char* argv[]) -{ - // Configure application-specific options - boost::program_options::options_description cmdline( - "Usage: " HPX_APPLICATION_STRING " [options]"); - cmdline.add_options() - ( "num-parallel-kernels" - , boost::program_options::value()->default_value(4) - , "the number of parallel kernel invocations per gpu") ; - - cmdline.add_options() - ( "v" - , "verbose output") ; - - cmdline.add_options() - ( "bench" - , "runs benchmark") ; - - return hpx::init(cmdline, argc, argv); + return hpx::init(cmdline, argc, argv); } diff --git a/examples/opencl/mandelbrot/mandelbrotkernel.hpp b/examples/opencl/mandelbrot/mandelbrotkernel.hpp index e2650e69..d7891630 100644 --- a/examples/opencl/mandelbrot/mandelbrotkernel.hpp +++ b/examples/opencl/mandelbrot/mandelbrotkernel.hpp @@ -7,7 +7,6 @@ #define MANDELBROT_MANDELBROTKERNEL_H_ extern const char mandelbrotkernel_cl[]; -extern const unsigned long mandelbrotkernel_cl_len; +extern const unsigned long mandelbrotkernel_cl_len; #endif - diff --git a/examples/opencl/mandelbrot/mandelbrotworker.cpp b/examples/opencl/mandelbrot/mandelbrotworker.cpp index 8a27c06d..13fba327 100644 --- a/examples/opencl/mandelbrot/mandelbrotworker.cpp +++ b/examples/opencl/mandelbrot/mandelbrotworker.cpp @@ -17,323 +17,260 @@ static boost::atomic id_counter((unsigned int)0); mandelbrotworker::mandelbrotworker( - hpx::opencl::device device_, - size_t num_workers, - boost::function*)> - request_new_work_, - boost::function &)> - deliver_done_work_, - bool verbose_, - size_t workpacket_size_hint_x, - size_t workpacket_size_hint_y) + hpx::opencl::device device_, size_t num_workers, + boost::function*)> request_new_work_, + boost::function&)> deliver_done_work_, + bool verbose_, size_t workpacket_size_hint_x, size_t workpacket_size_hint_y) : verbose(verbose_), id(id_counter++), device(device_), worker_initialized(std::make_shared()), request_new_work(request_new_work_), - deliver_done_work(deliver_done_work_) -{ + deliver_done_work(deliver_done_work_) { + // start worker + worker_finished = + hpx::async(&mandelbrotworker::worker_starter, this, num_workers, + workpacket_size_hint_x, workpacket_size_hint_y); +} - // start worker - worker_finished = hpx::async(&mandelbrotworker::worker_starter, - this, - num_workers, - workpacket_size_hint_x, - workpacket_size_hint_y); +mandelbrotworker::~mandelbrotworker() { + // wait for the worker to finish + join(); +} +void mandelbrotworker::join() { + // wait for worker to finish + hpx::shared_future tmp = worker_finished; + tmp.wait(); } -mandelbrotworker::~mandelbrotworker() -{ +void mandelbrotworker::wait_for_startup_finished() { + // waits until the worker_starter triggers this event + worker_initialized->wait(); +} - // wait for the worker to finish - join(); +#define KERNEL_INPUT_ARGUMENT_COUNT 6 +size_t mandelbrotworker::worker_main(hpx::opencl::kernel precalc_kernel, + hpx::opencl::kernel kernel, + size_t workpacket_size_hint_x, + size_t workpacket_size_hint_y) { + // setup device memory management. + // initialize default buffer with size of numpixels * 3 (rgb) * sizeof(double) + mandelbrotworker_buffermanager buffermanager( + device, + workpacket_size_hint_x * workpacket_size_hint_y * 3 * sizeof(char), + verbose, CL_MEM_WRITE_ONLY); + + // initialize buffermanager for precalc buffer + mandelbrotworker_buffermanager precalc_buffermanager( + device, + (workpacket_size_hint_x + 2) * (workpacket_size_hint_y + 2) * + sizeof(char), + verbose, CL_MEM_READ_WRITE); + + // counts how much work has been done + size_t num_work = 0; + + // attach output buffer + size_t current_buffer_size = + workpacket_size_hint_x * workpacket_size_hint_y * 3 * sizeof(char); + hpx::opencl::buffer output_buffer = + buffermanager.get_buffer(current_buffer_size); + + // attach precalc buffer + size_t current_precalc_size = (workpacket_size_hint_x + 2) * + (workpacket_size_hint_y + 2) * sizeof(char); + hpx::opencl::buffer precalc_buffer = + precalc_buffermanager.get_buffer(current_precalc_size); + + // create input buffer + hpx::opencl::buffer input_buffer = device.create_buffer( + CL_MEM_READ_ONLY, KERNEL_INPUT_ARGUMENT_COUNT * sizeof(double)); + + // connect buffers to kernel + kernel.set_arg(0, precalc_buffer); + kernel.set_arg(1, output_buffer); + kernel.set_arg(2, input_buffer); + + // connect buffers to precalc kernel + precalc_kernel.set_arg(0, precalc_buffer); + precalc_kernel.set_arg(1, input_buffer); + + // main loop + std::shared_ptr next_workload; + hpx::opencl::work_size<2> dim; + dim[0].offset = 0; + dim[1].offset = 0; + dim[0].local_size = 8; + dim[1].local_size = 8; + + hpx::opencl::work_size<2> precalc_dim; + precalc_dim[0].offset = 0; + precalc_dim[1].offset = 0; + + while (request_new_work(&next_workload)) { + // calculate output buffer size + size_t needed_buffer_size = next_workload->num_pixels_x * + next_workload->num_pixels_y * 3 * sizeof(char); + + // change output buffer if needed buffersize changed + if (current_buffer_size != needed_buffer_size) { + // query new buffer + output_buffer = buffermanager.get_buffer(needed_buffer_size); + + // attach new buffer + kernel.set_arg(1, output_buffer); + + // update current buffer size + current_buffer_size = needed_buffer_size; + } -} + // calculate precalc buffer size + size_t needed_precalc_size = (next_workload->num_pixels_x + 2) * + (next_workload->num_pixels_y + 2) * + sizeof(char); -void -mandelbrotworker::join() -{ + // change precalc buffer if needed precalcsize changed + if (current_precalc_size != needed_precalc_size) { + // query new buffer + precalc_buffer = precalc_buffermanager.get_buffer(needed_precalc_size); - // wait for worker to finish - hpx::shared_future tmp = worker_finished; - tmp.wait(); + // attach new buffer + auto fut = kernel.set_arg_async(0, precalc_buffer); + precalc_kernel.set_arg(0, precalc_buffer); + fut.get(); -} + // update current buffer size + current_precalc_size = needed_precalc_size; + } -void -mandelbrotworker::wait_for_startup_finished() -{ + // read calculation dimensions + double args[KERNEL_INPUT_ARGUMENT_COUNT]; + args[0] = next_workload->topleft_x; + args[1] = next_workload->topleft_y; + args[2] = next_workload->hor_pixdist_x; + args[3] = next_workload->hor_pixdist_y; + args[4] = next_workload->vert_pixdist_x; + args[5] = next_workload->vert_pixdist_y; + typedef hpx::serialization::serialize_buffer double_buffer_type; + double_buffer_type args_buf(args, KERNEL_INPUT_ARGUMENT_COUNT, + double_buffer_type::init_mode::reference); - // waits until the worker_starter triggers this event - worker_initialized->wait(); + // send calculation dimensions to gpu + auto ev1 = input_buffer.enqueue_write(0, args_buf); -} + // run precalculation + precalc_dim[0].size = next_workload->num_pixels_x + 2; + precalc_dim[1].size = next_workload->num_pixels_y + 2; + auto ev2 = precalc_kernel.enqueue(precalc_dim, ev1); -#define KERNEL_INPUT_ARGUMENT_COUNT 6 -size_t -mandelbrotworker::worker_main( - hpx::opencl::kernel precalc_kernel, - hpx::opencl::kernel kernel, - size_t workpacket_size_hint_x, - size_t workpacket_size_hint_y - ) -{ - - // setup device memory management. - // initialize default buffer with size of numpixels * 3 (rgb) * sizeof(double) - mandelbrotworker_buffermanager buffermanager( - device, - workpacket_size_hint_x - * workpacket_size_hint_y - * 3 * sizeof(char), - verbose, - CL_MEM_WRITE_ONLY); - - // initialize buffermanager for precalc buffer - mandelbrotworker_buffermanager precalc_buffermanager( - device, - (workpacket_size_hint_x + 2) - * (workpacket_size_hint_y + 2) - * sizeof(char), - verbose, - CL_MEM_READ_WRITE); - - // counts how much work has been done - size_t num_work = 0; - - // attach output buffer - size_t current_buffer_size = workpacket_size_hint_x - * workpacket_size_hint_y - * 3 * sizeof(char); - hpx::opencl::buffer output_buffer = buffermanager.get_buffer( - current_buffer_size ); - - // attach precalc buffer - size_t current_precalc_size = (workpacket_size_hint_x + 2) - * (workpacket_size_hint_y + 2) - * sizeof(char); - hpx::opencl::buffer precalc_buffer = precalc_buffermanager.get_buffer( - current_precalc_size ); - - // create input buffer - hpx::opencl::buffer input_buffer = device.create_buffer( - CL_MEM_READ_ONLY, - KERNEL_INPUT_ARGUMENT_COUNT * sizeof(double)); - - // connect buffers to kernel - kernel.set_arg(0, precalc_buffer); - kernel.set_arg(1, output_buffer); - kernel.set_arg(2, input_buffer); - - // connect buffers to precalc kernel - precalc_kernel.set_arg(0, precalc_buffer); - precalc_kernel.set_arg(1, input_buffer); - - - // main loop - std::shared_ptr next_workload; - hpx::opencl::work_size<2> dim; - dim[0].offset = 0; - dim[1].offset = 0; - dim[0].local_size = 8; - dim[1].local_size = 8; - - hpx::opencl::work_size<2> precalc_dim; - precalc_dim[0].offset = 0; - precalc_dim[1].offset = 0; - - while(request_new_work(&next_workload)) - { - - // calculate output buffer size - size_t needed_buffer_size = next_workload->num_pixels_x - * next_workload->num_pixels_y - * 3 - * sizeof(char); - - - // change output buffer if needed buffersize changed - if (current_buffer_size != needed_buffer_size) - { - // query new buffer - output_buffer = buffermanager.get_buffer( needed_buffer_size ); - - // attach new buffer - kernel.set_arg(1, output_buffer); - - // update current buffer size - current_buffer_size = needed_buffer_size; - } - - // calculate precalc buffer size - size_t needed_precalc_size = (next_workload->num_pixels_x + 2) - * (next_workload->num_pixels_y + 2) - * sizeof(char); - - // change precalc buffer if needed precalcsize changed - if (current_precalc_size != needed_precalc_size) - { - // query new buffer - precalc_buffer = precalc_buffermanager.get_buffer( - needed_precalc_size ); - - // attach new buffer - auto fut = kernel.set_arg_async(0, precalc_buffer); - precalc_kernel.set_arg(0, precalc_buffer); - fut.get(); - - // update current buffer size - current_precalc_size = needed_precalc_size; - } - - // read calculation dimensions - double args[KERNEL_INPUT_ARGUMENT_COUNT]; - args[0] = next_workload->topleft_x; - args[1] = next_workload->topleft_y; - args[2] = next_workload->hor_pixdist_x; - args[3] = next_workload->hor_pixdist_y; - args[4] = next_workload->vert_pixdist_x; - args[5] = next_workload->vert_pixdist_y; - typedef hpx::serialization::serialize_buffer double_buffer_type; - double_buffer_type args_buf( args, KERNEL_INPUT_ARGUMENT_COUNT, - double_buffer_type::init_mode::reference ); - - // send calculation dimensions to gpu - auto ev1 = input_buffer.enqueue_write(0, args_buf); - - // run precalculation - precalc_dim[0].size = next_workload->num_pixels_x + 2; - precalc_dim[1].size = next_workload->num_pixels_y + 2; - auto ev2 = precalc_kernel.enqueue(precalc_dim, ev1); - - // run calculation - dim[0].size = next_workload->num_pixels_x * 8; - dim[1].size = next_workload->num_pixels_y * 8; - auto ev3 = kernel.enqueue(dim, ev2); - - // query calculation result - auto ev4 = output_buffer.enqueue_read(0, current_buffer_size, ev3); - - // wait for calculation result to arrive - hpx::serialization::serialize_buffer readdata = ev4.get(); - - // copy calculation result to output buffer - next_workload->pixeldata = readdata; + // run calculation + dim[0].size = next_workload->num_pixels_x * 8; + dim[1].size = next_workload->num_pixels_y * 8; + auto ev3 = kernel.enqueue(dim, ev2); - // return calculated workload to work manager workload - deliver_done_work(next_workload); - - // count number of workloads - num_work++; - - } + // query calculation result + auto ev4 = output_buffer.enqueue_read(0, current_buffer_size, ev3); - return num_work; + // wait for calculation result to arrive + hpx::serialization::serialize_buffer readdata = ev4.get(); -} + // copy calculation result to output buffer + next_workload->pixeldata = readdata; -void -mandelbrotworker::worker_starter( - size_t num_workers, - size_t workpacket_size_hint_x, - size_t workpacket_size_hint_y) -{ - - - try{ - - std::string device_vendor = device.get_device_info().get(); - std::string device_name = device.get_device_info().get(); - std::string device_version = device.get_device_info().get(); - - // print device name - hpx::cerr << "#" << id << ": " - << device_vendor << ": " - << device_name << " (" - << device_version << ")" - << hpx::endl; - - // build opencl program - typedef hpx::serialization::serialize_buffer char_buffer_type; - char_buffer_type mandelbrotkernel_buf - ( mandelbrotkernel_cl, mandelbrotkernel_cl_len, - char_buffer_type::init_mode::reference ); - - hpx::opencl::program mandelbrot_program = - device.create_program_with_source(mandelbrotkernel_buf); - if(verbose) - hpx::cout << "#" << id << ": " << "compiling" << hpx::endl; - mandelbrot_program.build(); - if(verbose) - hpx::cout << "#" << id << ": " << "compiling done." << hpx::endl; - - - // start workers - std::vector> worker_futures; - for(size_t i = 0; i < num_workers; i++) - { - - // create kernel - hpx::opencl::kernel kernel = - mandelbrot_program.create_kernel("mandelbrot_alias_8x8"); - - // create precalc kernel - hpx::opencl::kernel precalc_kernel = - mandelbrot_program.create_kernel("precompute_mandelbrot"); - - // start worker - hpx::lcos::future worker_future = - hpx::async(&mandelbrotworker::worker_main, - this, - precalc_kernel, - kernel, - workpacket_size_hint_x, - workpacket_size_hint_y); - - // add worker to workerlist - worker_futures.push_back(std::move(worker_future)); - - } - - if(verbose) - hpx::cout << "#" << id << ": " << "workers started!" << hpx::endl; - - // trigger event to start main function. - // needed for accurate time measurement - worker_initialized->set(); - - // wait for workers to finish - size_t num_work = 0; - for(size_t i = 0; i < num_workers; i++) - { - // finish worker and get number of computed work packets - size_t num_work_single = worker_futures[i].get(); - - // count total work packets - num_work += num_work_single; - } - - if(verbose) - { - hpx::cout << "#" << id << ": " << "workers finished! (" - << num_work << " work packets)" << hpx::endl; - } - - } catch(hpx::exception const& e) { - - // write error message. workaround, should not be done like this in - // real application - hpx::cout << "#" << id << ": " - << "ERROR!" << hpx::endl - << hpx::get_error_backtrace(e) << hpx::endl - << hpx::diagnostic_information(e) << hpx::endl; - - // kill the process. again, not to be done like this in real application. - exit(1); + // return calculated workload to work manager workload + deliver_done_work(next_workload); - } + // count number of workloads + num_work++; + } + return num_work; } +void mandelbrotworker::worker_starter(size_t num_workers, + size_t workpacket_size_hint_x, + size_t workpacket_size_hint_y) { + try { + std::string device_vendor = + device.get_device_info().get(); + std::string device_name = device.get_device_info().get(); + std::string device_version = + device.get_device_info().get(); + + // print device name + hpx::cerr << "#" << id << ": " << device_vendor << ": " << device_name + << " (" << device_version << ")" << hpx::endl; + + // build opencl program + typedef hpx::serialization::serialize_buffer char_buffer_type; + char_buffer_type mandelbrotkernel_buf( + mandelbrotkernel_cl, mandelbrotkernel_cl_len, + char_buffer_type::init_mode::reference); + + hpx::opencl::program mandelbrot_program = + device.create_program_with_source(mandelbrotkernel_buf); + if (verbose) + hpx::cout << "#" << id << ": " + << "compiling" << hpx::endl; + mandelbrot_program.build(); + if (verbose) + hpx::cout << "#" << id << ": " + << "compiling done." << hpx::endl; + + // start workers + std::vector> worker_futures; + for (size_t i = 0; i < num_workers; i++) { + // create kernel + hpx::opencl::kernel kernel = + mandelbrot_program.create_kernel("mandelbrot_alias_8x8"); + + // create precalc kernel + hpx::opencl::kernel precalc_kernel = + mandelbrot_program.create_kernel("precompute_mandelbrot"); + + // start worker + hpx::lcos::future worker_future = + hpx::async(&mandelbrotworker::worker_main, this, precalc_kernel, + kernel, workpacket_size_hint_x, workpacket_size_hint_y); + + // add worker to workerlist + worker_futures.push_back(std::move(worker_future)); + } + + if (verbose) + hpx::cout << "#" << id << ": " + << "workers started!" << hpx::endl; + + // trigger event to start main function. + // needed for accurate time measurement + worker_initialized->set(); + + // wait for workers to finish + size_t num_work = 0; + for (size_t i = 0; i < num_workers; i++) { + // finish worker and get number of computed work packets + size_t num_work_single = worker_futures[i].get(); + // count total work packets + num_work += num_work_single; + } + + if (verbose) { + hpx::cout << "#" << id << ": " + << "workers finished! (" << num_work << " work packets)" + << hpx::endl; + } + + } catch (hpx::exception const& e) { + // write error message. workaround, should not be done like this in + // real application + hpx::cout << "#" << id << ": " + << "ERROR!" << hpx::endl + << hpx::get_error_backtrace(e) << hpx::endl + << hpx::diagnostic_information(e) << hpx::endl; + + // kill the process. again, not to be done like this in real application. + exit(1); + } +} diff --git a/examples/opencl/mandelbrot/mandelbrotworker.hpp b/examples/opencl/mandelbrot/mandelbrotworker.hpp index cd1933b0..dd0a684a 100644 --- a/examples/opencl/mandelbrot/mandelbrotworker.hpp +++ b/examples/opencl/mandelbrot/mandelbrotworker.hpp @@ -23,56 +23,44 @@ * will ask the workqueue for new work until the workqueue finishes. * this is the only class that actually uses the hpxcl. */ -class mandelbrotworker -{ - - public: - // initializes the worker - mandelbrotworker(hpx::opencl::device device_, - size_t num_workers, - boost::function*)> - request_new_work, - boost::function&)> - deliver_done_work, - bool verbose, - size_t workpacket_size_hint_x, - size_t workpacket_size_hint_y); - - // waits for the worker to finish - void join(); - - // waits for the worker to finish initialization - void wait_for_startup_finished(); - - // destructor, basically waits for the worker to finish - ~mandelbrotworker(); - - private: - // the main worker function, runs the main work loop - size_t worker_main( - hpx::opencl::kernel precalc_kernel, - hpx::opencl::kernel kernel, - size_t workpacket_size_hint_x, - size_t workpacket_size_hint_y - ); - - // the startup function, initializes the kernel and starts the workers - void worker_starter( - size_t num_workers, - size_t workpacket_size_hint_x, - size_t workpacket_size_hint_y - ); - - // private attributes - private: - const bool verbose; - const unsigned int id; - hpx::opencl::device device; - hpx::lcos::shared_future worker_finished; - std::shared_ptr worker_initialized; - boost::function*)> request_new_work; - boost::function&)> deliver_done_work; +class mandelbrotworker { + public: + // initializes the worker + mandelbrotworker( + hpx::opencl::device device_, size_t num_workers, + boost::function*)> request_new_work, + boost::function&)> deliver_done_work, + bool verbose, size_t workpacket_size_hint_x, + size_t workpacket_size_hint_y); + + // waits for the worker to finish + void join(); + + // waits for the worker to finish initialization + void wait_for_startup_finished(); + + // destructor, basically waits for the worker to finish + ~mandelbrotworker(); + + private: + // the main worker function, runs the main work loop + size_t worker_main(hpx::opencl::kernel precalc_kernel, + hpx::opencl::kernel kernel, size_t workpacket_size_hint_x, + size_t workpacket_size_hint_y); + + // the startup function, initializes the kernel and starts the workers + void worker_starter(size_t num_workers, size_t workpacket_size_hint_x, + size_t workpacket_size_hint_y); + + // private attributes + private: + const bool verbose; + const unsigned int id; + hpx::opencl::device device; + hpx::lcos::shared_future worker_finished; + std::shared_ptr worker_initialized; + boost::function*)> request_new_work; + boost::function&)> deliver_done_work; }; #endif - diff --git a/examples/opencl/mandelbrot/mandelbrotworker_buffermanager.cpp b/examples/opencl/mandelbrot/mandelbrotworker_buffermanager.cpp index 49cb7840..a25770e5 100644 --- a/examples/opencl/mandelbrot/mandelbrotworker_buffermanager.cpp +++ b/examples/opencl/mandelbrot/mandelbrotworker_buffermanager.cpp @@ -3,64 +3,44 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "mandelbrotworker_buffermanager.hpp" -mandelbrotworker_buffermanager:: -mandelbrotworker_buffermanager(hpx::opencl::device device_, - size_t initial_buffer_size, - bool verbose_, - cl_mem_flags memflags_) - : device(device_), verbose(verbose_), memflags(memflags_) -{ - - // allocate the initial buffer, to improve runtime speed - allocate_buffer(initial_buffer_size); - +mandelbrotworker_buffermanager::mandelbrotworker_buffermanager( + hpx::opencl::device device_, size_t initial_buffer_size, bool verbose_, + cl_mem_flags memflags_) + : device(device_), verbose(verbose_), memflags(memflags_) { + // allocate the initial buffer, to improve runtime speed + allocate_buffer(initial_buffer_size); } -hpx::opencl::buffer -mandelbrotworker_buffermanager:: -get_buffer(size_t size) -{ - - // search for an already allocated buffer of the correct size - buffer_map_type::iterator it = buffers.find(size); +hpx::opencl::buffer mandelbrotworker_buffermanager::get_buffer(size_t size) { + // search for an already allocated buffer of the correct size + buffer_map_type::iterator it = buffers.find(size); - // if no buffer is found, allocate a new one - if(it == buffers.end()) - { - allocate_buffer(size); - it = buffers.find(size); - } + // if no buffer is found, allocate a new one + if (it == buffers.end()) { + allocate_buffer(size); + it = buffers.find(size); + } - // make sure that we now have a buffer - BOOST_ASSERT(it != buffers.end()); - - // return the buffer - return it->second; + // make sure that we now have a buffer + BOOST_ASSERT(it != buffers.end()); + // return the buffer + return it->second; } +void mandelbrotworker_buffermanager::allocate_buffer(size_t size) { + if (verbose) + hpx::cout << "allocating opencl buffer of size " << size << " bytes ..." + << hpx::endl; + // make sure no buffer of the given size already exists + BOOST_ASSERT(buffers.find(size) == buffers.end()); -void -mandelbrotworker_buffermanager:: -allocate_buffer(size_t size) -{ - - if(verbose) hpx::cout << "allocating opencl buffer of size " - << size << " bytes ..." << hpx::endl; - - // make sure no buffer of the given size already exists - BOOST_ASSERT(buffers.find(size) == buffers.end()); - - // allocate a buffer - hpx::opencl::buffer new_buffer = - device.create_buffer(memflags, size); - - // add the buffer to the map - buffers.insert( std::pair(size, new_buffer) ); + // allocate a buffer + hpx::opencl::buffer new_buffer = device.create_buffer(memflags, size); + // add the buffer to the map + buffers.insert(std::pair(size, new_buffer)); } - diff --git a/examples/opencl/mandelbrot/mandelbrotworker_buffermanager.hpp b/examples/opencl/mandelbrot/mandelbrotworker_buffermanager.hpp index fdedaefc..4c49d5f1 100644 --- a/examples/opencl/mandelbrot/mandelbrotworker_buffermanager.hpp +++ b/examples/opencl/mandelbrot/mandelbrotworker_buffermanager.hpp @@ -10,38 +10,32 @@ #include -/* +/* * a worker. * will ask the workqueue for new work until the workqueue finishes. * this is the only class that actually uses the hpxcl. */ -class mandelbrotworker_buffermanager -{ - - public: - // initializes the buffermanager - mandelbrotworker_buffermanager(hpx::opencl::device device_, - size_t initial_buffer_size, - bool verbose, - cl_mem_flags memflags); - - // get a buffer - hpx::opencl::buffer - get_buffer(size_t buffersize); - - // private functions - private: - void allocate_buffer(size_t size); - - - // private attributes - private: - hpx::opencl::device device; - typedef std::map buffer_map_type; - buffer_map_type buffers; - bool verbose; - cl_mem_flags memflags; +class mandelbrotworker_buffermanager { + public: + // initializes the buffermanager + mandelbrotworker_buffermanager(hpx::opencl::device device_, + size_t initial_buffer_size, bool verbose, + cl_mem_flags memflags); + + // get a buffer + hpx::opencl::buffer get_buffer(size_t buffersize); + + // private functions + private: + void allocate_buffer(size_t size); + + // private attributes + private: + hpx::opencl::device device; + typedef std::map buffer_map_type; + buffer_map_type buffers; + bool verbose; + cl_mem_flags memflags; }; #endif - diff --git a/examples/opencl/mandelbrot/maps/main_maps.cpp b/examples/opencl/mandelbrot/maps/main_maps.cpp index 2a4cdcc0..b77f40b6 100644 --- a/examples/opencl/mandelbrot/maps/main_maps.cpp +++ b/examples/opencl/mandelbrot/maps/main_maps.cpp @@ -17,110 +17,88 @@ #include "webserver.hpp" //#include "../maps_webserver.hpp" - #include #include -int hpx_main(boost::program_options::variables_map & vm) -{ - - std::size_t num_kernels = 0; - bool verbose = false; - - // Print help message on wrong argument count - if (vm.count("num-parallel-kernels")) - num_kernels = vm["num-parallel-kernels"].as(); - if (vm.count("v")) - verbose = true; - - // The main scope - { - - // get all devices - std::vector devices = - hpx::opencl::create_all_devices( - CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, - "OpenCL 1.1").get(); - - // Check whether there are any devices - if(devices.size() < 1) - { - hpx::cerr << "No OpenCL devices found!" << hpx::endl; - return hpx::finalize(); - } - else - { - hpx::cout << devices.size() << " OpenCL devices found!" << hpx::endl; - } - - - size_t tilesize_x = 256; - size_t tilesize_y = 256; - size_t lines_per_gpu = 32; - - // generate requesthandler, will order requests and convert coordinates - hpx::opencl::examples::mandelbrot::requesthandler requesthandler( - tilesize_x, - tilesize_y, - lines_per_gpu); - - - // create image_generator - hpx::opencl::examples::mandelbrot::maps_image_generator - img_gen(tilesize_x, - lines_per_gpu, - num_kernels, - verbose, - boost::bind( - &hpx::opencl::examples::mandelbrot::requesthandler::query_request, - &requesthandler), - devices); - - // wait for workers to finish initialization - if(verbose) hpx::cout << "waiting for workers to finish startup ..." << hpx::endl; - img_gen.wait_for_startup_finished(); - - hpx::cout << "Starting webservers ..." << hpx::endl; - - // generate webserver - hpx::opencl::examples::mandelbrot::webserver webserver(8080, - &requesthandler); - - // start the webserver - webserver.start(); - - while(true) - { - hpx::this_thread::sleep_for(std::chrono::milliseconds(1000)); - } - - webserver.stop(); - +int hpx_main(boost::program_options::variables_map& vm) { + std::size_t num_kernels = 0; + bool verbose = false; + + // Print help message on wrong argument count + if (vm.count("num-parallel-kernels")) + num_kernels = vm["num-parallel-kernels"].as(); + if (vm.count("v")) verbose = true; + + // The main scope + { + // get all devices + std::vector devices = + hpx::opencl::create_all_devices( + CL_DEVICE_TYPE_GPU | CL_DEVICE_TYPE_ACCELERATOR, "OpenCL 1.1") + .get(); + + // Check whether there are any devices + if (devices.size() < 1) { + hpx::cerr << "No OpenCL devices found!" << hpx::endl; + return hpx::finalize(); + } else { + hpx::cout << devices.size() << " OpenCL devices found!" << hpx::endl; } - if(verbose) hpx::cout << "Program finished." << hpx::endl; + size_t tilesize_x = 256; + size_t tilesize_y = 256; + size_t lines_per_gpu = 32; - // End the program - return hpx::finalize(); + // generate requesthandler, will order requests and convert coordinates + hpx::opencl::examples::mandelbrot::requesthandler requesthandler( + tilesize_x, tilesize_y, lines_per_gpu); -} + // create image_generator + hpx::opencl::examples::mandelbrot::maps_image_generator img_gen( + tilesize_x, lines_per_gpu, num_kernels, verbose, + boost::bind( + &hpx::opencl::examples::mandelbrot::requesthandler::query_request, + &requesthandler), + devices); + + // wait for workers to finish initialization + if (verbose) + hpx::cout << "waiting for workers to finish startup ..." << hpx::endl; + img_gen.wait_for_startup_finished(); + + hpx::cout << "Starting webservers ..." << hpx::endl; + + // generate webserver + hpx::opencl::examples::mandelbrot::webserver webserver(8080, + &requesthandler); + // start the webserver + webserver.start(); + while (true) { + hpx::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + + webserver.stop(); + } + + if (verbose) hpx::cout << "Program finished." << hpx::endl; + + // End the program + return hpx::finalize(); +} ////////////////////////////////////////////////////////////////////////////// -int main(int argc, char* argv[]) -{ - // Configure application-specific options - boost::program_options::options_description cmdline( - "Usage: " HPX_APPLICATION_STRING " [options]"); - cmdline.add_options() - ( "num-parallel-kernels" - , boost::program_options::value()->default_value(3) - , "the number of parallel kernel invocations per gpu") ; - - cmdline.add_options() - ( "v" - , "verbose output") ; - - return hpx::init(cmdline, argc, argv); +int main(int argc, char* argv[]) { + // Configure application-specific options + boost::program_options::options_description cmdline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + cmdline.add_options()( + "num-parallel-kernels", + boost::program_options::value()->default_value(3), + "the number of parallel kernel invocations per gpu"); + + cmdline.add_options()("v", "verbose output"); + + return hpx::init(cmdline, argc, argv); } diff --git a/examples/opencl/mandelbrot/maps/maps_image_generator.cpp b/examples/opencl/mandelbrot/maps/maps_image_generator.cpp index 9517d259..5215de4b 100644 --- a/examples/opencl/mandelbrot/maps/maps_image_generator.cpp +++ b/examples/opencl/mandelbrot/maps/maps_image_generator.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "maps_image_generator.hpp" #include @@ -17,512 +16,388 @@ #include "../pngwriter.hpp" #include "requesthandler.hpp" - using namespace hpx::opencl::examples::mandelbrot; -maps_image_generator:: -maps_image_generator(size_t img_size_hint_x_, - size_t img_size_hint_y_, - size_t num_parallel_kernels, - bool verbose_, - boost::function(void)> - acquire_new_request_, - std::vector devices) - : next_image_id(0), verbose(verbose_), - img_size_hint_x(img_size_hint_x_), - img_size_hint_y(img_size_hint_y_), - acquire_new_request(acquire_new_request_), - shutdown_requested(false) -{ - - // one retrieve worker for every os thread - size_t num_retrieve_workers = hpx::get_os_thread_count(); - - // starting workers - for( auto& device : devices) - { - - // add a worker - add_worker(device, num_parallel_kernels); - - } - - // starting retrievers - std::vector> retriever_futures; - for(size_t i = 0; i < num_retrieve_workers; i++) - { - - hpx::lcos::future retriever_future = - hpx::async(retrieve_worker_main, - (intptr_t) this, - verbose); - - retriever_futures.push_back(std::move(retriever_future)); - - } - - // combining all retrievers into one future - retrievers_finished = hpx::when_all(retriever_futures).share(); - - // start the first image fetch - start_getting_new_image(); +maps_image_generator::maps_image_generator( + size_t img_size_hint_x_, size_t img_size_hint_y_, + size_t num_parallel_kernels, bool verbose_, + boost::function(void)> acquire_new_request_, + std::vector devices) + : next_image_id(0), + verbose(verbose_), + img_size_hint_x(img_size_hint_x_), + img_size_hint_y(img_size_hint_y_), + acquire_new_request(acquire_new_request_), + shutdown_requested(false) { + // one retrieve worker for every os thread + size_t num_retrieve_workers = hpx::get_os_thread_count(); + + // starting workers + for (auto& device : devices) { + // add a worker + add_worker(device, num_parallel_kernels); + } + + // starting retrievers + std::vector> retriever_futures; + for (size_t i = 0; i < num_retrieve_workers; i++) { + hpx::lcos::future retriever_future = + hpx::async(retrieve_worker_main, (intptr_t)this, verbose); + + retriever_futures.push_back(std::move(retriever_future)); + } + + // combining all retrievers into one future + retrievers_finished = hpx::when_all(retriever_futures).share(); + + // start the first image fetch + start_getting_new_image(); } - -maps_image_generator:: -~maps_image_generator() -{ - - // wait for work to get finished - shutdown(); - +maps_image_generator::~maps_image_generator() { + // wait for work to get finished + shutdown(); } // current_request_lock must be locked BEFORE entering this function! -void -maps_image_generator:: -dispose_current_request_if_invalid() -{ - - if(current_request) - { - if(!current_request->stillValid()) - { - current_request->abort(); - - { - // lock the data list - boost::lock_guard - lock2(images_lock); - - // insert new request to images list - images.erase(current_request_id); - } +void maps_image_generator::dispose_current_request_if_invalid() { + if (current_request) { + if (!current_request->stillValid()) { + current_request->abort(); - current_request = std::shared_ptr(); - - start_getting_new_image(); - } - } - - -} - -bool -maps_image_generator:: -worker_request_new_work(std::shared_ptr* new_work) -{ - - // lock - boost::lock_guard - lock(current_request_lock); - - if(shutdown_requested) return false; - - if(verbose) hpx::cout << "started new work request" << hpx::endl; - - // test if current request is still valid - dispose_current_request_if_invalid(); - - // wait for new request if necessary - while(!current_request) - { - - if(verbose) hpx::cout << "no new work. waiting ..." << hpx::endl; - - new_request_available.wait(current_request_lock); - if(shutdown_requested) return false; + { + // lock the data list + boost::lock_guard lock2(images_lock); - dispose_current_request_if_invalid(); - if(verbose) hpx::cout << "new work! trying again ..." << hpx::endl; - } + // insert new request to images list + images.erase(current_request_id); + } - if(verbose) hpx::cout << "new work aquired. calculating new workload ..." << hpx::endl; - - // calculate current coords - double workpacket_pos_x = current_topleft_x - + current_vert_pixdist_x * current_img_pos * - current_request->lines_per_gpu; - double workpacket_pos_y = current_topleft_y - + current_vert_pixdist_y * current_img_pos * - current_request->lines_per_gpu; - - // TODO calculate new workload - *new_work = std::make_shared( - current_request->tilesize_x, - current_request->lines_per_gpu, - workpacket_pos_x, - workpacket_pos_y, - current_hor_pixdist_x, - current_hor_pixdist_y, - current_vert_pixdist_x, - current_vert_pixdist_y, - current_request_id, - 0, - current_img_pos*current_request->lines_per_gpu, - current_request->tilesize_x); - - // set the next position in image - current_img_pos++; - - // delete workload if we are the last bit - if(current_img_pos * current_request->lines_per_gpu >= - current_request->tilesize_y) - { - current_request = std::shared_ptr(); + current_request = std::shared_ptr(); - // fetch new image - start_getting_new_image(); + start_getting_new_image(); } - - return true; - + } } -void -maps_image_generator:: -start_getting_new_image() -{ +bool maps_image_generator::worker_request_new_work( + std::shared_ptr* new_work) { + // lock + boost::lock_guard lock(current_request_lock); - hpx::async(&maps_image_generator::get_new_image, this).then( - hpx::util::bind( + if (shutdown_requested) return false; - // if no new image can be fetched, shut down generator - [] ( - maps_image_generator* img_gen, - hpx::lcos::future parent_future - ) { + if (verbose) hpx::cout << "started new work request" << hpx::endl; - if(!parent_future.get()) - img_gen->shutdown(); + // test if current request is still valid + dispose_current_request_if_invalid(); - }, + // wait for new request if necessary + while (!current_request) { + if (verbose) hpx::cout << "no new work. waiting ..." << hpx::endl; - this, - hpx::util::placeholders::_1 + new_request_available.wait(current_request_lock); + if (shutdown_requested) return false; - )); + dispose_current_request_if_invalid(); + if (verbose) hpx::cout << "new work! trying again ..." << hpx::endl; + } + + if (verbose) + hpx::cout << "new work aquired. calculating new workload ..." << hpx::endl; + + // calculate current coords + double workpacket_pos_x = + current_topleft_x + + current_vert_pixdist_x * current_img_pos * current_request->lines_per_gpu; + double workpacket_pos_y = + current_topleft_y + + current_vert_pixdist_y * current_img_pos * current_request->lines_per_gpu; + + // TODO calculate new workload + *new_work = std::make_shared( + current_request->tilesize_x, current_request->lines_per_gpu, + workpacket_pos_x, workpacket_pos_y, current_hor_pixdist_x, + current_hor_pixdist_y, current_vert_pixdist_x, current_vert_pixdist_y, + current_request_id, 0, current_img_pos * current_request->lines_per_gpu, + current_request->tilesize_x); + + // set the next position in image + current_img_pos++; + + // delete workload if we are the last bit + if (current_img_pos * current_request->lines_per_gpu >= + current_request->tilesize_y) { + current_request = std::shared_ptr(); + + // fetch new image + start_getting_new_image(); + } + return true; } -void -maps_image_generator:: -worker_deliver(std::shared_ptr& done_work) -{ +void maps_image_generator::start_getting_new_image() { + hpx::async(&maps_image_generator::get_new_image, this) + .then(hpx::util::bind( + // if no new image can be fetched, shut down generator + [](maps_image_generator* img_gen, + hpx::lcos::future parent_future) { + if (!parent_future.get()) img_gen->shutdown(); + }, - if(verbose) hpx::cout << "got delivery from worker." << hpx::endl; - done_work_queue.push(done_work); - if(verbose) hpx::cout << "finished delivery from worker." << hpx::endl; + this, hpx::util::placeholders::_1 + )); } -void -maps_image_generator:: -add_worker(hpx::opencl::device & device, size_t num_parallel_kernels) -{ - - // create request callback function for worker - boost::function*)> request_new_work = - boost::bind(&maps_image_generator::worker_request_new_work, - this, - _1); - - // create deliver callback function for worker - boost::function&)> deliver_done_work = - boost::bind(&maps_image_generator::worker_deliver, - this, - _1); - - - // create worker - std::shared_ptr worker = - std::make_shared - (device, - num_parallel_kernels, - request_new_work, - deliver_done_work, - verbose, - img_size_hint_x, - img_size_hint_y); - - // add worker to workerlist - workers.push_back(worker); - +void maps_image_generator::worker_deliver( + std::shared_ptr& done_work) { + if (verbose) hpx::cout << "got delivery from worker." << hpx::endl; + done_work_queue.push(done_work); + if (verbose) hpx::cout << "finished delivery from worker." << hpx::endl; } -bool -maps_image_generator:: -get_new_image() -{ - - // lock the access tu the current request - boost::lock_guard - lock(current_request_lock); +void maps_image_generator::add_worker(hpx::opencl::device& device, + size_t num_parallel_kernels) { + // create request callback function for worker + boost::function*)> request_new_work = + boost::bind(&maps_image_generator::worker_request_new_work, this, _1); - // don't do anything if there still is a current request - if(current_request) - return true; + // create deliver callback function for worker + boost::function&)> deliver_done_work = + boost::bind(&maps_image_generator::worker_deliver, this, _1); - // get new request - std::shared_ptr new_request = acquire_new_request(); + // create worker + std::shared_ptr worker = std::make_shared( + device, num_parallel_kernels, request_new_work, deliver_done_work, + verbose, img_size_hint_x, img_size_hint_y); - // shutdown if acquire_new_request returned an invalid value - if(!new_request) return false; + // add worker to workerlist + workers.push_back(worker); +} - // if image is already dead, abort image and query another one - if(!new_request->stillValid()) - { - new_request->abort(); - start_getting_new_image(); - return true; - } +bool maps_image_generator::get_new_image() { + // lock the access tu the current request + boost::lock_guard lock(current_request_lock); - // allocate image data - new_request->data = std::make_shared>( - new_request->tilesize_x - * new_request->tilesize_y - * 3 * sizeof(char)); + // don't do anything if there still is a current request + if (current_request) return true; - // get new image id - size_t new_image_id = next_image_id++; + // get new request + std::shared_ptr new_request = acquire_new_request(); - { - // lock the data list - boost::lock_guard - lock2(images_lock); + // shutdown if acquire_new_request returned an invalid value + if (!new_request) return false; - // insert new request to images list - images.insert(std::pair>(new_image_id, - new_request)); - } + // if image is already dead, abort image and query another one + if (!new_request->stillValid()) { + new_request->abort(); + start_getting_new_image(); + return true; + } - // set as current request - current_request = new_request; + // allocate image data + new_request->data = std::make_shared>( + new_request->tilesize_x * new_request->tilesize_y * 3 * sizeof(char)); - // set current request id - current_request_id = new_image_id; + // get new image id + size_t new_image_id = next_image_id++; - // set current image position - current_img_pos = 0; + { + // lock the data list + boost::lock_guard lock2(images_lock); - /////////////////////////////////////////////// - // map raw values to double values + // insert new request to images list + images.insert( + std::pair>(new_image_id, new_request)); + } - // calculate actual zoom - double zoom = exp2((double)new_request->zoom); + // set as current request + current_request = new_request; - // calculate sidelength - double sqrt_2 = sqrt(2.0); - double tilesidelength = (4.0/sqrt_2) / zoom; + // set current request id + current_request_id = new_image_id; - // calculate actual positions - double bound = exp2(new_request->zoom); - double posx = (new_request->posx - bound/2.0 + 0.5) * tilesidelength; - double posy = -(new_request->posy - bound/2.0 + 0.5) * tilesidelength; + // set current image position + current_img_pos = 0; - /////////////////////////////////////////// - // calculate image coords + /////////////////////////////////////////////// + // map raw values to double values - size_t img_width = new_request->tilesize_x; - size_t img_height = new_request->tilesize_y; + // calculate actual zoom + double zoom = exp2((double)new_request->zoom); - // calculate aspect ratio - double aspect_ratio = (double) img_width - / (double) img_height; + // calculate sidelength + double sqrt_2 = sqrt(2.0); + double tilesidelength = (4.0 / sqrt_2) / zoom; - // calculate size of diagonale - //double size_diag = exp2(-zoom) * 4.0; - double size_diag = 4.0 / zoom; + // calculate actual positions + double bound = exp2(new_request->zoom); + double posx = (new_request->posx - bound / 2.0 + 0.5) * tilesidelength; + double posy = -(new_request->posy - bound / 2.0 + 0.5) * tilesidelength; - // calculate width and height - double size_y = size_diag / sqrt( 1 + aspect_ratio * aspect_ratio ); - double size_x = aspect_ratio * size_y; + /////////////////////////////////////////// + // calculate image coords - // calculate horizontal stepwidth - double rotation = 0.0; - double hor_pixdist_nonrot = size_x / img_width; - current_hor_pixdist_x = cos(rotation) * hor_pixdist_nonrot; - current_hor_pixdist_y = sin(rotation) * hor_pixdist_nonrot; + size_t img_width = new_request->tilesize_x; + size_t img_height = new_request->tilesize_y; - // calculate vertical stepwidth - double vert_pixdist_nonrot = - size_y / img_height; - current_vert_pixdist_x = - sin(rotation) * vert_pixdist_nonrot; - current_vert_pixdist_y = cos(rotation) * vert_pixdist_nonrot; + // calculate aspect ratio + double aspect_ratio = (double)img_width / (double)img_height; + // calculate size of diagonale + // double size_diag = exp2(-zoom) * 4.0; + double size_diag = 4.0 / zoom; - // calculate top left coords - current_topleft_x = posx - current_hor_pixdist_x * ( img_width / 2.0 - 0.5 ) - - current_vert_pixdist_x * ( img_height / 2.0 - 0.5 ); - current_topleft_y = posy - current_hor_pixdist_y * ( img_width / 2.0 - 0.5 ) - - current_vert_pixdist_y * ( img_height / 2.0 - 0.5 ); + // calculate width and height + double size_y = size_diag / sqrt(1 + aspect_ratio * aspect_ratio); + double size_x = aspect_ratio * size_y; + // calculate horizontal stepwidth + double rotation = 0.0; + double hor_pixdist_nonrot = size_x / img_width; + current_hor_pixdist_x = cos(rotation) * hor_pixdist_nonrot; + current_hor_pixdist_y = sin(rotation) * hor_pixdist_nonrot; - // signal waiting threads - new_request_available.notify_all(); + // calculate vertical stepwidth + double vert_pixdist_nonrot = -size_y / img_height; + current_vert_pixdist_x = -sin(rotation) * vert_pixdist_nonrot; + current_vert_pixdist_y = cos(rotation) * vert_pixdist_nonrot; - hpx::cout << "Started working on " << current_request->zoom - << " - (" << posx << ", " << posy << ")" << hpx::endl; + // calculate top left coords + current_topleft_x = posx - current_hor_pixdist_x * (img_width / 2.0 - 0.5) - + current_vert_pixdist_x * (img_height / 2.0 - 0.5); + current_topleft_y = posy - current_hor_pixdist_y * (img_width / 2.0 - 0.5) - + current_vert_pixdist_y * (img_height / 2.0 - 0.5); + // signal waiting threads + new_request_available.notify_all(); - return true; + hpx::cout << "Started working on " << current_request->zoom << " - (" << posx + << ", " << posy << ")" << hpx::endl; + return true; } -void -maps_image_generator:: -wait_for_startup_finished() -{ - - // wait for all workers to finish startup - for( auto& worker : workers) - { - - worker->wait_for_startup_finished(); - - } - - +void maps_image_generator::wait_for_startup_finished() { + // wait for all workers to finish startup + for (auto& worker : workers) { + worker->wait_for_startup_finished(); + } } -void -maps_image_generator:: -shutdown() -{ +void maps_image_generator::shutdown() { + // set shutdown requested flag + shutdown_requested = true; - // set shutdown requested flag - shutdown_requested = true; + // signal workers to continue working, they will then read the + // shutdown_requested flag and end + new_request_available.notify_all(); - // signal workers to continue working, they will then read the - // shutdown_requested flag and end - new_request_available.notify_all(); + // wait for all workers to finish + for (auto& worker : workers) { + worker->join(); + } - // wait for all workers to finish - for( auto& worker : workers) - { - worker->join(); - } - - // then, signal the retrievers to shutdown - done_work_queue.finish(); - - // wait for retrievers to finish - retrievers_finished.wait(); + // then, signal the retrievers to shutdown + done_work_queue.finish(); + // wait for retrievers to finish + retrievers_finished.wait(); } -void -maps_image_generator:: -retrieve_worker_main(intptr_t parent_, bool verbose) -{ - - // get parent pointer - maps_image_generator* parent = (maps_image_generator*) parent_; +void maps_image_generator::retrieve_worker_main(intptr_t parent_, + bool verbose) { + // get parent pointer + maps_image_generator* parent = (maps_image_generator*)parent_; - // represents done workload - std::shared_ptr done_workload; + // represents done workload + std::shared_ptr done_workload; - // main loop - if(verbose) hpx::cout << "entering retrieve worker main loop ..." << hpx::endl; - while(parent->done_work_queue.pop(&done_workload)) - { - - if(verbose) hpx::cout << "retrieved workload " - << done_workload->pos_in_img_x - << ":" - << done_workload->pos_in_img_y - << hpx::endl; - - // retrieve id of associated image - size_t img_id = done_workload->img_id; - - // image data - std::shared_ptr img_request; + // main loop + if (verbose) + hpx::cout << "entering retrieve worker main loop ..." << hpx::endl; + while (parent->done_work_queue.pop(&done_workload)) { + if (verbose) + hpx::cout << "retrieved workload " << done_workload->pos_in_img_x << ":" + << done_workload->pos_in_img_y << hpx::endl; - // retrieve image pointers - { - // lock - boost::lock_guard - lock(parent->images_lock); - - // try to find the associated request - image_request_map::iterator req_iterator = parent->images.find(img_id); - // indicates that the image is gone. don't handle data in this case. - if(req_iterator == parent->images.end()) - continue; - - // read the request - img_request = req_iterator->second; - } + // retrieve id of associated image + size_t img_id = done_workload->img_id; - // copy data to img_data - size_t start_x = done_workload->pos_in_img_x; - size_t start_y = done_workload->pos_in_img_y; - size_t size_x = done_workload->num_pixels_x; - size_t size_y = done_workload->num_pixels_y; - size_t line_offset = done_workload->line_offset; - for(size_t y = 0; y < size_y; y++) - { - for(size_t x = 0; x < size_x; x++) - { - (*(img_request->data)) - [((y + start_y) * line_offset + (x + start_x)) * 3 + 0] = - done_workload->pixeldata[(y * size_x + x) * 3 + 0]; - (*(img_request->data)) - [((y + start_y) * line_offset + (x + start_x)) * 3 + 1] = - done_workload->pixeldata[(y * size_x + x) * 3 + 1]; - (*(img_request->data)) - [((y + start_y) * line_offset + (x + start_x)) * 3 + 2] = - done_workload->pixeldata[(y * size_x + x) * 3 + 2]; - } - } + // image data + std::shared_ptr img_request; - // decrease the number of work packets left - size_t current_img_countdown = --(img_request->img_countdown); - - // if no work packet left (img finished), then: - if(current_img_countdown == 0) - { + // retrieve image pointers + { + // lock + boost::lock_guard lock(parent->images_lock); - // convert to png - if(img_request->stillValid()) - { - size_t png_size; - boost::shared_array png_data = - create_png(img_request->data, - img_request->tilesize_x, - img_request->tilesize_y, - &png_size); - - // remove old data - img_request->data = std::make_shared> - (png_data.get(), - png_data.get() + png_size); - - // send data - img_request->done(img_request->data); - } else { - img_request->abort(); - } - - // lock the data lists - boost::lock_guard - lock(parent->images_lock); - - // remove image data. - // data will still be available for waiting image thread, - // as it is a shared_ptr. - image_request_map::iterator img_it = parent->images.find(img_id); - if(img_it != parent->images.end()) - parent->images.erase(img_it); + // try to find the associated request + image_request_map::iterator req_iterator = parent->images.find(img_id); + // indicates that the image is gone. don't handle data in this case. + if (req_iterator == parent->images.end()) continue; - } + // read the request + img_request = req_iterator->second; + } + // copy data to img_data + size_t start_x = done_workload->pos_in_img_x; + size_t start_y = done_workload->pos_in_img_y; + size_t size_x = done_workload->num_pixels_x; + size_t size_y = done_workload->num_pixels_y; + size_t line_offset = done_workload->line_offset; + for (size_t y = 0; y < size_y; y++) { + for (size_t x = 0; x < size_x; x++) { + (*(img_request + ->data))[((y + start_y) * line_offset + (x + start_x)) * 3 + 0] = + done_workload->pixeldata[(y * size_x + x) * 3 + 0]; + (*(img_request + ->data))[((y + start_y) * line_offset + (x + start_x)) * 3 + 1] = + done_workload->pixeldata[(y * size_x + x) * 3 + 1]; + (*(img_request + ->data))[((y + start_y) * line_offset + (x + start_x)) * 3 + 2] = + done_workload->pixeldata[(y * size_x + x) * 3 + 2]; + } } + // decrease the number of work packets left + size_t current_img_countdown = --(img_request->img_countdown); + + // if no work packet left (img finished), then: + if (current_img_countdown == 0) { + // convert to png + if (img_request->stillValid()) { + size_t png_size; + boost::shared_array png_data = + create_png(img_request->data, img_request->tilesize_x, + img_request->tilesize_y, &png_size); + + // remove old data + img_request->data = std::make_shared>( + png_data.get(), png_data.get() + png_size); + + // send data + img_request->done(img_request->data); + } else { + img_request->abort(); + } + + // lock the data lists + boost::lock_guard lock(parent->images_lock); + + // remove image data. + // data will still be available for waiting image thread, + // as it is a shared_ptr. + image_request_map::iterator img_it = parent->images.find(img_id); + if (img_it != parent->images.end()) parent->images.erase(img_it); + } + } } - /* // waits until event lock triggered, then returns data std::shared_ptr> @@ -606,12 +481,16 @@ compute_image(double posx, if(verbose){ hpx::cout << "image data" << hpx::endl << "topleft: " << topleft_x << ":" << topleft_y << hpx::endl - << "img_dims: " << img_width << ":" << img_height << hpx::endl + << "img_dims: " << img_width << ":" << img_height << +hpx::endl << "pos: " << posx << ":" << posy << hpx::endl << "size: " << size_x << ":" << size_y << hpx::endl - << "hor_pixdist: " << hor_pixdist_x << ":" << hor_pixdist_y << hpx::endl - << "vert_pixdist: " << vert_pixdist_x << ":" << vert_pixdist_y << hpx::endl - << "num_tiles: " << num_tiles_x << ":" << num_tiles_y << hpx::endl; + << "hor_pixdist: " << hor_pixdist_x << ":" << hor_pixdist_y << +hpx::endl + << "vert_pixdist: " << vert_pixdist_x << ":" << vert_pixdist_y << +hpx::endl + << "num_tiles: " << num_tiles_x << ":" << num_tiles_y << +hpx::endl; } // create data array to hold finished image, if we are not in benchmark mode @@ -652,11 +531,13 @@ compute_image(double posx, { for(size_t x = 0; x < img_width; x += tile_width) { - if (verbose) hpx::cout << "\tAdding workload " << x << ":" << y << " ..." << hpx::endl; + if (verbose) hpx::cout << "\tAdding workload " << x << ":" << y << " +..." << hpx::endl; // calculate position of current work packet - double workpacket_pos_x = topleft_x + vert_pixdist_x * y + hor_pixdist_x * x; - double workpacket_pos_y = topleft_y + vert_pixdist_y * y + hor_pixdist_y * x; + double workpacket_pos_x = topleft_x + vert_pixdist_x * y + +hor_pixdist_x * x; double workpacket_pos_y = topleft_y + vert_pixdist_y * y + +hor_pixdist_y * x; // add workload std::shared_ptr row = std::make_shared(tile_width, diff --git a/examples/opencl/mandelbrot/maps/maps_image_generator.hpp b/examples/opencl/mandelbrot/maps/maps_image_generator.hpp index 105772b9..22ca0c15 100644 --- a/examples/opencl/mandelbrot/maps/maps_image_generator.hpp +++ b/examples/opencl/mandelbrot/maps/maps_image_generator.hpp @@ -24,104 +24,100 @@ * this class is the main observer of the image generation. * it gets image queries, which it then splits into subimages and sends it to * the calculation queue. - * it then collects the calculated data and puts it together, to have one finished image. + * it then collects the calculated data and puts it together, to have one + * finished image. */ - -namespace hpx { namespace opencl { namespace examples { namespace mandelbrot { +namespace hpx { +namespace opencl { +namespace examples { +namespace mandelbrot { struct request; -class maps_image_generator -{ - - public: - // initializes the image generator - maps_image_generator(size_t img_size_hint_x, - size_t img_size_hint_y, - size_t num_parallel_kernels, - bool verbose, - boost::function(void)> - acquire_new_request, - std::vector devices - = std::vector()); +class maps_image_generator { + public: + // initializes the image generator + maps_image_generator( + size_t img_size_hint_x, size_t img_size_hint_y, + size_t num_parallel_kernels, bool verbose, + boost::function(void)> acquire_new_request, + std::vector devices = + std::vector()); - // destructor - ~maps_image_generator(); + // destructor + ~maps_image_generator(); - // waits for the worker to finish - void shutdown(); + // waits for the worker to finish + void shutdown(); - // adds a worker - void add_worker(hpx::opencl::device & device, - size_t num_parallel_kernels); + // adds a worker + void add_worker(hpx::opencl::device& device, size_t num_parallel_kernels); - // waits for the worker to finish initialization - void wait_for_startup_finished(); + // waits for the worker to finish initialization + void wait_for_startup_finished(); - private: - // the main worker function, runs the main work loop - static void retrieve_worker_main( - intptr_t parent_, - bool verbose); + private: + // the main worker function, runs the main work loop + static void retrieve_worker_main(intptr_t parent_, bool verbose); - // callback for mandelbrotworkers - bool worker_request_new_work(std::shared_ptr* new_work); + // callback for mandelbrotworkers + bool worker_request_new_work(std::shared_ptr* new_work); - // callback for mandelbrotworkers - void worker_deliver(std::shared_ptr& done_work); + // callback for mandelbrotworkers + void worker_deliver(std::shared_ptr& done_work); - // queries a new image. true on success, false on error. - bool get_new_image(); + // queries a new image. true on success, false on error. + bool get_new_image(); - // asynchroneously starts get_new_image(). - void start_getting_new_image(); + // asynchroneously starts get_new_image(). + void start_getting_new_image(); - // tests the current request, if it's invalid it deletes it and queries - // a new image - void dispose_current_request_if_invalid(); + // tests the current request, if it's invalid it deletes it and queries + // a new image + void dispose_current_request_if_invalid(); - // private attributes - private: - // for synchronization of workers and retrievers - hpx::lcos::shared_future retrievers_finished; - std::vector> workers; + // private attributes + private: + // for synchronization of workers and retrievers + hpx::lcos::shared_future retrievers_finished; + std::vector> workers; - // the actual image data - typedef std::map> - image_request_map; - image_request_map images; - hpx::lcos::local::spinlock images_lock; + // the actual image data + typedef std::map> image_request_map; + image_request_map images; + hpx::lcos::local::spinlock images_lock; - std::atomic next_image_id; + std::atomic next_image_id; - // other stuff - bool verbose; + // other stuff + bool verbose; - size_t img_size_hint_x; - size_t img_size_hint_y; + size_t img_size_hint_x; + size_t img_size_hint_y; - fifo> done_work_queue; + fifo> done_work_queue; - boost::function(void)> acquire_new_request; + boost::function(void)> acquire_new_request; - hpx::lcos::local::spinlock current_request_lock; - std::shared_ptr current_request; - hpx::lcos::local::condition_variable_any new_request_available; - size_t current_request_id; - size_t current_img_pos; - double current_topleft_x; - double current_topleft_y; - double current_vert_pixdist_x; - double current_vert_pixdist_y; - double current_hor_pixdist_x; - double current_hor_pixdist_y; - - volatile bool shutdown_requested; + hpx::lcos::local::spinlock current_request_lock; + std::shared_ptr current_request; + hpx::lcos::local::condition_variable_any new_request_available; + size_t current_request_id; + size_t current_img_pos; + double current_topleft_x; + double current_topleft_y; + double current_vert_pixdist_x; + double current_vert_pixdist_y; + double current_hor_pixdist_x; + double current_hor_pixdist_y; + volatile bool shutdown_requested; }; -} } } } +} // namespace mandelbrot +} // namespace examples +} // namespace opencl +} // namespace hpx #endif - diff --git a/examples/opencl/mandelbrot/maps/requesthandler.cpp b/examples/opencl/mandelbrot/maps/requesthandler.cpp index 3d738864..22ab0b5a 100644 --- a/examples/opencl/mandelbrot/maps/requesthandler.cpp +++ b/examples/opencl/mandelbrot/maps/requesthandler.cpp @@ -9,59 +9,40 @@ using namespace hpx::opencl::examples::mandelbrot; -requesthandler::requesthandler(size_t tilesize_x_, - size_t tilesize_y_, - size_t lines_per_gpu_) : - tilesize_x(tilesize_x_), - tilesize_y(tilesize_y_), - lines_per_gpu(lines_per_gpu_) -{ - - -} - -void -requesthandler::submit_request(std::shared_ptr request) -{ - - // Check if still valid - if(!request->stillValid()) - { - request->abort(); - return; - } - - // add missing data in request - request->tilesize_x = tilesize_x; - request->tilesize_y = tilesize_y; - request->lines_per_gpu = lines_per_gpu; - request->img_countdown = tilesize_y/lines_per_gpu; - - hpx::cout << "Request submitted: " << request->zoom << hpx::endl; - - // hand the request to an hpx thread - new_requests.push(request); - +requesthandler::requesthandler(size_t tilesize_x_, size_t tilesize_y_, + size_t lines_per_gpu_) + : tilesize_x(tilesize_x_), + tilesize_y(tilesize_y_), + lines_per_gpu(lines_per_gpu_) {} + +void requesthandler::submit_request(std::shared_ptr request) { + // Check if still valid + if (!request->stillValid()) { + request->abort(); + return; + } + + // add missing data in request + request->tilesize_x = tilesize_x; + request->tilesize_y = tilesize_y; + request->lines_per_gpu = lines_per_gpu; + request->img_countdown = tilesize_y / lines_per_gpu; + + hpx::cout << "Request submitted: " << request->zoom << hpx::endl; + + // hand the request to an hpx thread + new_requests.push(request); } -std::shared_ptr -requesthandler::query_request() -{ - - std::shared_ptr ret; +std::shared_ptr requesthandler::query_request() { + std::shared_ptr ret; - // take a new request out of the queue - while(true) - { - if(!new_requests.pop(&ret)) - return std::shared_ptr(); - if(ret->stillValid()) - { - return ret; - } - ret->abort(); + // take a new request out of the queue + while (true) { + if (!new_requests.pop(&ret)) return std::shared_ptr(); + if (ret->stillValid()) { + return ret; } - + ret->abort(); + } } - - diff --git a/examples/opencl/mandelbrot/maps/requesthandler.hpp b/examples/opencl/mandelbrot/maps/requesthandler.hpp index 238bd93e..ba08b994 100644 --- a/examples/opencl/mandelbrot/maps/requesthandler.hpp +++ b/examples/opencl/mandelbrot/maps/requesthandler.hpp @@ -14,51 +14,46 @@ #include #include -namespace hpx { namespace opencl { namespace examples { namespace mandelbrot { - -struct request -{ -public: - boost::function stillValid; - boost::function>)> done; - boost::function abort; - long zoom; - long posx; - long posy; - std::string user_ip; - std::shared_ptr> data; - size_t tilesize_x; - size_t tilesize_y; - size_t lines_per_gpu; - std::atomic img_countdown; +namespace hpx { +namespace opencl { +namespace examples { +namespace mandelbrot { + +struct request { + public: + boost::function stillValid; + boost::function>)> done; + boost::function abort; + long zoom; + long posx; + long posy; + std::string user_ip; + std::shared_ptr> data; + size_t tilesize_x; + size_t tilesize_y; + size_t lines_per_gpu; + std::atomic img_countdown; }; +class requesthandler { + public: + // constructor + requesthandler(size_t tilesize_x_, size_t tilesize_y_, size_t lines_per_gpu); -class requesthandler -{ - -public: - // constructor - requesthandler(size_t tilesize_x_, - size_t tilesize_y_, - size_t lines_per_gpu); - - void submit_request(std::shared_ptr request); - - std::shared_ptr query_request(); - - -private: - size_t tilesize_x; - size_t tilesize_y; - size_t lines_per_gpu; - fifo> new_requests; + void submit_request(std::shared_ptr request); + std::shared_ptr query_request(); + private: + size_t tilesize_x; + size_t tilesize_y; + size_t lines_per_gpu; + fifo> new_requests; }; - - -} } } } +} // namespace mandelbrot +} // namespace examples +} // namespace opencl +} // namespace hpx #endif diff --git a/examples/opencl/mandelbrot/maps/resources/resources.hpp b/examples/opencl/mandelbrot/maps/resources/resources.hpp index e090c90f..431d7c13 100644 --- a/examples/opencl/mandelbrot/maps/resources/resources.hpp +++ b/examples/opencl/mandelbrot/maps/resources/resources.hpp @@ -6,9 +6,9 @@ #ifndef MANDELBROT_MAPS_RESOURCES_HPP_ #define MANDELBROT_MAPS_RESOURCES_HPP_ - extern const char mandelbrot_html[]; - extern const unsigned long mandelbrot_html_len; - extern const char mandelbrot_ico[]; - extern const unsigned long mandelbrot_ico_len; +extern const char mandelbrot_html[]; +extern const unsigned long mandelbrot_html_len; +extern const char mandelbrot_ico[]; +extern const unsigned long mandelbrot_ico_len; #endif diff --git a/examples/opencl/mandelbrot/maps/webserver.cpp b/examples/opencl/mandelbrot/maps/webserver.cpp index 0dfeaa99..90937d04 100644 --- a/examples/opencl/mandelbrot/maps/webserver.cpp +++ b/examples/opencl/mandelbrot/maps/webserver.cpp @@ -22,558 +22,444 @@ static size_t num_requests = 0; static size_t num_answers = 0; static size_t num_aborted = 0; -webserver::webserver(unsigned short port, requesthandler * req_handler_) : - req_handler(req_handler_), - io_service(), - acceptor(io_service, tcp::endpoint(tcp::v4(), port)), - strand(io_service) -{ - - - -} - +webserver::webserver(unsigned short port, requesthandler* req_handler_) + : req_handler(req_handler_), + io_service(), + acceptor(io_service, tcp::endpoint(tcp::v4(), port)), + strand(io_service) {} // This struct is used for automatic registration and unregistration of // non-hpx threads -struct registration_wrapper -{ - registration_wrapper(hpx::runtime* rt, char const* name) - : rt_(rt) - { - // Register this thread with HPX, this should be done once for - // each external OS-thread intended to invoke HPX functionality. - // Calling this function more than once will silently fail (will - // return false). - rt_->register_thread(name); - } - ~registration_wrapper() - { - // Unregister the thread from HPX, this should be done once in the - // end before the external thread exists. - rt_->unregister_thread(); - } - - hpx::runtime* rt_; +struct registration_wrapper { + registration_wrapper(hpx::runtime* rt, char const* name) : rt_(rt) { + // Register this thread with HPX, this should be done once for + // each external OS-thread intended to invoke HPX functionality. + // Calling this function more than once will silently fail (will + // return false). + rt_->register_thread(name); + } + ~registration_wrapper() { + // Unregister the thread from HPX, this should be done once in the + // end before the external thread exists. + rt_->unregister_thread(); + } + + hpx::runtime* rt_; }; -void -webserver::dont_close_socket(boost::any keepalive_data) -{ - std::cout << "Packets: " << num_answers << "/" << num_aborted - << "/" << num_requests << " - " - << (num_requests - num_answers - num_aborted) << " lost" - << std::endl; +void webserver::dont_close_socket(boost::any keepalive_data) { + std::cout << "Packets: " << num_answers << "/" << num_aborted << "/" + << num_requests << " - " + << (num_requests - num_answers - num_aborted) << " lost" + << std::endl; } -void -webserver::close_socket(std::shared_ptr socket, - boost::any keepalive_data) -{ - - socket->close(); - +void webserver::close_socket(std::shared_ptr socket, + boost::any keepalive_data) { + socket->close(); } -void -webserver::send_server_error_and_close(std::shared_ptr socket) -{ - - //std::cout << "aborted" << std::endl; - num_aborted ++; - - std::shared_ptr response = std::make_shared( - "HTTP/1.0 500 Server Error\r\n" - "Connection: Close\r\n" - "\r\n"); +void webserver::send_server_error_and_close( + std::shared_ptr socket) { + // std::cout << "aborted" << std::endl; + num_aborted++; - boost::asio::async_write(*socket, - boost::asio::buffer(*response), - strand.wrap(boost::bind(&webserver::close_socket, - this, - socket, - response))); + std::shared_ptr response = std::make_shared( + "HTTP/1.0 500 Server Error\r\n" + "Connection: Close\r\n" + "\r\n"); + boost::asio::async_write(*socket, boost::asio::buffer(*response), + strand.wrap(boost::bind(&webserver::close_socket, + this, socket, response))); } -void -webserver::send_not_found_and_close(std::shared_ptr socket) -{ - - std::shared_ptr response = std::make_shared( - "HTTP/1.0 404 Not Found\r\n" - "Connection: Close\r\n" - "\r\n"); - - boost::asio::async_write(*socket, - boost::asio::buffer(*response), - strand.wrap(boost::bind(&webserver::close_socket, - this, - socket, - response))); +void webserver::send_not_found_and_close(std::shared_ptr socket) { + std::shared_ptr response = std::make_shared( + "HTTP/1.0 404 Not Found\r\n" + "Connection: Close\r\n" + "\r\n"); + boost::asio::async_write(*socket, boost::asio::buffer(*response), + strand.wrap(boost::bind(&webserver::close_socket, + this, socket, response))); } -void -webserver::send_bad_request_and_close(std::shared_ptr socket) -{ - - std::shared_ptr response = std::make_shared( - "HTTP/1.0 400 Bad Request\r\n" - "Connection: Close\r\n" - "\r\n"); - - boost::asio::async_write(*socket, - boost::asio::buffer(*response), - strand.wrap(boost::bind(&webserver::close_socket, - this, - socket, - response))); +void webserver::send_bad_request_and_close( + std::shared_ptr socket) { + std::shared_ptr response = std::make_shared( + "HTTP/1.0 400 Bad Request\r\n" + "Connection: Close\r\n" + "\r\n"); + boost::asio::async_write(*socket, boost::asio::buffer(*response), + strand.wrap(boost::bind(&webserver::close_socket, + this, socket, response))); } -std::string -webserver::read_filename_from_request(std::string line) -{ - - // vars - const std::string string_GET("GET "); - const std::string string_HTTP("HTTP/"); - size_t pos; - - // if request doesn't start with "GET ", send error - if(line.compare(0, string_GET.length(), string_GET) != 0) - { - // send error - return ""; - } - - // cut away "GET " - line = line.substr(string_GET.length()); - - // find the next space - pos = line.find(' '); - if(pos == line.npos) - { - // send error - return ""; - } - - // take request filename - std::string filename = line.substr(0, pos); - - // cut away filename - line = line.substr(pos + 1); - - // ensure it's a http request - if(line.compare(0, string_HTTP.length(), string_HTTP) != 0) - { - // send error - return ""; - } - - return filename; +std::string webserver::read_filename_from_request(std::string line) { + // vars + const std::string string_GET("GET "); + const std::string string_HTTP("HTTP/"); + size_t pos; + + // if request doesn't start with "GET ", send error + if (line.compare(0, string_GET.length(), string_GET) != 0) { + // send error + return ""; + } + + // cut away "GET " + line = line.substr(string_GET.length()); + + // find the next space + pos = line.find(' '); + if (pos == line.npos) { + // send error + return ""; + } + + // take request filename + std::string filename = line.substr(0, pos); + + // cut away filename + line = line.substr(pos + 1); + + // ensure it's a http request + if (line.compare(0, string_HTTP.length(), string_HTTP) != 0) { + // send error + return ""; + } + + return filename; } -struct send_data_data -{ - std::string header; - std::string footer; - std::shared_ptr> data; - std::vector buffers; +struct send_data_data { + std::string header; + std::string footer; + std::shared_ptr> data; + std::vector buffers; }; -void -webserver::send_data(std::shared_ptr socket, - const char* content_type, - std::shared_ptr> data) -{ - - num_answers ++; - - // store all the data that we need to keep alive - std::shared_ptr keep_alive_data = std::make_shared(); - - // keep data ptr alive - keep_alive_data->data = data; - - // vector to keep data alive - std::shared_ptr> keep_= std::make_shared>(); - - // generate header - std::stringstream ss; - ss << "HTTP/1.0 200 OK\r\n" - << "Content-Type: " << content_type << "\r\n" - << "Content-Length: " << data->size() << "\r\n" - << "Connection: Keep-Alive\r\n" - //<< "Connection: Close\r\n" - << "\r\n"; - keep_alive_data->header = ss.str(); - - // generate footer - keep_alive_data->footer = "\r\n\r\n"; - - // put everything in buffers - keep_alive_data->buffers.push_back(boost::asio::buffer(keep_alive_data->header)); - keep_alive_data->buffers.push_back(boost::asio::buffer(keep_alive_data->data->data(), keep_alive_data->data->size())); - keep_alive_data->buffers.push_back(boost::asio::buffer(keep_alive_data->footer)); - - boost::asio::async_write( *socket, - keep_alive_data->buffers, - strand.wrap(boost::bind( - &webserver::dont_close_socket, - this, - keep_alive_data))); - +void webserver::send_data(std::shared_ptr socket, + const char* content_type, + std::shared_ptr> data) { + num_answers++; + + // store all the data that we need to keep alive + std::shared_ptr keep_alive_data = + std::make_shared(); + + // keep data ptr alive + keep_alive_data->data = data; + + // vector to keep data alive + std::shared_ptr> keep_ = + std::make_shared>(); + + // generate header + std::stringstream ss; + ss << "HTTP/1.0 200 OK\r\n" + << "Content-Type: " << content_type << "\r\n" + << "Content-Length: " << data->size() << "\r\n" + << "Connection: Keep-Alive\r\n" + //<< "Connection: Close\r\n" + << "\r\n"; + keep_alive_data->header = ss.str(); + + // generate footer + keep_alive_data->footer = "\r\n\r\n"; + + // put everything in buffers + keep_alive_data->buffers.push_back( + boost::asio::buffer(keep_alive_data->header)); + keep_alive_data->buffers.push_back(boost::asio::buffer( + keep_alive_data->data->data(), keep_alive_data->data->size())); + keep_alive_data->buffers.push_back( + boost::asio::buffer(keep_alive_data->footer)); + + boost::asio::async_write( + *socket, keep_alive_data->buffers, + strand.wrap( + boost::bind(&webserver::dont_close_socket, this, keep_alive_data))); } -void -webserver::send_data(std::shared_ptr socket, - const char* content_type, - const char* data, - size_t data_size) -{ - - // store all the data that we need to keep alive - std::shared_ptr keep_alive_data = std::make_shared(); - - // generate header - std::stringstream ss; - ss << "HTTP/1.0 200 OK\r\n" - << "Content-Type: " << content_type << "\r\n" - << "Content-Length: " << data_size << "\r\n" - << "Connection: Keep-Alive\r\n" -// << "Connection: Close\r\n" - << "\r\n"; - keep_alive_data->header = ss.str(); - - // generate footer - keep_alive_data->footer = "\r\n\r\n"; - - // put everything in buffers - keep_alive_data->buffers.push_back(boost::asio::buffer(keep_alive_data->header)); - keep_alive_data->buffers.push_back(boost::asio::buffer(data, data_size)); - keep_alive_data->buffers.push_back(boost::asio::buffer(keep_alive_data->footer)); - - boost::asio::async_write( *socket, - keep_alive_data->buffers, - strand.wrap(boost::bind( - &webserver::dont_close_socket, - this, - keep_alive_data))); - +void webserver::send_data(std::shared_ptr socket, + const char* content_type, const char* data, + size_t data_size) { + // store all the data that we need to keep alive + std::shared_ptr keep_alive_data = + std::make_shared(); + + // generate header + std::stringstream ss; + ss << "HTTP/1.0 200 OK\r\n" + << "Content-Type: " << content_type << "\r\n" + << "Content-Length: " << data_size << "\r\n" + << "Connection: Keep-Alive\r\n" + // << "Connection: Close\r\n" + << "\r\n"; + keep_alive_data->header = ss.str(); + + // generate footer + keep_alive_data->footer = "\r\n\r\n"; + + // put everything in buffers + keep_alive_data->buffers.push_back( + boost::asio::buffer(keep_alive_data->header)); + keep_alive_data->buffers.push_back(boost::asio::buffer(data, data_size)); + keep_alive_data->buffers.push_back( + boost::asio::buffer(keep_alive_data->footer)); + + boost::asio::async_write( + *socket, keep_alive_data->buffers, + strand.wrap( + boost::bind(&webserver::dont_close_socket, this, keep_alive_data))); } - -static std::vector -parse_request(std::string uri) -{ - - // create empty vector - std::vector ret; - - // remove '/' - if(uri.size() < 1) return ret; - if(uri[0] != '/') return ret; - uri = uri.substr(1); - - size_t pos; - - // split first number - pos = uri.find('/'); - if(pos == uri.npos) return ret; - std::string zoom_string = uri.substr(0, pos); - uri = uri.substr(pos+1); - - // split second number - pos = uri.find('/'); - if(pos == uri.npos) return ret; - std::string pos_x_string = uri.substr(0, pos); - uri = uri.substr(pos+1); - - // split third number - pos = uri.find('.'); - if(pos == uri.npos) return ret; - std::string pos_y_string = uri.substr(0, pos); - uri = uri.substr(pos+1); - - // check for correct file format requested - if(uri.compare("png") != 0) return ret; - - // convert strings to long - long zoom_raw; - long pos_x_raw; - long pos_y_raw; - try { - zoom_raw = std::stol(zoom_string); - pos_x_raw = std::stol(pos_x_string); - pos_y_raw = std::stol(pos_y_string); - } catch ( std::exception e ) { - return ret; - } - - // create result vector - ret.push_back(zoom_raw); - ret.push_back(pos_x_raw); - ret.push_back(pos_y_raw); - +static std::vector parse_request(std::string uri) { + // create empty vector + std::vector ret; + + // remove '/' + if (uri.size() < 1) return ret; + if (uri[0] != '/') return ret; + uri = uri.substr(1); + + size_t pos; + + // split first number + pos = uri.find('/'); + if (pos == uri.npos) return ret; + std::string zoom_string = uri.substr(0, pos); + uri = uri.substr(pos + 1); + + // split second number + pos = uri.find('/'); + if (pos == uri.npos) return ret; + std::string pos_x_string = uri.substr(0, pos); + uri = uri.substr(pos + 1); + + // split third number + pos = uri.find('.'); + if (pos == uri.npos) return ret; + std::string pos_y_string = uri.substr(0, pos); + uri = uri.substr(pos + 1); + + // check for correct file format requested + if (uri.compare("png") != 0) return ret; + + // convert strings to long + long zoom_raw; + long pos_x_raw; + long pos_y_raw; + try { + zoom_raw = std::stol(zoom_string); + pos_x_raw = std::stol(pos_x_string); + pos_y_raw = std::stol(pos_y_string); + } catch (std::exception e) { return ret; + } -} - -void -webserver::process_request(std::shared_ptr socket, - std::string filename) -{ - - //std::cout << "process_request: " << filename << std::endl; - - // send main website if requested - if(filename == "/") - { - send_data(socket, "text/html; charset=utf-8", - mandelbrot_html, mandelbrot_html_len); - return; - } - - // send favicon if requested - if(filename == "/favicon.ico") - { - send_data(socket, "image/x-icon", - mandelbrot_ico, mandelbrot_ico_len); - return; - } - - // if filename is empty, send "bad request" - if(filename == "") - { - send_bad_request_and_close(socket); - return; - } - - // parse the filename to see if it is an image request - std::vector img_coords = parse_request(filename); - - // send 'not found' if it is not an image request - if(img_coords.size() != 3) - { - send_not_found_and_close(socket); - return; - } - - // create a new request - std::shared_ptr img_request = std::make_shared(); - - // set coords - img_request->zoom = img_coords[0]; - img_request->posx = img_coords[1]; - img_request->posy = img_coords[2]; - - // set user-ip - img_request->user_ip = socket->remote_endpoint().address().to_string(); - - // set aborted callback - img_request->abort = strand.wrap(boost::bind( - &webserver::send_server_error_and_close, - this, - socket)); - - // set stillValid callback - img_request->stillValid = boost::bind( - &webserver::is_socket_still_connected, - this, - socket); - - // set done callback - img_request->done = strand.wrap(boost::bind( - &webserver::send_data, - this, - socket, - "image/png", - _1)); - - num_requests ++; - // submit the request - req_handler->submit_request(img_request); + // create result vector + ret.push_back(zoom_raw); + ret.push_back(pos_x_raw); + ret.push_back(pos_y_raw); + return ret; } - -bool -webserver::is_socket_still_connected(std::shared_ptr socket) -{ - - return socket->is_open(); - +void webserver::process_request(std::shared_ptr socket, + std::string filename) { + // std::cout << "process_request: " << filename << std::endl; + + // send main website if requested + if (filename == "/") { + send_data(socket, "text/html; charset=utf-8", mandelbrot_html, + mandelbrot_html_len); + return; + } + + // send favicon if requested + if (filename == "/favicon.ico") { + send_data(socket, "image/x-icon", mandelbrot_ico, mandelbrot_ico_len); + return; + } + + // if filename is empty, send "bad request" + if (filename == "") { + send_bad_request_and_close(socket); + return; + } + + // parse the filename to see if it is an image request + std::vector img_coords = parse_request(filename); + + // send 'not found' if it is not an image request + if (img_coords.size() != 3) { + send_not_found_and_close(socket); + return; + } + + // create a new request + std::shared_ptr img_request = std::make_shared(); + + // set coords + img_request->zoom = img_coords[0]; + img_request->posx = img_coords[1]; + img_request->posy = img_coords[2]; + + // set user-ip + img_request->user_ip = socket->remote_endpoint().address().to_string(); + + // set aborted callback + img_request->abort = strand.wrap( + boost::bind(&webserver::send_server_error_and_close, this, socket)); + + // set stillValid callback + img_request->stillValid = + boost::bind(&webserver::is_socket_still_connected, this, socket); + + // set done callback + img_request->done = strand.wrap( + boost::bind(&webserver::send_data, this, socket, "image/png", _1)); + + num_requests++; + // submit the request + req_handler->submit_request(img_request); } -void -webserver::new_line_read_callback( - const boost::system::error_code & error, - size_t bytes_transferred, - std::shared_ptr socket, - std::shared_ptr buffer, - size_t lines_read, - std::string requested_file) -{ - - // check for errors - if(error) - { - std::cout << error.message() << std::endl; - // on error, close socket and return without querying the next line read - socket->close(); - return; - } - - // create an input stream from buffer - std::istream instream(&(*buffer)); - - // create a string to hold the read line - std::string line; +bool webserver::is_socket_still_connected(std::shared_ptr socket) { + return socket->is_open(); +} - // read data to the string - std::getline(instream, line); +void webserver::new_line_read_callback( + const boost::system::error_code& error, size_t bytes_transferred, + std::shared_ptr socket, + std::shared_ptr buffer, size_t lines_read, + std::string requested_file) { + // check for errors + if (error) { + std::cout << error.message() << std::endl; + // on error, close socket and return without querying the next line read + socket->close(); + return; + } - // remove whitespace at front and end of string - boost::algorithm::trim(line); + // create an input stream from buffer + std::istream instream(&(*buffer)); - // if this is the first line, read query information from it - if(lines_read == 0) - { + // create a string to hold the read line + std::string line; - // read filename from line - std::string filename = read_filename_from_request(line); - if(filename == "") - { - // send error - send_bad_request_and_close(socket); - return; - } + // read data to the string + std::getline(instream, line); - // remember filename - requested_file = filename; + // remove whitespace at front and end of string + boost::algorithm::trim(line); + // if this is the first line, read query information from it + if (lines_read == 0) { + // read filename from line + std::string filename = read_filename_from_request(line); + if (filename == "") { + // send error + send_bad_request_and_close(socket); + return; } - // if this is the last line, process the request - else if (line.size() == 0) - { - // process the request - process_request(socket, requested_file); - - // continue reading lines, could be persistent socket - start_reading_line_from_socket(buffer, socket, 0, ""); - return; - } + // remember filename + requested_file = filename; - // query the next line - start_reading_line_from_socket(buffer, socket, lines_read + 1, requested_file); + } + // if this is the last line, process the request + else if (line.size() == 0) { + // process the request + process_request(socket, requested_file); -} + // continue reading lines, could be persistent socket + start_reading_line_from_socket(buffer, socket, 0, ""); -void -webserver::start_reading_line_from_socket( - std::shared_ptr buffer, - std::shared_ptr socket, - size_t num_lines_already_read, - std::string requested_file) -{ - - // connect the buffer to the stream and register data callback - boost::asio::async_read_until(*socket, *buffer, "\n", - strand.wrap( - boost::bind(&webserver::new_line_read_callback, - this, - boost::asio::placeholders::error, - boost::asio::placeholders::bytes_transferred, - socket, - buffer, - num_lines_already_read, - requested_file) - )); + return; + } + // query the next line + start_reading_line_from_socket(buffer, socket, lines_read + 1, + requested_file); } -void -webserver::new_connection_callback(const boost::system::error_code & error, - std::shared_ptr socket) -{ - // start listening for another connection, to enable multiple connections - // at once - start_listening_for_new_connection(); - - // check for error - if(error) - { - // print error message - std::cerr << "Webserver: Error while waiting for new connection: " - << error.message() << std::endl; - - // drop this connection - return; - } - - // create a new buffer for the tcp data stream - std::shared_ptr buffer = - std::make_shared(); - - // connect buffer to socket and start reading - start_reading_line_from_socket(buffer, socket, 0, ""); - +void webserver::start_reading_line_from_socket( + std::shared_ptr buffer, + std::shared_ptr socket, size_t num_lines_already_read, + std::string requested_file) { + // connect the buffer to the stream and register data callback + boost::asio::async_read_until( + *socket, *buffer, "\n", + strand.wrap(boost::bind(&webserver::new_line_read_callback, this, + boost::asio::placeholders::error, + boost::asio::placeholders::bytes_transferred, + socket, buffer, num_lines_already_read, + requested_file))); } -void -webserver::start_listening_for_new_connection() -{ - - // create new socket - std::shared_ptr connected_socket = - std::make_shared(io_service); - - // register socket and callback, to wait for new connection - acceptor.async_accept(*connected_socket, - strand.wrap( - boost::bind(&webserver::new_connection_callback, - this, - boost::asio::placeholders::error, - connected_socket))); +void webserver::new_connection_callback(const boost::system::error_code& error, + std::shared_ptr socket) { + // start listening for another connection, to enable multiple connections + // at once + start_listening_for_new_connection(); -} + // check for error + if (error) { + // print error message + std::cerr << "Webserver: Error while waiting for new connection: " + << error.message() << std::endl; -void -webserver::external_start(hpx::runtime* rt) -{ - // register this thread to hpx - registration_wrapper wrap(rt, "asio_webserver"); + // drop this connection + return; + } - // register callback for new connections - start_listening_for_new_connection(); + // create a new buffer for the tcp data stream + std::shared_ptr buffer = + std::make_shared(); - // start the io_service - io_service.run(); + // connect buffer to socket and start reading + start_reading_line_from_socket(buffer, socket, 0, ""); +} +void webserver::start_listening_for_new_connection() { + // create new socket + std::shared_ptr connected_socket = + std::make_shared(io_service); + + // register socket and callback, to wait for new connection + acceptor.async_accept( + *connected_socket, + strand.wrap(boost::bind(&webserver::new_connection_callback, this, + boost::asio::placeholders::error, + connected_socket))); } -void -webserver::start() -{ +void webserver::external_start(hpx::runtime* rt) { + // register this thread to hpx + registration_wrapper wrap(rt, "asio_webserver"); - // forward this call to an external os thread - asio_thread = hpx::compat::thread(hpx::util::bind(&webserver::external_start, - this, - hpx::get_runtime_ptr())); + // register callback for new connections + start_listening_for_new_connection(); + // start the io_service + io_service.run(); } -void -webserver::stop() -{ - - // stop the io service - io_service.stop(); +void webserver::start() { + // forward this call to an external os thread + asio_thread = hpx::compat::thread(hpx::util::bind( + &webserver::external_start, this, hpx::get_runtime_ptr())); +} - // wait for the operating system thread to finish - asio_thread.join(); +void webserver::stop() { + // stop the io service + io_service.stop(); + // wait for the operating system thread to finish + asio_thread.join(); } diff --git a/examples/opencl/mandelbrot/maps/webserver.hpp b/examples/opencl/mandelbrot/maps/webserver.hpp index 3ef434b2..c1e54fdd 100644 --- a/examples/opencl/mandelbrot/maps/webserver.hpp +++ b/examples/opencl/mandelbrot/maps/webserver.hpp @@ -16,112 +16,102 @@ #include -namespace hpx { namespace opencl { namespace examples { namespace mandelbrot { - -class webserver -{ - -public: - - // constructor - webserver(unsigned short port, requesthandler * req_handler_); - - // starts the webserver - void start(); - - // stops the webserver - void stop(); - - -private: - // the main function of the webserver - void external_start(hpx::runtime* rt); - - // enqueues listening for one new connection - void start_listening_for_new_connection(); - - // gets called when a new connection got established - void new_connection_callback( - const boost::system::error_code & error, - std::shared_ptr socket); - - // starts reading a new line from a socket, async call - void start_reading_line_from_socket( - std::shared_ptr buffer, - std::shared_ptr socket, - size_t lines_read, - std::string requested_filename); - - // gets called when a new line got read from a socket - void new_line_read_callback( - const boost::system::error_code & error, - size_t bytes_transferred, - std::shared_ptr socket, - std::shared_ptr buffer, - size_t lines_read, - std::string requested_filename); - - // callback, closes the socket and returns. - // keeps data alive until the write finishes - void close_socket(std::shared_ptr socket, - boost::any keepalive_data); - - // keeps data alive until the write finishes - void dont_close_socket(boost::any keepalive_data); - - // sends '500 Server Error' and closes the socket - void send_server_error_and_close( - std::shared_ptr socket); - - // sends '400 Bad Request' and closes the socket - void send_bad_request_and_close( - std::shared_ptr socket); - - // sends '404 Not Found' and closes the socket - void send_not_found_and_close( - std::shared_ptr socket); - - // reads the first http request line. - // returns the queried filename or "" if an error occured - std::string read_filename_from_request(std::string line); - - // gets called once per request. - // this function reads the filename and creates a corresponding answer. - void process_request(std::shared_ptr socket, - std::string filename); - - // sends data to the client and sends '100 Continue' - void send_data(std::shared_ptr socket, - const char* content_type, - const char* data, - size_t data_size); - - // sends data to the client and sends '100 Continue' - void send_data( - std::shared_ptr socket, - const char* content_type, - std::shared_ptr> data); - - // checks wether a socket is connected or not - bool is_socket_still_connected( - std::shared_ptr socket); - - -private: - requesthandler * req_handler; - boost::asio::io_service io_service; - boost::asio::ip::tcp::acceptor acceptor; - boost::asio::strand strand; - - // the main external worker thread - hpx::compat::thread asio_thread; - +namespace hpx { +namespace opencl { +namespace examples { +namespace mandelbrot { + +class webserver { + public: + // constructor + webserver(unsigned short port, requesthandler* req_handler_); + + // starts the webserver + void start(); + + // stops the webserver + void stop(); + + private: + // the main function of the webserver + void external_start(hpx::runtime* rt); + + // enqueues listening for one new connection + void start_listening_for_new_connection(); + + // gets called when a new connection got established + void new_connection_callback( + const boost::system::error_code& error, + std::shared_ptr socket); + + // starts reading a new line from a socket, async call + void start_reading_line_from_socket( + std::shared_ptr buffer, + std::shared_ptr socket, size_t lines_read, + std::string requested_filename); + + // gets called when a new line got read from a socket + void new_line_read_callback( + const boost::system::error_code& error, size_t bytes_transferred, + std::shared_ptr socket, + std::shared_ptr buffer, size_t lines_read, + std::string requested_filename); + + // callback, closes the socket and returns. + // keeps data alive until the write finishes + void close_socket(std::shared_ptr socket, + boost::any keepalive_data); + + // keeps data alive until the write finishes + void dont_close_socket(boost::any keepalive_data); + + // sends '500 Server Error' and closes the socket + void send_server_error_and_close( + std::shared_ptr socket); + + // sends '400 Bad Request' and closes the socket + void send_bad_request_and_close( + std::shared_ptr socket); + + // sends '404 Not Found' and closes the socket + void send_not_found_and_close( + std::shared_ptr socket); + + // reads the first http request line. + // returns the queried filename or "" if an error occured + std::string read_filename_from_request(std::string line); + + // gets called once per request. + // this function reads the filename and creates a corresponding answer. + void process_request(std::shared_ptr socket, + std::string filename); + + // sends data to the client and sends '100 Continue' + void send_data(std::shared_ptr socket, + const char* content_type, const char* data, size_t data_size); + + // sends data to the client and sends '100 Continue' + void send_data(std::shared_ptr socket, + const char* content_type, + std::shared_ptr> data); + + // checks wether a socket is connected or not + bool is_socket_still_connected( + std::shared_ptr socket); + + private: + requesthandler* req_handler; + boost::asio::io_service io_service; + boost::asio::ip::tcp::acceptor acceptor; + boost::asio::strand strand; + + // the main external worker thread + hpx::compat::thread asio_thread; }; - - -} } } } +} // namespace mandelbrot +} // namespace examples +} // namespace opencl +} // namespace hpx #endif - - diff --git a/examples/opencl/mandelbrot/pngwriter.cpp b/examples/opencl/mandelbrot/pngwriter.cpp index 9d341c56..9d67db37 100644 --- a/examples/opencl/mandelbrot/pngwriter.cpp +++ b/examples/opencl/mandelbrot/pngwriter.cpp @@ -13,176 +13,152 @@ #include #include - /* structure to store PNG image bytes */ -struct mem_encode -{ +struct mem_encode { char *buffer; size_t size; }; - -static void -my_png_write_data(png_structp png_ptr, png_bytep data, png_size_t length) -{ +static void my_png_write_data(png_structp png_ptr, png_bytep data, + png_size_t length) { /* with libpng15 next line causes pointer deference error; use libpng12 */ - struct mem_encode* p=(struct mem_encode*)png_get_io_ptr(png_ptr); /* was png_ptr->io_ptr */ + struct mem_encode *p = + (struct mem_encode *)png_get_io_ptr(png_ptr); /* was png_ptr->io_ptr */ size_t nsize = p->size + length; /* allocate or grow buffer */ - if(p->buffer) - p->buffer = (char *) realloc(p->buffer, nsize); + if (p->buffer) + p->buffer = (char *)realloc(p->buffer, nsize); else - p->buffer = (char *) malloc(nsize); + p->buffer = (char *)malloc(nsize); - if(!p->buffer) - png_error(png_ptr, "Write Error"); + if (!p->buffer) png_error(png_ptr, "Write Error"); /* copy new bytes to end of buffer */ memcpy(p->buffer + p->size, data, length); p->size += length; } -#define die(func, msg) { \ - HPX_THROW_EXCEPTION(hpx::no_success, (func), (msg)); \ -} - - - -static mem_encode save_png_to_mem(std::shared_ptr< std::vector > data, size_t width, size_t height) -{ - - png_structp png_ptr = NULL; - png_infop info_ptr = NULL; - size_t y; - png_uint_32 bytes_per_row; - png_byte **row_pointers = NULL; - - // set up png_ptr - png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); - if(png_ptr == NULL){ - die("png_create_write_struct()", "Returned NULL"); - } - - // set up info_ptr - info_ptr = png_create_info_struct(png_ptr); - if (info_ptr == NULL) { - png_destroy_write_struct(&png_ptr, NULL); - die("png_create_info_stuct()", "Returned NULL"); - } - - /* Set up error handling. */ - if (setjmp(png_jmpbuf(png_ptr))) { - png_destroy_write_struct(&png_ptr, &info_ptr); - die("save_png_to_mem()", "Error callback called!"); - } - - /* Set image attributes. */ - png_set_IHDR(png_ptr, - info_ptr, - (png_uint_32)width, - (png_uint_32)height, - 8, - PNG_COLOR_TYPE_RGB, - PNG_INTERLACE_NONE, - PNG_COMPRESSION_TYPE_DEFAULT, - PNG_FILTER_TYPE_DEFAULT); - - /* Initialize the rows of png */ - bytes_per_row = (png_uint_32) (width * sizeof(char) * 3); - row_pointers = (png_byte **)png_malloc(png_ptr, height * sizeof(png_byte *)); - for (y = 0; y < height; ++y) { - row_pointers[y] = (png_byte *)(data->data() + 3*y*width); - } - - - /* static */ - struct mem_encode state; - - /* initialise - put this before png_write_png() call */ - state.buffer = NULL; - state.size = 0; - - /* if my_png_flush() is not needed, change the arg to NULL */ - png_set_write_fn(png_ptr, &state, my_png_write_data, NULL); - - /* the actual write */ - png_set_rows(png_ptr, info_ptr, row_pointers); - png_write_png(png_ptr, info_ptr, PNG_TRANSFORM_IDENTITY, NULL); - - /* Cleanup. */ - png_free(png_ptr, row_pointers); - - /* Finish writing. */ +#define die(func, msg) \ + { HPX_THROW_EXCEPTION(hpx::no_success, (func), (msg)); } + +static mem_encode save_png_to_mem(std::shared_ptr > data, + size_t width, size_t height) { + png_structp png_ptr = NULL; + png_infop info_ptr = NULL; + size_t y; + png_uint_32 bytes_per_row; + png_byte **row_pointers = NULL; + + // set up png_ptr + png_ptr = png_create_write_struct(PNG_LIBPNG_VER_STRING, NULL, NULL, NULL); + if (png_ptr == NULL) { + die("png_create_write_struct()", "Returned NULL"); + } + + // set up info_ptr + info_ptr = png_create_info_struct(png_ptr); + if (info_ptr == NULL) { + png_destroy_write_struct(&png_ptr, NULL); + die("png_create_info_stuct()", "Returned NULL"); + } + + /* Set up error handling. */ + if (setjmp(png_jmpbuf(png_ptr))) { png_destroy_write_struct(&png_ptr, &info_ptr); + die("save_png_to_mem()", "Error callback called!"); + } - /* now state.buffer contains the PNG image of size s.size bytes */ + /* Set image attributes. */ + png_set_IHDR(png_ptr, info_ptr, (png_uint_32)width, (png_uint_32)height, 8, + PNG_COLOR_TYPE_RGB, PNG_INTERLACE_NONE, + PNG_COMPRESSION_TYPE_DEFAULT, PNG_FILTER_TYPE_DEFAULT); - return state; + /* Initialize the rows of png */ + bytes_per_row = (png_uint_32)(width * sizeof(char) * 3); + row_pointers = (png_byte **)png_malloc(png_ptr, height * sizeof(png_byte *)); + for (y = 0; y < height; ++y) { + row_pointers[y] = (png_byte *)(data->data() + 3 * y * width); + } -} + /* static */ + struct mem_encode state; + /* initialise - put this before png_write_png() call */ + state.buffer = NULL; + state.size = 0; -boost::shared_array create_png(std::shared_ptr< std::vector > data, size_t width, size_t height, size_t * size) -{ + /* if my_png_flush() is not needed, change the arg to NULL */ + png_set_write_fn(png_ptr, &state, my_png_write_data, NULL); - // Create png in memory - mem_encode png_data = save_png_to_mem(data, width, height); + /* the actual write */ + png_set_rows(png_ptr, info_ptr, row_pointers); + png_write_png(png_ptr, info_ptr, PNG_TRANSFORM_IDENTITY, NULL); - // Wrap png in shared_array for auto-deletion - boost::shared_array png(png_data.buffer); + /* Cleanup. */ + png_free(png_ptr, row_pointers); - // write size to external variable - *size = png_data.size; + /* Finish writing. */ + png_destroy_write_struct(&png_ptr, &info_ptr); - // return the png - return png; + /* now state.buffer contains the PNG image of size s.size bytes */ + return state; } -void png_write_to_file(boost::shared_array png, size_t png_size, const char* filename) -{ - - // Open file - std::ofstream file(filename, std::ios::out | std::ios::binary | std::ios::trunc); - - // Ensure that file is open - if(!file.is_open()) - { - die("png_write_to_file()", "Can't open file!"); - } +boost::shared_array create_png(std::shared_ptr > data, + size_t width, size_t height, + size_t *size) { + // Create png in memory + mem_encode png_data = save_png_to_mem(data, width, height); - // Write to file - file.write(png.get(), png_size); + // Wrap png in shared_array for auto-deletion + boost::shared_array png(png_data.buffer); - // Close file - file.close(); + // write size to external variable + *size = png_data.size; + // return the png + return png; } -void save_png(std::shared_ptr< std::vector > data, size_t width, size_t height, const char* filename) -{ +void png_write_to_file(boost::shared_array png, size_t png_size, + const char *filename) { + // Open file + std::ofstream file(filename, + std::ios::out | std::ios::binary | std::ios::trunc); - size_t png_size; + // Ensure that file is open + if (!file.is_open()) { + die("png_write_to_file()", "Can't open file!"); + } - boost::shared_array png = create_png(data, width, height, &png_size); - - png_write_to_file(png, png_size, filename); + // Write to file + file.write(png.get(), png_size); + // Close file + file.close(); } -void save_png_it(std::shared_ptr< std::vector > data, size_t width, size_t height, size_t it) -{ +void save_png(std::shared_ptr > data, size_t width, + size_t height, const char *filename) { + size_t png_size; + + boost::shared_array png = create_png(data, width, height, &png_size); - size_t png_size; + png_write_to_file(png, png_size, filename); +} - boost::shared_array png = create_png(data, width, height, &png_size); +void save_png_it(std::shared_ptr > data, size_t width, + size_t height, size_t it) { + size_t png_size; - std::string filename; - filename.append("Mandelbrot_"); - filename.append(std::to_string(it)); - filename.append(".png"); + boost::shared_array png = create_png(data, width, height, &png_size); - png_write_to_file(png, png_size, filename.c_str()); + std::string filename; + filename.append("Mandelbrot_"); + filename.append(std::to_string(it)); + filename.append(".png"); + png_write_to_file(png, png_size, filename.c_str()); } diff --git a/examples/opencl/mandelbrot/pngwriter.hpp b/examples/opencl/mandelbrot/pngwriter.hpp index e52891ac..c025b9be 100644 --- a/examples/opencl/mandelbrot/pngwriter.hpp +++ b/examples/opencl/mandelbrot/pngwriter.hpp @@ -6,19 +6,21 @@ #ifndef MANDELBROT_PNG_WRITER_H_ #define MANDELBROT_PNG_WRITER_H_ - #include #include #include #include // writes data to png file -void save_png(std::shared_ptr< std::vector > data, size_t width, size_t height, const char* filename); +void save_png(std::shared_ptr > data, size_t width, + size_t height, const char* filename); -boost::shared_array create_png(std::shared_ptr< std::vector > data, size_t width, size_t height, size_t * size); +boost::shared_array create_png(std::shared_ptr > data, + size_t width, size_t height, size_t* size); -void png_write_to_file(boost::shared_array png, size_t png_size, const char* filename); +void png_write_to_file(boost::shared_array png, size_t png_size, + const char* filename); -void save_pngi_it(std::shared_ptr< std::vector > data, size_t width, size_t height, size_t it); +void save_pngi_it(std::shared_ptr > data, size_t width, + size_t height, size_t it); #endif - diff --git a/examples/opencl/mandelbrot/timer.hpp b/examples/opencl/mandelbrot/timer.hpp index 8e63bbab..1bc928b8 100644 --- a/examples/opencl/mandelbrot/timer.hpp +++ b/examples/opencl/mandelbrot/timer.hpp @@ -10,32 +10,20 @@ static boost::posix_time::ptime start_time; -static void timer_start() -{ - - // Measure start time - start_time = boost::posix_time::microsec_clock::local_time(); - +static void timer_start() { + // Measure start time + start_time = boost::posix_time::microsec_clock::local_time(); } -static double timer_stop() -{ - - // Measure stop time - boost::posix_time::ptime stop_time = - boost::posix_time::microsec_clock::local_time(); - - // Calculate difference - boost::posix_time::time_duration diff = stop_time - start_time; +static double timer_stop() { + // Measure stop time + boost::posix_time::ptime stop_time = + boost::posix_time::microsec_clock::local_time(); - return diff.total_microseconds() / 1000.0; + // Calculate difference + boost::posix_time::time_duration diff = stop_time - start_time; + return diff.total_microseconds() / 1000.0; } - - - - -#endif // BENCHMARK_TIMER_H_ - - +#endif // BENCHMARK_TIMER_H_ diff --git a/examples/opencl/mandelbrot/work_queue.hpp b/examples/opencl/mandelbrot/work_queue.hpp index d007a52b..9283fb39 100644 --- a/examples/opencl/mandelbrot/work_queue.hpp +++ b/examples/opencl/mandelbrot/work_queue.hpp @@ -12,7 +12,7 @@ /** * @brief A thread safe multi-consumer-buffer * - * The purpose of this class is to receive workloads from one master and + * The purpose of this class is to receive workloads from one master and * queue it until one of many workers request it. * The workers will then compute the workload and return it to this class, * so the master can receive the computed workload @@ -20,180 +20,148 @@ * Therefore, this class needs to be completely threadsafe. */ template -class work_queue -{ - -public: - - /** - * @brief Sends an undone workload to a worker. - * - * Gets called by the workers. - * - * @param wp Returns a workload that needs computation - * @return False on end of work - */ - bool request(T* wp); - - /** - * @brief Hands in a finished workload from a worker - * - * Gets called by the workers. - * - * @param done_workload The ready computed workload - */ - void deliver(const T &done_workload); - - /** - * @brief Adds undone workloads to the work pool. - * - * Gets called by the master. - * - * Calling this function after finish() will lead to - * undefined behaviour. - * - * @param undone_workload A new workload - */ - void add_work(const T &undone_workload); - - /** - * @brief Retrieves a finished work packet - * - * Gets called by the master. - * - * @param done_workload Returns a finished workload - * @return false to signal all work done - */ - bool retrieve_finished_work(T* done_workload); - - /** - * @brief Signals that all work is done - * - * Gets called by the master. - */ - void finish(); -public: - work_queue(); - -private: - // holds the undone work - fifo unfinished_work; - - // holds the done work - fifo finished_work; - - // saves how much work is left - std::atomic num_work; - std::atomic num_requested; - std::atomic num_delivered; - - // is true as the end-of-work-signal arrives - std::atomic_bool finished; - +class work_queue { + public: + /** + * @brief Sends an undone workload to a worker. + * + * Gets called by the workers. + * + * @param wp Returns a workload that needs computation + * @return False on end of work + */ + bool request(T* wp); + + /** + * @brief Hands in a finished workload from a worker + * + * Gets called by the workers. + * + * @param done_workload The ready computed workload + */ + void deliver(const T& done_workload); + + /** + * @brief Adds undone workloads to the work pool. + * + * Gets called by the master. + * + * Calling this function after finish() will lead to + * undefined behaviour. + * + * @param undone_workload A new workload + */ + void add_work(const T& undone_workload); + + /** + * @brief Retrieves a finished work packet + * + * Gets called by the master. + * + * @param done_workload Returns a finished workload + * @return false to signal all work done + */ + bool retrieve_finished_work(T* done_workload); + + /** + * @brief Signals that all work is done + * + * Gets called by the master. + */ + void finish(); + + public: + work_queue(); + + private: + // holds the undone work + fifo unfinished_work; + + // holds the done work + fifo finished_work; + + // saves how much work is left + std::atomic num_work; + std::atomic num_requested; + std::atomic num_delivered; + + // is true as the end-of-work-signal arrives + std::atomic_bool finished; }; - -template -work_queue::work_queue() -{ - - num_work = 0; - num_requested = 0; - num_delivered = 0; - finished = false; -} - -template -void work_queue::add_work(const T &undone_workload) -{ - - // Make sure queue is not funished yet - BOOST_ASSERT(!finished); - - // Add the workload packet - unfinished_work.push(undone_workload); - +template +work_queue::work_queue() { + num_work = 0; + num_requested = 0; + num_delivered = 0; + finished = false; } -template -bool work_queue::request(T* undone_workload) -{ - - // Store number of workloads that are currently active - // needs to be done before retrieving work packet to prefent race condition - num_work++; - - // get new work packet - if(!unfinished_work.pop(undone_workload)) - { - // set input queue state to finished. - // from now on we will only wait for returned packets. - // as soon as the num_work is zero, we know everything is done. - finished = true; - - // decrease work counter as we couldn't get a work packet - size_t work_left = --num_work; - - // if all work packets got returned, close the result queue - if(work_left == 0) - { - finished_work.finish(); - } - - // stop worker, no more work to be done - return false; - } - - num_requested++; - // successfully aquired new work packet. - return true; +template +void work_queue::add_work(const T& undone_workload) { + // Make sure queue is not funished yet + BOOST_ASSERT(!finished); + // Add the workload packet + unfinished_work.push(undone_workload); } -template -void work_queue::deliver(const T &done_workload) -{ - num_delivered++; - // add to finished queue - finished_work.push(done_workload); - - // check wether all work packets got returned +template +bool work_queue::request(T* undone_workload) { + // Store number of workloads that are currently active + // needs to be done before retrieving work packet to prefent race condition + num_work++; + + // get new work packet + if (!unfinished_work.pop(undone_workload)) { + // set input queue state to finished. + // from now on we will only wait for returned packets. + // as soon as the num_work is zero, we know everything is done. + finished = true; + + // decrease work counter as we couldn't get a work packet size_t work_left = --num_work; // if all work packets got returned, close the result queue - if(finished && work_left == 0) - { - finished_work.finish(); + if (work_left == 0) { + finished_work.finish(); } -} - - -template -bool work_queue::retrieve_finished_work(T* done_workload) -{ - - return finished_work.pop(done_workload); + // stop worker, no more work to be done + return false; + } + num_requested++; + // successfully aquired new work packet. + return true; } -template -void work_queue::finish() -{ - - // close unfinished work queue - unfinished_work.finish(); - // Working with atomics, no sync necessary - // finished = true; - +template +void work_queue::deliver(const T& done_workload) { + num_delivered++; + // add to finished queue + finished_work.push(done_workload); + + // check wether all work packets got returned + size_t work_left = --num_work; + + // if all work packets got returned, close the result queue + if (finished && work_left == 0) { + finished_work.finish(); + } } +template +bool work_queue::retrieve_finished_work(T* done_workload) { + return finished_work.pop(done_workload); +} +template +void work_queue::finish() { + // close unfinished work queue + unfinished_work.finish(); + // Working with atomics, no sync necessary + // finished = true; +} - - - - - - -#endif +#endif diff --git a/examples/opencl/mandelbrot/workload.cpp b/examples/opencl/mandelbrot/workload.cpp index 50826465..1d74e90e 100644 --- a/examples/opencl/mandelbrot/workload.cpp +++ b/examples/opencl/mandelbrot/workload.cpp @@ -5,32 +5,21 @@ #include "workload.hpp" -workload::workload(size_t num_pixels_x_, - size_t num_pixels_y_, - double topleft_x_, - double topleft_y_, - double hor_pixdist_x_, - double hor_pixdist_y_, - double vert_pixdist_x_, - double vert_pixdist_y_, - size_t img_id_, - size_t pos_in_img_x_, - size_t pos_in_img_y_, - size_t line_offset_) - : pixeldata(hpx::serialization::serialize_buffer()), - num_pixels_x(num_pixels_x_), - num_pixels_y(num_pixels_y_), - topleft_x(topleft_x_), - topleft_y(topleft_y_), - hor_pixdist_x(hor_pixdist_x_), - hor_pixdist_y(hor_pixdist_y_), - vert_pixdist_x(vert_pixdist_x_), - vert_pixdist_y(vert_pixdist_y_), - img_id(img_id_), - pos_in_img_x(pos_in_img_x_), - pos_in_img_y(pos_in_img_y_), - line_offset(line_offset_) - {}; - - - +workload::workload(size_t num_pixels_x_, size_t num_pixels_y_, + double topleft_x_, double topleft_y_, double hor_pixdist_x_, + double hor_pixdist_y_, double vert_pixdist_x_, + double vert_pixdist_y_, size_t img_id_, size_t pos_in_img_x_, + size_t pos_in_img_y_, size_t line_offset_) + : pixeldata(hpx::serialization::serialize_buffer()), + num_pixels_x(num_pixels_x_), + num_pixels_y(num_pixels_y_), + topleft_x(topleft_x_), + topleft_y(topleft_y_), + hor_pixdist_x(hor_pixdist_x_), + hor_pixdist_y(hor_pixdist_y_), + vert_pixdist_x(vert_pixdist_x_), + vert_pixdist_y(vert_pixdist_y_), + img_id(img_id_), + pos_in_img_x(pos_in_img_x_), + pos_in_img_y(pos_in_img_y_), + line_offset(line_offset_){}; diff --git a/examples/opencl/mandelbrot/workload.hpp b/examples/opencl/mandelbrot/workload.hpp index f939e453..adc68af4 100644 --- a/examples/opencl/mandelbrot/workload.hpp +++ b/examples/opencl/mandelbrot/workload.hpp @@ -17,44 +17,32 @@ * A workload, defines a mandelbrot line and will be filled by workers with * the calculated results */ -class workload -{ - - public: - workload(size_t num_pixels_x_, - size_t num_pixels_y_, - double topleft_x_, - double topleft_y_, - double hor_pixdist_x_, - double hor_pixdist_y_, - double vert_pixdist_x_, - double vert_pixdist_y_, - size_t img_id_, - size_t pos_in_img_x_, - size_t pos_in_img_y_, - size_t line_offset_); - - // Will hold the calculated pixels - hpx::serialization::serialize_buffer pixeldata; - // the number of pixels on the rectangle - size_t num_pixels_x; - size_t num_pixels_y; - // the top left point of the rectangle - double topleft_x; - double topleft_y; - // the horizontal offset between pixels - double hor_pixdist_x; - double hor_pixdist_y; - // the vertical offset between pixels - double vert_pixdist_x; - double vert_pixdist_y; - // metadata for correct mapping to image - size_t img_id; - size_t pos_in_img_x; - size_t pos_in_img_y; - size_t line_offset; - +class workload { + public: + workload(size_t num_pixels_x_, size_t num_pixels_y_, double topleft_x_, + double topleft_y_, double hor_pixdist_x_, double hor_pixdist_y_, + double vert_pixdist_x_, double vert_pixdist_y_, size_t img_id_, + size_t pos_in_img_x_, size_t pos_in_img_y_, size_t line_offset_); + + // Will hold the calculated pixels + hpx::serialization::serialize_buffer pixeldata; + // the number of pixels on the rectangle + size_t num_pixels_x; + size_t num_pixels_y; + // the top left point of the rectangle + double topleft_x; + double topleft_y; + // the horizontal offset between pixels + double hor_pixdist_x; + double hor_pixdist_y; + // the vertical offset between pixels + double vert_pixdist_x; + double vert_pixdist_y; + // metadata for correct mapping to image + size_t img_id; + size_t pos_in_img_x; + size_t pos_in_img_y; + size_t line_offset; }; #endif - diff --git a/external/asio/boost/asio/ip/basic_resolver_iterator.hpp b/external/asio/boost/asio/ip/basic_resolver_iterator.hpp index 0dfec47b..a271f537 100644 --- a/external/asio/boost/asio/ip/basic_resolver_iterator.hpp +++ b/external/asio/boost/asio/ip/basic_resolver_iterator.hpp @@ -12,8 +12,8 @@ #define BOOST_ASIO_IP_BASIC_RESOLVER_ITERATOR_HPP #if defined(_MSC_VER) && (_MSC_VER >= 1200) -# pragma once -#endif // defined(_MSC_VER) && (_MSC_VER >= 1200) +#pragma once +#endif // defined(_MSC_VER) && (_MSC_VER >= 1200) #include #include @@ -45,9 +45,8 @@ namespace ip { * @e Shared @e objects: Unsafe. */ template -class basic_resolver_iterator -{ -public: +class basic_resolver_iterator { + public: /// The type used for the distance between two iterators. typedef std::ptrdiff_t difference_type; @@ -64,32 +63,23 @@ class basic_resolver_iterator typedef std::forward_iterator_tag iterator_category; /// Default constructor creates an end iterator. - basic_resolver_iterator() - : index_(0) - { - } + basic_resolver_iterator() : index_(0) {} + + basic_resolver_iterator(basic_resolver_iterator const& o) + : index_(o.index_), values_(o.values_) {} - basic_resolver_iterator(basic_resolver_iterator const & o) - : index_(o.index_) - , values_(o.values_) - {} - - basic_resolver_iterator(basic_resolver_iterator && o) - : index_(o.index_) - , values_(std::move(o.values_)) - { - o.values_.reset(); - o.index_ = 0; + basic_resolver_iterator(basic_resolver_iterator&& o) + : index_(o.index_), values_(std::move(o.values_)) { + o.values_.reset(); + o.index_ = 0; } /// Create an iterator from an addrinfo list returned by getaddrinfo. static basic_resolver_iterator create( boost::asio::detail::addrinfo_type* address_info, - const std::string& host_name, const std::string& service_name) - { + const std::string& host_name, const std::string& service_name) { basic_resolver_iterator iter; - if (!address_info) - return std::move(iter); + if (!address_info) return std::move(iter); std::string actual_host_name = host_name; if (address_info->ai_canonname) @@ -97,19 +87,16 @@ class basic_resolver_iterator iter.values_.reset(new values_type); - while (address_info) - { - if (address_info->ai_family == PF_INET - || address_info->ai_family == PF_INET6) - { - using namespace std; // For memcpy. + while (address_info) { + if (address_info->ai_family == PF_INET || + address_info->ai_family == PF_INET6) { + using namespace std; // For memcpy. typename InternetProtocol::endpoint endpoint; endpoint.resize(static_cast(address_info->ai_addrlen)); memcpy(endpoint.data(), address_info->ai_addr, - address_info->ai_addrlen); - iter.values_->push_back( - basic_resolver_entry(endpoint, - actual_host_name, service_name)); + address_info->ai_addrlen); + iter.values_->push_back(basic_resolver_entry( + endpoint, actual_host_name, service_name)); } address_info = address_info->ai_next; } @@ -120,38 +107,32 @@ class basic_resolver_iterator /// Create an iterator from an endpoint, host name and service name. static basic_resolver_iterator create( const typename InternetProtocol::endpoint& endpoint, - const std::string& host_name, const std::string& service_name) - { + const std::string& host_name, const std::string& service_name) { basic_resolver_iterator iter; iter.values_.reset(new values_type); - iter.values_->push_back( - basic_resolver_entry( - endpoint, host_name, service_name)); + iter.values_->push_back(basic_resolver_entry( + endpoint, host_name, service_name)); return std::move(iter); } /// Dereference an iterator. - const basic_resolver_entry& operator*() const - { + const basic_resolver_entry& operator*() const { return dereference(); } /// Dereference an iterator. - const basic_resolver_entry* operator->() const - { + const basic_resolver_entry* operator->() const { return &dereference(); } /// Increment operator (prefix). - basic_resolver_iterator& operator++() - { + basic_resolver_iterator& operator++() { increment(); return *this; } /// Increment operator (postfix). - basic_resolver_iterator operator++(int) - { + basic_resolver_iterator operator++(int) { basic_resolver_iterator tmp(*this); ++*this; return tmp; @@ -159,40 +140,32 @@ class basic_resolver_iterator /// Test two iterators for equality. friend bool operator==(const basic_resolver_iterator& a, - const basic_resolver_iterator& b) - { + const basic_resolver_iterator& b) { return a.equal(b); } /// Test two iterators for inequality. friend bool operator!=(const basic_resolver_iterator& a, - const basic_resolver_iterator& b) - { + const basic_resolver_iterator& b) { return !a.equal(b); } -private: - void increment() - { - if (++index_ == values_->size()) - { + private: + void increment() { + if (++index_ == values_->size()) { // Reset state to match a default constructed end iterator. values_.reset(); index_ = 0; } } - bool equal(const basic_resolver_iterator& other) const - { - if (!values_ && !other.values_) - return true; - if (values_ != other.values_) - return false; + bool equal(const basic_resolver_iterator& other) const { + if (!values_ && !other.values_) return true; + if (values_ != other.values_) return false; return index_ == other.index_; } - const basic_resolver_entry& dereference() const - { + const basic_resolver_entry& dereference() const { return (*values_)[index_]; } @@ -201,10 +174,10 @@ class basic_resolver_iterator detail::shared_ptr values_; }; -} // namespace ip -} // namespace asio -} // namespace boost +} // namespace ip +} // namespace asio +} // namespace boost #include -#endif // BOOST_ASIO_IP_BASIC_RESOLVER_ITERATOR_HPP +#endif // BOOST_ASIO_IP_BASIC_RESOLVER_ITERATOR_HPP diff --git a/external/atomic/boost/atomic.hpp b/external/atomic/boost/atomic.hpp index 59aa312c..5f70fc51 100644 --- a/external/atomic/boost/atomic.hpp +++ b/external/atomic/boost/atomic.hpp @@ -18,168 +18,168 @@ namespace boost { -template +template class atomic : public detail::atomic::internal_atomic { -public: - typedef detail::atomic::internal_atomic super; - - atomic() {} - explicit atomic(T v) : super(v) {} -private: - atomic(const atomic &); - void operator=(const atomic &); -}; + public: + typedef detail::atomic::internal_atomic super; + + atomic() {} + explicit atomic(T v) : super(v) {} + private: + atomic(const atomic &); + void operator=(const atomic &); +}; -template<> +template <> class atomic : private detail::atomic::internal_atomic { -public: - typedef detail::atomic::internal_atomic super; - - atomic() {} - explicit atomic(bool v) : super(v) {} - - using super::load; - using super::store; - using super::compare_exchange_strong; - using super::compare_exchange_weak; - using super::exchange; - using super::is_lock_free; - - operator bool(void) const volatile {return load();} - bool operator=(bool v) volatile {store(v); return v;} -private: - atomic(const atomic &); - void operator=(const atomic &); + public: + typedef detail::atomic::internal_atomic super; + + atomic() {} + explicit atomic(bool v) : super(v) {} + + using super::compare_exchange_strong; + using super::compare_exchange_weak; + using super::exchange; + using super::is_lock_free; + using super::load; + using super::store; + + operator bool(void) const volatile { return load(); } + bool operator=(bool v) volatile { + store(v); + return v; + } + + private: + atomic(const atomic &); + void operator=(const atomic &); }; -template<> -class atomic : private detail::atomic - ::internal_atomic { -public: - typedef detail::atomic::internal_atomic super; - - atomic() {} - explicit atomic(void * p) : super(p) {} - using super::load; - using super::store; - using super::compare_exchange_strong; - using super::compare_exchange_weak; - using super::exchange; - using super::is_lock_free; - - operator void *(void) const volatile {return load();} - void * operator=(void * v) volatile {store(v); return v;} - -private: - atomic(const atomic &); - void * operator=(const atomic &); +template <> +class atomic + : private detail::atomic ::internal_atomic { + public: + typedef detail::atomic::internal_atomic super; + + atomic() {} + explicit atomic(void *p) : super(p) {} + using super::compare_exchange_strong; + using super::compare_exchange_weak; + using super::exchange; + using super::is_lock_free; + using super::load; + using super::store; + + operator void *(void) const volatile { return load(); } + void *operator=(void *v) volatile { + store(v); + return v; + } + + private: + atomic(const atomic &); + void *operator=(const atomic &); }; /* FIXME: pointer arithmetic still missing */ -template +template class atomic : private detail::atomic::internal_atomic { -public: - typedef detail::atomic::internal_atomic super; - - atomic() {} - explicit atomic(T * p) : super(static_cast(p)) {} - - T *load(memory_order order=memory_order_seq_cst) const volatile - { - return static_cast(super::load(order)); - } - void store(T *v, memory_order order=memory_order_seq_cst) volatile - { - super::store(static_cast(v), order); - } - bool compare_exchange_strong( - T * &expected, - T * desired, - memory_order order=memory_order_seq_cst) volatile - { - return compare_exchange_strong(expected, desired, order, - detail::atomic::calculate_failure_order(order)); - } - bool compare_exchange_weak( - T * &expected, - T *desired, - memory_order order=memory_order_seq_cst) volatile - { - return compare_exchange_weak(expected, desired, order, - detail::atomic::calculate_failure_order(order)); - } - bool compare_exchange_weak( - T * &expected, - T *desired, - memory_order success_order, - memory_order failure_order) volatile - { - void * expected_=static_cast(expected); - void * desired_=static_cast(desired); - bool success=super::compare_exchange_weak(expected_, desired_, - success_order, failure_order); - expected=static_cast(expected_); - return success; - } - bool compare_exchange_strong( - T * &expected, - T *desired, - memory_order success_order, - memory_order failure_order) volatile - { - void * expected_=static_cast(expected); - void * desired_=static_cast(desired); - bool success=super::compare_exchange_strong(expected_, desired_, - success_order, failure_order); - expected=static_cast(expected_); - return success; - } - T *exchange(T * replacement, memory_order order=memory_order_seq_cst) volatile - { - return static_cast(super::exchange(static_cast(replacement), order)); - } - using super::is_lock_free; - - operator T *(void) const volatile {return load();} - T * operator=(T * v) volatile {store(v); return v;} - - T * fetch_add(ptrdiff_t diff, memory_order order=memory_order_seq_cst) volatile - { - return static_cast(super::fetch_add(diff*sizeof(T), order)); - } - T * fetch_sub(ptrdiff_t diff, memory_order order=memory_order_seq_cst) volatile - { - return static_cast(super::fetch_sub(diff*sizeof(T), order)); - } - - T *operator++(void) volatile {return fetch_add(1)+1;} - T *operator++(int) volatile {return fetch_add(1);} - T *operator--(void) volatile {return fetch_sub(1)-1;} - T *operator--(int) volatile {return fetch_sub(1);} -private: - atomic(const atomic &); - T * operator=(const atomic &); + public: + typedef detail::atomic::internal_atomic super; + + atomic() {} + explicit atomic(T *p) : super(static_cast(p)) {} + + T *load(memory_order order = memory_order_seq_cst) const volatile { + return static_cast(super::load(order)); + } + void store(T *v, memory_order order = memory_order_seq_cst) volatile { + super::store(static_cast(v), order); + } + bool compare_exchange_strong( + T *&expected, T *desired, + memory_order order = memory_order_seq_cst) volatile { + return compare_exchange_strong( + expected, desired, order, + detail::atomic::calculate_failure_order(order)); + } + bool compare_exchange_weak( + T *&expected, T *desired, + memory_order order = memory_order_seq_cst) volatile { + return compare_exchange_weak( + expected, desired, order, + detail::atomic::calculate_failure_order(order)); + } + bool compare_exchange_weak(T *&expected, T *desired, + memory_order success_order, + memory_order failure_order) volatile { + void *expected_ = static_cast(expected); + void *desired_ = static_cast(desired); + bool success = super::compare_exchange_weak(expected_, desired_, + success_order, failure_order); + expected = static_cast(expected_); + return success; + } + bool compare_exchange_strong(T *&expected, T *desired, + memory_order success_order, + memory_order failure_order) volatile { + void *expected_ = static_cast(expected); + void *desired_ = static_cast(desired); + bool success = super::compare_exchange_strong(expected_, desired_, + success_order, failure_order); + expected = static_cast(expected_); + return success; + } + T *exchange(T *replacement, + memory_order order = memory_order_seq_cst) volatile { + return static_cast( + super::exchange(static_cast(replacement), order)); + } + using super::is_lock_free; + + operator T *(void) const volatile { return load(); } + T *operator=(T *v) volatile { + store(v); + return v; + } + + T *fetch_add(ptrdiff_t diff, + memory_order order = memory_order_seq_cst) volatile { + return static_cast(super::fetch_add(diff * sizeof(T), order)); + } + T *fetch_sub(ptrdiff_t diff, + memory_order order = memory_order_seq_cst) volatile { + return static_cast(super::fetch_sub(diff * sizeof(T), order)); + } + + T *operator++(void) volatile { return fetch_add(1) + 1; } + T *operator++(int) volatile { return fetch_add(1); } + T *operator--(void) volatile { return fetch_sub(1) - 1; } + T *operator--(int) volatile { return fetch_sub(1); } + + private: + atomic(const atomic &); + T *operator=(const atomic &); }; class atomic_flag : private atomic { -public: - typedef atomic super; - using super::is_lock_free; - - atomic_flag(bool initial_state) : super(initial_state?1:0) {} - atomic_flag() {} - - bool test_and_set(memory_order order=memory_order_seq_cst) - { - return super::exchange(1, order) != 0; - } - void clear(memory_order order=memory_order_seq_cst) - { - super::store(0, order); - } + public: + typedef atomic super; + using super::is_lock_free; + + atomic_flag(bool initial_state) : super(initial_state ? 1 : 0) {} + atomic_flag() {} + + bool test_and_set(memory_order order = memory_order_seq_cst) { + return super::exchange(1, order) != 0; + } + void clear(memory_order order = memory_order_seq_cst) { + super::store(0, order); + } }; typedef atomic atomic_char; @@ -208,18 +208,16 @@ typedef atomic<__uint128_t> atomic_uint128_t; typedef atomic<__int128_t> atomic_int128_t; #endif #if BOOST_MSVC >= 1500 && (defined(_M_IA64) || defined(_M_AMD64)) - && defined(BOOST_ATOMIC_HAVE_SSE2) -typedef atomic<__m128i> atomic_uint128_t; +&&defined(BOOST_ATOMIC_HAVE_SSE2) typedef atomic<__m128i> atomic_uint128_t; typedef atomic<__m128i> atomic_int128_t; #endif -typedef atomic atomic_address; +typedef atomic atomic_address; typedef atomic atomic_bool; -static inline void atomic_thread_fence(memory_order order) -{ - detail::atomic::platform_atomic_thread_fence(order); +static inline void atomic_thread_fence(memory_order order) { + detail::atomic::platform_atomic_thread_fence(order); } -} +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/base.hpp b/external/atomic/boost/atomic/detail/base.hpp index 8e22cc94..140c5c7f 100644 --- a/external/atomic/boost/atomic/detail/base.hpp +++ b/external/atomic/boost/atomic/detail/base.hpp @@ -15,183 +15,182 @@ namespace boost { namespace detail { namespace atomic { -static inline memory_order calculate_failure_order(memory_order order) -{ - switch(order) { - case memory_order_acq_rel: return memory_order_acquire; - case memory_order_release: return memory_order_relaxed; - default: return order; - } +static inline memory_order calculate_failure_order(memory_order order) { + switch (order) { + case memory_order_acq_rel: + return memory_order_acquire; + case memory_order_release: + return memory_order_relaxed; + default: + return order; + } } -template +template class platform_atomic : public fallback_atomic { -public: - typedef fallback_atomic super; + public: + typedef fallback_atomic super; - explicit platform_atomic(T v) : super(v) {} - platform_atomic() {} -protected: - typedef typename super::integral_type integral_type; + explicit platform_atomic(T v) : super(v) {} + platform_atomic() {} + + protected: + typedef typename super::integral_type integral_type; }; -template +template class platform_atomic_integral : public build_atomic_from_exchange > { -public: - typedef build_atomic_from_exchange > super; + public: + typedef build_atomic_from_exchange > super; + + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral() {} - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral() {} -protected: - typedef typename super::integral_type integral_type; + protected: + typedef typename super::integral_type integral_type; }; -template -static inline void platform_atomic_thread_fence(T order) -{ - /* FIXME: this does not provide - sequential consistency, need one global - variable for that... */ - platform_atomic a; - a.exchange(T(), order); +template +static inline void platform_atomic_thread_fence(T order) { + /* FIXME: this does not provide + sequential consistency, need one global + variable for that... */ + platform_atomic a; + a.exchange(T(), order); } -template::test> +template ::test> class internal_atomic; -template -class internal_atomic : private detail::atomic::platform_atomic { -public: - typedef detail::atomic::platform_atomic super; - - internal_atomic() {} - explicit internal_atomic(T v) : super(v) {} - - operator T(void) const volatile {return load();} - T operator=(T v) volatile {store(v); return v;} - - using super::is_lock_free; - using super::load; - using super::store; - using super::exchange; - - bool compare_exchange_strong( - T &expected, - T desired, - memory_order order=memory_order_seq_cst) volatile - { - return super::compare_exchange_strong(expected, desired, - order, calculate_failure_order(order)); - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order order=memory_order_seq_cst) volatile - { - return super::compare_exchange_strong(expected, desired, - order, calculate_failure_order(order)); - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return super::compare_exchange_strong(expected, desired, - success_order, failure_order); - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return super::compare_exchange_strong(expected, desired, - success_order, failure_order); - } -private: - internal_atomic(const internal_atomic &); - void operator=(const internal_atomic &); +template +class internal_atomic + : private detail::atomic::platform_atomic { + public: + typedef detail::atomic::platform_atomic super; + + internal_atomic() {} + explicit internal_atomic(T v) : super(v) {} + + operator T(void) const volatile { return load(); } + T operator=(T v) volatile { + store(v); + return v; + } + + using super::exchange; + using super::is_lock_free; + using super::load; + using super::store; + + bool compare_exchange_strong( + T &expected, T desired, + memory_order order = memory_order_seq_cst) volatile { + return super::compare_exchange_strong(expected, desired, order, + calculate_failure_order(order)); + } + bool compare_exchange_weak( + T &expected, T desired, + memory_order order = memory_order_seq_cst) volatile { + return super::compare_exchange_strong(expected, desired, order, + calculate_failure_order(order)); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + return super::compare_exchange_strong(expected, desired, success_order, + failure_order); + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return super::compare_exchange_strong(expected, desired, success_order, + failure_order); + } + + private: + internal_atomic(const internal_atomic &); + void operator=(const internal_atomic &); }; -template +template class internal_atomic : private detail::atomic::platform_atomic_integral { -public: - typedef detail::atomic::platform_atomic_integral super; - typedef typename super::integral_type integral_type; - - internal_atomic() {} - explicit internal_atomic(T v) : super(v) {} - - using super::is_lock_free; - using super::load; - using super::store; - using super::exchange; - using super::fetch_add; - using super::fetch_sub; - using super::fetch_and; - using super::fetch_or; - using super::fetch_xor; - - operator integral_type(void) const volatile {return load();} - integral_type operator=(integral_type v) volatile {store(v); return v;} - - integral_type operator&=(integral_type c) volatile {return fetch_and(c)&c;} - integral_type operator|=(integral_type c) volatile {return fetch_or(c)|c;} - integral_type operator^=(integral_type c) volatile {return fetch_xor(c)^c;} - - integral_type operator+=(integral_type c) volatile {return fetch_add(c)+c;} - integral_type operator-=(integral_type c) volatile {return fetch_sub(c)-c;} - - integral_type operator++(void) volatile {return fetch_add(1)+1;} - integral_type operator++(int) volatile {return fetch_add(1);} - integral_type operator--(void) volatile {return fetch_sub(1)-1;} - integral_type operator--(int) volatile {return fetch_sub(1);} - - bool compare_exchange_strong( - integral_type &expected, - integral_type desired, - memory_order order=memory_order_seq_cst) volatile - { - return super::compare_exchange_strong(expected, desired, - order, calculate_failure_order(order)); - } - bool compare_exchange_weak( - integral_type &expected, - integral_type desired, - memory_order order=memory_order_seq_cst) volatile - { - return super::compare_exchange_strong(expected, desired, order, - calculate_failure_order(order)); - } - bool compare_exchange_strong( - integral_type &expected, - integral_type desired, - memory_order success_order, - memory_order failure_order) volatile - { - return super::compare_exchange_strong(expected, desired, - success_order, failure_order); - } - bool compare_exchange_weak( - integral_type &expected, - integral_type desired, - memory_order success_order, - memory_order failure_order) volatile - { - return super::compare_exchange_strong(expected, desired, - success_order, failure_order); - } -private: - internal_atomic(const internal_atomic &); - void operator=(const internal_atomic &); + public: + typedef detail::atomic::platform_atomic_integral super; + typedef typename super::integral_type integral_type; + + internal_atomic() {} + explicit internal_atomic(T v) : super(v) {} + + using super::exchange; + using super::fetch_add; + using super::fetch_and; + using super::fetch_or; + using super::fetch_sub; + using super::fetch_xor; + using super::is_lock_free; + using super::load; + using super::store; + + operator integral_type(void) const volatile { return load(); } + integral_type operator=(integral_type v) volatile { + store(v); + return v; + } + + integral_type operator&=(integral_type c) volatile { + return fetch_and(c) & c; + } + integral_type operator|=(integral_type c) volatile { return fetch_or(c) | c; } + integral_type operator^=(integral_type c) volatile { + return fetch_xor(c) ^ c; + } + + integral_type operator+=(integral_type c) volatile { + return fetch_add(c) + c; + } + integral_type operator-=(integral_type c) volatile { + return fetch_sub(c) - c; + } + + integral_type operator++(void) volatile { return fetch_add(1) + 1; } + integral_type operator++(int) volatile { return fetch_add(1); } + integral_type operator--(void) volatile { return fetch_sub(1) - 1; } + integral_type operator--(int) volatile { return fetch_sub(1); } + + bool compare_exchange_strong( + integral_type &expected, integral_type desired, + memory_order order = memory_order_seq_cst) volatile { + return super::compare_exchange_strong(expected, desired, order, + calculate_failure_order(order)); + } + bool compare_exchange_weak( + integral_type &expected, integral_type desired, + memory_order order = memory_order_seq_cst) volatile { + return super::compare_exchange_strong(expected, desired, order, + calculate_failure_order(order)); + } + bool compare_exchange_strong(integral_type &expected, integral_type desired, + memory_order success_order, + memory_order failure_order) volatile { + return super::compare_exchange_strong(expected, desired, success_order, + failure_order); + } + bool compare_exchange_weak(integral_type &expected, integral_type desired, + memory_order success_order, + memory_order failure_order) volatile { + return super::compare_exchange_strong(expected, desired, success_order, + failure_order); + } + + private: + internal_atomic(const internal_atomic &); + void operator=(const internal_atomic &); }; -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/builder.hpp b/external/atomic/boost/atomic/detail/builder.hpp index 22b59121..f2c0edf1 100644 --- a/external/atomic/boost/atomic/detail/builder.hpp +++ b/external/atomic/boost/atomic/detail/builder.hpp @@ -23,39 +23,38 @@ given a Base that implements: generates exchange and compare_exchange_strong */ -template +template class build_exchange : public Base { -public: - typedef typename Base::integral_type integral_type; - - using Base::load; - using Base::compare_exchange_weak; - - bool compare_exchange_strong( - integral_type &expected, - integral_type desired, - memory_order success_order, - memory_order failure_order) volatile - { - integral_type expected_save=expected; - while(true) { - if (compare_exchange_weak(expected, desired, - success_order, failure_order)) return true; - if (expected_save!=expected) return false; - expected=expected_save; - } + public: + typedef typename Base::integral_type integral_type; + + using Base::compare_exchange_weak; + using Base::load; + + bool compare_exchange_strong(integral_type &expected, integral_type desired, + memory_order success_order, + memory_order failure_order) volatile { + integral_type expected_save = expected; + while (true) { + if (compare_exchange_weak(expected, desired, success_order, + failure_order)) + return true; + if (expected_save != expected) return false; + expected = expected_save; } - - integral_type exchange(integral_type replacement, - memory_order order=memory_order_seq_cst) volatile - { - integral_type o=load(memory_order_relaxed); - do {} while(!compare_exchange_weak(o, replacement, order, memory_order_relaxed)); - return o; - } - - build_exchange() {} - explicit build_exchange(integral_type i) : Base(i) {} + } + + integral_type exchange(integral_type replacement, + memory_order order = memory_order_seq_cst) volatile { + integral_type o = load(memory_order_relaxed); + do { + } while ( + !compare_exchange_weak(o, replacement, order, memory_order_relaxed)); + return o; + } + + build_exchange() {} + explicit build_exchange(integral_type i) : Base(i) {} }; /* @@ -70,30 +69,31 @@ is constant +1/-1, and uses fetch_add_var otherwise the intention is to allow optimizing the incredibly common case of +1/-1 */ -template +template class build_const_fetch_add : public Base { -public: - typedef typename Base::integral_type integral_type; - - integral_type fetch_add( - integral_type c, - memory_order order=memory_order_seq_cst) volatile - { - if (__builtin_constant_p(c)) { - switch(c) { - case -1: return fetch_dec(order); - case 1: return fetch_inc(order); - } - } - return fetch_add_var(c, order); + public: + typedef typename Base::integral_type integral_type; + + integral_type fetch_add(integral_type c, + memory_order order = memory_order_seq_cst) volatile { + if (__builtin_constant_p(c)) { + switch (c) { + case -1: + return fetch_dec(order); + case 1: + return fetch_inc(order); + } } + return fetch_add_var(c, order); + } + + build_const_fetch_add() {} + explicit build_const_fetch_add(integral_type i) : Base(i) {} - build_const_fetch_add() {} - explicit build_const_fetch_add(integral_type i) : Base(i) {} -protected: - using Base::fetch_add_var; - using Base::fetch_inc; - using Base::fetch_dec; + protected: + using Base::fetch_add_var; + using Base::fetch_dec; + using Base::fetch_inc; }; /* @@ -105,24 +105,24 @@ integral_type desired, memory_order order) generates a -- not very efficient, but correct -- fetch_add operation */ -template +template class build_fetch_add : public Base { -public: - typedef typename Base::integral_type integral_type; - - using Base::compare_exchange_weak; - - integral_type fetch_add( - integral_type c, memory_order order=memory_order_seq_cst) volatile - { - integral_type o=Base::load(memory_order_relaxed), n; - do {n=o+c;} while(!compare_exchange_weak(o, n, - order, memory_order_relaxed)); - return o; - } - - build_fetch_add() {} - explicit build_fetch_add(integral_type i) : Base(i) {} + public: + typedef typename Base::integral_type integral_type; + + using Base::compare_exchange_weak; + + integral_type fetch_add(integral_type c, + memory_order order = memory_order_seq_cst) volatile { + integral_type o = Base::load(memory_order_relaxed), n; + do { + n = o + c; + } while (!compare_exchange_weak(o, n, order, memory_order_relaxed)); + return o; + } + + build_fetch_add() {} + explicit build_fetch_add(integral_type i) : Base(i) {} }; /* @@ -132,30 +132,28 @@ given a Base that implements: generates fetch_sub and post/pre- increment/decrement operators */ -template +template class build_arithmeticops : public Base { -public: - typedef typename Base::integral_type integral_type; + public: + typedef typename Base::integral_type integral_type; - using Base::fetch_add; + using Base::fetch_add; - integral_type fetch_sub( - integral_type c, - memory_order order=memory_order_seq_cst) volatile - { + integral_type fetch_sub(integral_type c, + memory_order order = memory_order_seq_cst) volatile { #if defined(BOOST_MSVC) #pragma warning(push) -#pragma warning(disable: 4146) - // unary minus operator applied to unsigned type, result still unsigned +#pragma warning(disable : 4146) + // unary minus operator applied to unsigned type, result still unsigned #endif - return fetch_add(-c, order); + return fetch_add(-c, order); #if defined(BOOST_MSVC) #pragma warning(pop) #endif - } + } - build_arithmeticops() {} - explicit build_arithmeticops(integral_type i_) : Base(i_) {} + build_arithmeticops() {} + explicit build_arithmeticops(integral_type i_) : Base(i_) {} }; /* @@ -168,41 +166,41 @@ integral_type desired, memory_order order) generates -- not very efficient, but correct -- fetch_and, fetch_or and fetch_xor operators */ -template +template class build_logicops : public Base { -public: - typedef typename Base::integral_type integral_type; - - using Base::compare_exchange_weak; - using Base::load; - - integral_type fetch_and(integral_type c, - memory_order order=memory_order_seq_cst) volatile - { - integral_type o=load(memory_order_relaxed), n; - do {n=o&c;} while(!compare_exchange_weak(o, n, order, - memory_order_relaxed)); - return o; - } - integral_type fetch_or(integral_type c, - memory_order order=memory_order_seq_cst) volatile - { - integral_type o=load(memory_order_relaxed), n; - do {n=o|c;} while(!compare_exchange_weak(o, n, order, - memory_order_relaxed)); - return o; - } - integral_type fetch_xor(integral_type c, - memory_order order=memory_order_seq_cst) volatile - { - integral_type o=load(memory_order_relaxed), n; - do {n=o^c;} while(!compare_exchange_weak(o, n, order, - memory_order_relaxed)); - return o; - } - - build_logicops() {} - build_logicops(integral_type i_) : Base(i_) {} + public: + typedef typename Base::integral_type integral_type; + + using Base::compare_exchange_weak; + using Base::load; + + integral_type fetch_and(integral_type c, + memory_order order = memory_order_seq_cst) volatile { + integral_type o = load(memory_order_relaxed), n; + do { + n = o & c; + } while (!compare_exchange_weak(o, n, order, memory_order_relaxed)); + return o; + } + integral_type fetch_or(integral_type c, + memory_order order = memory_order_seq_cst) volatile { + integral_type o = load(memory_order_relaxed), n; + do { + n = o | c; + } while (!compare_exchange_weak(o, n, order, memory_order_relaxed)); + return o; + } + integral_type fetch_xor(integral_type c, + memory_order order = memory_order_seq_cst) volatile { + integral_type o = load(memory_order_relaxed), n; + do { + n = o ^ c; + } while (!compare_exchange_weak(o, n, order, memory_order_relaxed)); + return o; + } + + build_logicops() {} + build_logicops(integral_type i_) : Base(i_) {} }; /* @@ -215,17 +213,18 @@ integral_type desired, memory_order order) generates the full set of atomic operations for integral types */ -template +template class build_atomic_from_minimal - : public build_logicops< build_arithmeticops< - build_fetch_add< build_exchange > > > { -public: - typedef build_logicops< build_arithmeticops< - build_fetch_add< build_exchange > > > super; - typedef typename super::integral_type integral_type; - - build_atomic_from_minimal(void) {} - build_atomic_from_minimal(typename super::integral_type i) : super(i) {} + : public build_logicops< + build_arithmeticops > > > { + public: + typedef build_logicops< + build_arithmeticops > > > + super; + typedef typename super::integral_type integral_type; + + build_atomic_from_minimal(void) {} + build_atomic_from_minimal(typename super::integral_type i) : super(i) {} }; /* @@ -244,15 +243,17 @@ integral_type desired, memory_order order) generates the full set of atomic operations for integral types */ -template +template class build_atomic_from_typical - : public build_logicops< build_arithmeticops< build_const_fetch_add > > { -public: - typedef build_logicops< build_arithmeticops< build_const_fetch_add > > super; - typedef typename super::integral_type integral_type; - - build_atomic_from_typical(void) {} - build_atomic_from_typical(typename super::integral_type i) : super(i) {} + : public build_logicops< + build_arithmeticops > > { + public: + typedef build_logicops > > + super; + typedef typename super::integral_type integral_type; + + build_atomic_from_typical(void) {} + build_atomic_from_typical(typename super::integral_type i) : super(i) {} }; /* @@ -269,15 +270,15 @@ integral_type desired, memory_order order) generates the full set of atomic operations for integral types */ -template +template class build_atomic_from_add - : public build_logicops< build_arithmeticops > { -public: - typedef build_logicops< build_arithmeticops > super; - typedef typename super::integral_type integral_type; + : public build_logicops > { + public: + typedef build_logicops > super; + typedef typename super::integral_type integral_type; - build_atomic_from_add(void) {} - build_atomic_from_add(typename super::integral_type i_) : super(i_) {} + build_atomic_from_add(void) {} + build_atomic_from_add(typename super::integral_type i_) : super(i_) {} }; /* @@ -293,18 +294,17 @@ integral_type desired, memory_order order) generates the full set of atomic operations for integral types */ -template +template class build_atomic_from_exchange - : public build_logicops< build_arithmeticops< build_fetch_add > > { -public: - typedef build_logicops< build_arithmeticops< build_fetch_add > > super; - typedef typename super::integral_type integral_type; + : public build_logicops > > { + public: + typedef build_logicops > > super; + typedef typename super::integral_type integral_type; - build_atomic_from_exchange(void) {} - build_atomic_from_exchange(typename super::integral_type i_) : super(i_) {} + build_atomic_from_exchange(void) {} + build_atomic_from_exchange(typename super::integral_type i_) : super(i_) {} }; - /* given a Base that implements: @@ -314,105 +314,97 @@ generates load, store and compare_exchange_weak for a smaller data type (e.g. an atomic "byte" embedded into a temporary and properly aligned atomic "int"). */ -template +template class build_base_from_larger_type { -public: - typedef Type integral_type; - - build_base_from_larger_type() {} - build_base_from_larger_type(integral_type t) {store(t, memory_order_relaxed);} - - integral_type load(memory_order order=memory_order_seq_cst) const volatile - { - larger_integral_type v=get_base().load(order); - return extract(v); - } - bool compare_exchange_weak(integral_type &expected, - integral_type desired, - memory_order success_order, - memory_order failure_order) volatile - { - larger_integral_type expected_; - larger_integral_type desired_; - - expected_=get_base().load(memory_order_relaxed); - expected_=insert(expected_, expected); - desired_=insert(expected_, desired); - bool success=get_base().compare_exchange_weak(expected_, desired_, - success_order, failure_order); - expected=extract(expected_); - return success; - } - void store(integral_type v, - memory_order order=memory_order_seq_cst) volatile - { - larger_integral_type expected, desired; - expected=get_base().load(memory_order_relaxed); - do { - desired=insert(expected, v); - } while(!get_base().compare_exchange_weak(expected, - desired, order, memory_order_relaxed)); - } - - bool is_lock_free(void) - { - return get_base().is_lock_free(); - } -private: - typedef typename Base::integral_type larger_integral_type; - - const Base &get_base(void) const volatile - { - intptr_t address=(intptr_t)this; //static_cast(this); - address&=~(sizeof(larger_integral_type)-1); - return *reinterpret_cast(address); - } - Base &get_base(void) volatile - { - intptr_t address=(intptr_t)this; //static_cast(this); - address&=~(sizeof(larger_integral_type)-1); - return *reinterpret_cast(address); - } - intptr_t get_offset(void) const volatile - { - intptr_t address=(intptr_t)this; //static_cast(this); - address&=(sizeof(larger_integral_type)-1); - return address; - } - - intptr_t get_shift(void) const volatile - { + public: + typedef Type integral_type; + + build_base_from_larger_type() {} + build_base_from_larger_type(integral_type t) { + store(t, memory_order_relaxed); + } + + integral_type load(memory_order order = memory_order_seq_cst) const volatile { + larger_integral_type v = get_base().load(order); + return extract(v); + } + bool compare_exchange_weak(integral_type &expected, integral_type desired, + memory_order success_order, + memory_order failure_order) volatile { + larger_integral_type expected_; + larger_integral_type desired_; + + expected_ = get_base().load(memory_order_relaxed); + expected_ = insert(expected_, expected); + desired_ = insert(expected_, desired); + bool success = get_base().compare_exchange_weak( + expected_, desired_, success_order, failure_order); + expected = extract(expected_); + return success; + } + void store(integral_type v, + memory_order order = memory_order_seq_cst) volatile { + larger_integral_type expected, desired; + expected = get_base().load(memory_order_relaxed); + do { + desired = insert(expected, v); + } while (!get_base().compare_exchange_weak(expected, desired, order, + memory_order_relaxed)); + } + + bool is_lock_free(void) { return get_base().is_lock_free(); } + + private: + typedef typename Base::integral_type larger_integral_type; + + const Base &get_base(void) const volatile { + intptr_t address = (intptr_t)this; // static_cast(this); + address &= ~(sizeof(larger_integral_type) - 1); + return *reinterpret_cast(address); + } + Base &get_base(void) volatile { + intptr_t address = (intptr_t)this; // static_cast(this); + address &= ~(sizeof(larger_integral_type) - 1); + return *reinterpret_cast(address); + } + intptr_t get_offset(void) const volatile { + intptr_t address = (intptr_t)this; // static_cast(this); + address &= (sizeof(larger_integral_type) - 1); + return address; + } + + intptr_t get_shift(void) const volatile { #if defined(BOOST_LITTLE_ENDIAN) - return get_offset()*8; -#elif defined(BOOST_BIG_ENDIAN) - return (sizeof(larger_integral_type)-sizeof(integral_type)-get_offset())*8; + return get_offset() * 8; +#elif defined(BOOST_BIG_ENDIAN) + return (sizeof(larger_integral_type) - sizeof(integral_type) - + get_offset()) * + 8; #else - #error "Unknown endian" +#error "Unknown endian" #endif - } + } - integral_type extract(larger_integral_type v) const volatile - { - return v>>get_shift(); - } + integral_type extract(larger_integral_type v) const volatile { + return v >> get_shift(); + } - larger_integral_type insert(larger_integral_type target, - integral_type source) const volatile - { - larger_integral_type tmp=source; - larger_integral_type mask=larger_integral_type(-1); + larger_integral_type insert(larger_integral_type target, + integral_type source) const volatile { + larger_integral_type tmp = source; + larger_integral_type mask = larger_integral_type(-1); - mask=~(mask<<(8*sizeof(integral_type))); + mask = ~(mask << (8 * sizeof(integral_type))); - mask=mask< +template class build_atomic_from_larger_type - : public build_atomic_from_minimal< build_base_from_larger_type > { -public: - typedef build_atomic_from_minimal< build_base_from_larger_type > super; - //typedef typename super::integral_type integral_type; - typedef Type integral_type; - - build_atomic_from_larger_type() {} - build_atomic_from_larger_type(integral_type v) : super(v) {} + : public build_atomic_from_minimal< + build_base_from_larger_type > { + public: + typedef build_atomic_from_minimal > + super; + // typedef typename super::integral_type integral_type; + typedef Type integral_type; + + build_atomic_from_larger_type() {} + build_atomic_from_larger_type(integral_type v) : super(v) {} }; -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/fallback.hpp b/external/atomic/boost/atomic/detail/fallback.hpp index ceca6e9d..fd848735 100644 --- a/external/atomic/boost/atomic/detail/fallback.hpp +++ b/external/atomic/boost/atomic/detail/fallback.hpp @@ -14,63 +14,56 @@ namespace boost { namespace detail { namespace atomic { -template +template class fallback_atomic { -public: - fallback_atomic(void) {} - explicit fallback_atomic(const T &t) {memcpy(&i, &t, sizeof(T));} + public: + fallback_atomic(void) {} + explicit fallback_atomic(const T &t) { memcpy(&i, &t, sizeof(T)); } - void store(const T &t, memory_order order=memory_order_seq_cst) volatile - { - detail::spinlock_pool<0>::scoped_lock guard(const_cast(&i)); - memcpy((void*)&i, &t, sizeof(T)); + void store(const T &t, memory_order order = memory_order_seq_cst) volatile { + detail::spinlock_pool<0>::scoped_lock guard(const_cast(&i)); + memcpy((void *)&i, &t, sizeof(T)); + } + T load(memory_order /*order*/ = memory_order_seq_cst) volatile const { + detail::spinlock_pool<0>::scoped_lock guard(const_cast(&i)); + T tmp; + memcpy(&tmp, const_cast(&i), sizeof(T)); + return tmp; + } + bool compare_exchange_strong(T &expected, T desired, + memory_order /*success_order*/, + memory_order /*failure_order*/) volatile { + detail::spinlock_pool<0>::scoped_lock guard(const_cast(&i)); + if (memcmp((void *)&i, &expected, sizeof(T)) == 0) { + memcpy((void *)&i, &desired, sizeof(T)); + return true; + } else { + memcpy(&expected, (void *)&i, sizeof(T)); + return false; } - T load(memory_order /*order*/=memory_order_seq_cst) volatile const - { - detail::spinlock_pool<0>::scoped_lock guard(const_cast(&i)); - T tmp; - memcpy(&tmp, const_cast(&i), sizeof(T)); - return tmp; - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order /*success_order*/, - memory_order /*failure_order*/) volatile - { - detail::spinlock_pool<0>::scoped_lock guard(const_cast(&i)); - if (memcmp((void*)&i, &expected, sizeof(T))==0) { - memcpy((void*)&i, &desired, sizeof(T)); - return true; - } else { - memcpy(&expected, (void*)&i, sizeof(T)); - return false; - } - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T replacement, memory_order /*order*/=memory_order_seq_cst) volatile - { - detail::spinlock_pool<0>::scoped_lock guard(const_cast(&i)); - T tmp; - memcpy(&tmp, (void*)&i, sizeof(T)); - memcpy((void*)&i, &replacement, sizeof(T)); - return tmp; - } - bool is_lock_free(void) const volatile {return false;} -protected: - T i; - typedef T integral_type; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T replacement, + memory_order /*order*/ = memory_order_seq_cst) volatile { + detail::spinlock_pool<0>::scoped_lock guard(const_cast(&i)); + T tmp; + memcpy(&tmp, (void *)&i, sizeof(T)); + memcpy((void *)&i, &replacement, sizeof(T)); + return tmp; + } + bool is_lock_free(void) const volatile { return false; } + + protected: + T i; + typedef T integral_type; }; -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/gcc-alpha.hpp b/external/atomic/boost/atomic/detail/gcc-alpha.hpp index d8538376..881c54db 100644 --- a/external/atomic/boost/atomic/detail/gcc-alpha.hpp +++ b/external/atomic/boost/atomic/detail/gcc-alpha.hpp @@ -44,315 +44,294 @@ namespace boost { namespace detail { namespace atomic { -static inline void fence_before(memory_order order) -{ - switch(order) { - case memory_order_consume: - case memory_order_release: - case memory_order_acq_rel: - case memory_order_seq_cst: - __asm__ __volatile__ ("mb" ::: "memory"); - default:; - } +static inline void fence_before(memory_order order) { + switch (order) { + case memory_order_consume: + case memory_order_release: + case memory_order_acq_rel: + case memory_order_seq_cst: + __asm__ __volatile__("mb" ::: "memory"); + default:; + } } -static inline void fence_after(memory_order order) -{ - switch(order) { - case memory_order_acquire: - case memory_order_acq_rel: - case memory_order_seq_cst: - __asm__ __volatile__ ("mb" ::: "memory"); - default:; - } +static inline void fence_after(memory_order order) { + switch (order) { + case memory_order_acquire: + case memory_order_acq_rel: + case memory_order_seq_cst: + __asm__ __volatile__("mb" ::: "memory"); + default:; + } } -template<> -inline void platform_atomic_thread_fence(memory_order order) -{ - switch(order) { - case memory_order_acquire: - case memory_order_consume: - case memory_order_release: - case memory_order_acq_rel: - case memory_order_seq_cst: - __asm__ __volatile__ ("mb" ::: "memory"); - default:; - } +template <> +inline void platform_atomic_thread_fence(memory_order order) { + switch (order) { + case memory_order_acquire: + case memory_order_consume: + case memory_order_release: + case memory_order_acq_rel: + case memory_order_seq_cst: + __asm__ __volatile__("mb" ::: "memory"); + default:; + } } -template +template class atomic_alpha_32 { -public: - typedef T integral_type; - explicit atomic_alpha_32(T v) : i(v) {} - atomic_alpha_32() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - fence_before(order); - *reinterpret_cast(&i)=(int)v; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - int current, success; - __asm__ __volatile__( - "1: ldl_l %2, %4\n" - "cmpeq %2, %0, %3\n" - "mov %2, %0\n" - "beq %3, 3f\n" - "stl_c %1, %4\n" - "2:\n" - - ".subsection 2\n" - "3: mov %3, %1\n" - "br 2b\n" - ".previous\n" - - : "+&r" (expected), "+&r" (desired), "=&r"(current), "=&r"(success) - : "m" (i) - : - ); - if (desired) fence_after(success_order); - else fence_after(failure_order); - return desired; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - inline T fetch_add_var(T c, memory_order order) volatile - { - fence_before(order); - T original, modified; - __asm__ __volatile__( - "1: ldl_l %0, %2\n" - "addl %0, %3, %1\n" - "stl_c %1, %2\n" - "beq %1, 2f\n" - - ".subsection 2\n" - "2: br 1b\n" - ".previous\n" - - : "=&r" (original), "=&r" (modified) - : "m" (i), "r" (c) - : - ); - fence_after(order); - return original; - } - inline T fetch_inc(memory_order order) volatile - { - fence_before(order); - int original, modified; - __asm__ __volatile__( - "1: ldl_l %0, %2\n" - "addl %0, 1, %1\n" - "stl_c %1, %2\n" - "beq %1, 2f\n" - - ".subsection 2\n" - "2: br 1b\n" - ".previous\n" - - : "=&r" (original), "=&r" (modified) - : "m" (i) - : - ); - fence_after(order); - return original; - } - inline T fetch_dec(memory_order order) volatile - { - fence_before(order); - int original, modified; - __asm__ __volatile__( - "1: ldl_l %0, %2\n" - "subl %0, 1, %1\n" - "stl_c %1, %2\n" - "beq %1, 2f\n" - - ".subsection 2\n" - "2: br 1b\n" - ".previous\n" - - : "=&r" (original), "=&r" (modified) - : "m" (i) - : - ); - fence_after(order); - return original; - } -private: - T i; + public: + typedef T integral_type; + explicit atomic_alpha_32(T v) : i(v) {} + atomic_alpha_32() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + fence_before(order); + *reinterpret_cast(&i) = (int)v; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + int current, success; + __asm__ __volatile__( + "1: ldl_l %2, %4\n" + "cmpeq %2, %0, %3\n" + "mov %2, %0\n" + "beq %3, 3f\n" + "stl_c %1, %4\n" + "2:\n" + + ".subsection 2\n" + "3: mov %3, %1\n" + "br 2b\n" + ".previous\n" + + : "+&r"(expected), "+&r"(desired), "=&r"(current), "=&r"(success) + : "m"(i) + :); + if (desired) + fence_after(success_order); + else + fence_after(failure_order); + return desired; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + inline T fetch_add_var(T c, memory_order order) volatile { + fence_before(order); + T original, modified; + __asm__ __volatile__( + "1: ldl_l %0, %2\n" + "addl %0, %3, %1\n" + "stl_c %1, %2\n" + "beq %1, 2f\n" + + ".subsection 2\n" + "2: br 1b\n" + ".previous\n" + + : "=&r"(original), "=&r"(modified) + : "m"(i), "r"(c) + :); + fence_after(order); + return original; + } + inline T fetch_inc(memory_order order) volatile { + fence_before(order); + int original, modified; + __asm__ __volatile__( + "1: ldl_l %0, %2\n" + "addl %0, 1, %1\n" + "stl_c %1, %2\n" + "beq %1, 2f\n" + + ".subsection 2\n" + "2: br 1b\n" + ".previous\n" + + : "=&r"(original), "=&r"(modified) + : "m"(i) + :); + fence_after(order); + return original; + } + inline T fetch_dec(memory_order order) volatile { + fence_before(order); + int original, modified; + __asm__ __volatile__( + "1: ldl_l %0, %2\n" + "subl %0, 1, %1\n" + "stl_c %1, %2\n" + "beq %1, 2f\n" + + ".subsection 2\n" + "2: br 1b\n" + ".previous\n" + + : "=&r"(original), "=&r"(modified) + : "m"(i) + :); + fence_after(order); + return original; + } + + private: + T i; }; -template +template class atomic_alpha_64 { -public: - typedef T integral_type; - explicit atomic_alpha_64(T v) : i(v) {} - atomic_alpha_64() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - fence_before(order); - *reinterpret_cast(&i)=v; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - int current, success; - __asm__ __volatile__( - "1: ldq_l %2, %4\n" - "cmpeq %2, %0, %3\n" - "mov %2, %0\n" - "beq %3, 3f\n" - "stq_c %1, %4\n" - "2:\n" - - ".subsection 2\n" - "3: mov %3, %1\n" - "br 2b\n" - ".previous\n" - - : "+&r" (expected), "+&r" (desired), "=&r"(current), "=&r"(success) - : "m" (i) - : - ); - if (desired) fence_after(success_order); - else fence_after(failure_order); - return desired; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - inline T fetch_add_var(T c, memory_order order) volatile - { - fence_before(order); - T original, modified; - __asm__ __volatile__( - "1: ldq_l %0, %2\n" - "addq %0, %3, %1\n" - "stq_c %1, %2\n" - "beq %1, 2f\n" - - ".subsection 2\n" - "2: br 1b\n" - ".previous\n" - - : "=&r" (original), "=&r" (modified) - : "m" (i), "r" (c) - : - ); - fence_after(order); - return original; - } - inline T fetch_inc(memory_order order) volatile - { - fence_before(order); - T original, modified; - __asm__ __volatile__( - "1: ldq_l %0, %2\n" - "addq %0, 1, %1\n" - "stq_c %1, %2\n" - "beq %1, 2f\n" - - ".subsection 2\n" - "2: br 1b\n" - ".previous\n" - - : "=&r" (original), "=&r" (modified) - : "m" (i) - : - ); - fence_after(order); - return original; - } - inline T fetch_dec(memory_order order) volatile - { - fence_before(order); - T original, modified; - __asm__ __volatile__( - "1: ldq_l %0, %2\n" - "subq %0, 1, %1\n" - "stq_c %1, %2\n" - "beq %1, 2f\n" - - ".subsection 2\n" - "2: br 1b\n" - ".previous\n" - - : "=&r" (original), "=&r" (modified) - : "m" (i) - : - ); - fence_after(order); - return original; - } -private: - T i; + public: + typedef T integral_type; + explicit atomic_alpha_64(T v) : i(v) {} + atomic_alpha_64() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + fence_before(order); + *reinterpret_cast(&i) = v; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + int current, success; + __asm__ __volatile__( + "1: ldq_l %2, %4\n" + "cmpeq %2, %0, %3\n" + "mov %2, %0\n" + "beq %3, 3f\n" + "stq_c %1, %4\n" + "2:\n" + + ".subsection 2\n" + "3: mov %3, %1\n" + "br 2b\n" + ".previous\n" + + : "+&r"(expected), "+&r"(desired), "=&r"(current), "=&r"(success) + : "m"(i) + :); + if (desired) + fence_after(success_order); + else + fence_after(failure_order); + return desired; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + inline T fetch_add_var(T c, memory_order order) volatile { + fence_before(order); + T original, modified; + __asm__ __volatile__( + "1: ldq_l %0, %2\n" + "addq %0, %3, %1\n" + "stq_c %1, %2\n" + "beq %1, 2f\n" + + ".subsection 2\n" + "2: br 1b\n" + ".previous\n" + + : "=&r"(original), "=&r"(modified) + : "m"(i), "r"(c) + :); + fence_after(order); + return original; + } + inline T fetch_inc(memory_order order) volatile { + fence_before(order); + T original, modified; + __asm__ __volatile__( + "1: ldq_l %0, %2\n" + "addq %0, 1, %1\n" + "stq_c %1, %2\n" + "beq %1, 2f\n" + + ".subsection 2\n" + "2: br 1b\n" + ".previous\n" + + : "=&r"(original), "=&r"(modified) + : "m"(i) + :); + fence_after(order); + return original; + } + inline T fetch_dec(memory_order order) volatile { + fence_before(order); + T original, modified; + __asm__ __volatile__( + "1: ldq_l %0, %2\n" + "subq %0, 1, %1\n" + "stq_c %1, %2\n" + "beq %1, 2f\n" + + ".subsection 2\n" + "2: br 1b\n" + ".previous\n" + + : "=&r"(original), "=&r"(modified) + : "m"(i) + :); + fence_after(order); + return original; + } + + private: + T i; }; -template +template class platform_atomic_integral : public build_atomic_from_typical > > { -public: - typedef build_atomic_from_typical > > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + public: + typedef build_atomic_from_typical > > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_typical > > { -public: - typedef build_atomic_from_typical > > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + public: + typedef build_atomic_from_typical > > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/gcc-armv6plus.hpp b/external/atomic/boost/atomic/detail/gcc-armv6plus.hpp index 68dbf3a7..69e767c2 100644 --- a/external/atomic/boost/atomic/detail/gcc-armv6plus.hpp +++ b/external/atomic/boost/atomic/detail/gcc-armv6plus.hpp @@ -9,7 +9,6 @@ // Copyright (c) 2009 Phil Endecott // ARM Code by Phil Endecott, based on other architectures. - #include #include #include @@ -43,36 +42,49 @@ namespace boost { namespace detail { namespace atomic { - -// "Thumb 1" is a subset of the ARM instruction set that uses a 16-bit encoding. It -// doesn't include all instructions and in particular it doesn't include the co-processor -// instruction used for the memory barrier or the load-locked/store-conditional -// instructions. So, if we're compiling in "Thumb 1" mode, we need to wrap all of our -// asm blocks with code to temporarily change to ARM mode. +// "Thumb 1" is a subset of the ARM instruction set that uses a 16-bit encoding. +// It doesn't include all instructions and in particular it doesn't include the +// co-processor instruction used for the memory barrier or the +// load-locked/store-conditional instructions. So, if we're compiling in "Thumb +// 1" mode, we need to wrap all of our asm blocks with code to temporarily +// change to ARM mode. // // You can only change between ARM and Thumb modes when // branching using the bx instruction. -// bx takes an address specified in a register. The least significant bit of the address -// indicates the mode, so 1 is added to indicate that the destination code is Thumb. -// A temporary register is needed for the address and is passed as an argument to these -// macros. It must be one of the "low" registers accessible to Thumb code, specified -// usng the "l" attribute in the asm statement. +// bx takes an address specified in a register. The least significant bit of +// the address indicates the mode, so 1 is added to indicate that the +// destination code is Thumb. A temporary register is needed for the address and +// is passed as an argument to these macros. It must be one of the "low" +// registers accessible to Thumb code, specified usng the "l" attribute in the +// asm statement. // -// Architecture v7 introduces "Thumb 2", which does include (almost?) all of the ARM -// instruction set. So in v7 we don't need to change to ARM mode; +// Architecture v7 introduces "Thumb 2", which does include (almost?) all of the +// ARM instruction set. So in v7 we don't need to change to ARM mode; // we can write "universal -// assembler" which will assemble to Thumb 2 or ARM code as appropriate. The only thing -// we need to do to make this "universal" +// assembler" which will assemble to Thumb 2 or ARM code as appropriate. The +// only thing we need to do to make this "universal" // assembler mode work is to insert "IT" instructions -// to annotate the conditional instructions. These are ignored in other modes (e.g. v6), -// so they can always be present. +// to annotate the conditional instructions. These are ignored in other modes +// (e.g. v6), so they can always be present. #if defined(__thumb__) && !defined(__ARM_ARCH_7A__) // FIXME also other v7 variants. #define BOOST_ATOMIC_ARM_ASM_START(TMPREG) \ - "adr " #TMPREG ", 1f\n" "bx " #TMPREG "\n" ".arm\n" ".align 4\n" "1: " + "adr " #TMPREG \ + ", 1f\n" \ + "bx " #TMPREG \ + "\n" \ + ".arm\n" \ + ".align 4\n" \ + "1: " #define BOOST_ATOMIC_ARM_ASM_END(TMPREG) \ - "adr " #TMPREG ", 1f + 1\n" "bx " #TMPREG "\n" ".thumb\n" ".align 2\n" "1: " + "adr " #TMPREG \ + ", 1f + 1\n" \ + "bx " #TMPREG \ + "\n" \ + ".thumb\n" \ + ".align 2\n" \ + "1: " #else // The tmpreg is wasted in this case, which is non-optimal. @@ -80,7 +92,6 @@ namespace atomic { #define BOOST_ATOMIC_ARM_ASM_END(TMPREG) #endif - #if defined(__ARM_ARCH_7A__) // FIXME ditto. #define BOOST_ATOMIC_ARM_DMB "dmb\n" @@ -92,75 +103,61 @@ namespace atomic { // this exists in v6 as another co-processor // instruction like the above. - -static inline void fence_before(memory_order order) -{ - // FIXME I don't understand enough about barriers to know what this should do. - switch(order) { - case memory_order_release: - case memory_order_acq_rel: - case memory_order_seq_cst: - int brtmp; - __asm__ __volatile__ ( - BOOST_ATOMIC_ARM_ASM_START(%0) - BOOST_ATOMIC_ARM_DMB - BOOST_ATOMIC_ARM_ASM_END(%0) - : "=&l" (brtmp) :: "memory" - ); - default:; - } +static inline void fence_before(memory_order order) { + // FIXME I don't understand enough about barriers to know what this should do. + switch (order) { + case memory_order_release: + case memory_order_acq_rel: + case memory_order_seq_cst: + int brtmp; + __asm__ __volatile__( + BOOST_ATOMIC_ARM_ASM_START(% 0) + BOOST_ATOMIC_ARM_DMB BOOST_ATOMIC_ARM_ASM_END(% 0) + : "=&l"(brtmp)::"memory"); + default:; + } } -static inline void fence_after(memory_order order) -{ - // FIXME I don't understand enough about barriers to know what this should do. - switch(order) { - case memory_order_acquire: - case memory_order_acq_rel: - case memory_order_seq_cst: - int brtmp; - __asm__ __volatile__ ( - BOOST_ATOMIC_ARM_ASM_START(%0) - BOOST_ATOMIC_ARM_DMB - BOOST_ATOMIC_ARM_ASM_END(%0) - : "=&l" (brtmp) :: "memory" - ); - case memory_order_consume: - __asm__ __volatile__ ("" ::: "memory"); - default:; - } +static inline void fence_after(memory_order order) { + // FIXME I don't understand enough about barriers to know what this should do. + switch (order) { + case memory_order_acquire: + case memory_order_acq_rel: + case memory_order_seq_cst: + int brtmp; + __asm__ __volatile__( + BOOST_ATOMIC_ARM_ASM_START(% 0) + BOOST_ATOMIC_ARM_DMB BOOST_ATOMIC_ARM_ASM_END(% 0) + : "=&l"(brtmp)::"memory"); + case memory_order_consume: + __asm__ __volatile__("" ::: "memory"); + default:; + } } #undef BOOST_ATOMIC_ARM_DMB - -template +template class atomic_arm_4 { -public: - typedef T integral_type; - explicit atomic_arm_4(T v) : i(v) {} - atomic_arm_4() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=const_cast(i); - fence_after(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - fence_before(order); - const_cast(i)=v; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - int success; - int tmp; - __asm__ __volatile__( + public: + typedef T integral_type; + explicit atomic_arm_4(T v) : i(v) {} + atomic_arm_4() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = const_cast(i); + fence_after(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + fence_before(order); + const_cast(i) = v; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + int success; + int tmp; + __asm__ __volatile__( BOOST_ATOMIC_ARM_ASM_START(%2) "mov %1, #0\n" // success = 0 "ldrex %0, [%3]\n" // expected' = *(&i) @@ -178,19 +175,21 @@ class atomic_arm_4 { "r" ((int)desired) // %5 : "cc" ); - if (success) fence_after(success_order); - else fence_after(failure_order); - return success; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - inline T fetch_add_var(T c, memory_order order) volatile - { - fence_before(order); - T original, tmp; - int tmp2; - __asm__ __volatile__( + if (success) + fence_after(success_order); + else + fence_after(failure_order); + return success; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + inline T fetch_add_var(T c, memory_order order) volatile { + fence_before(order); + T original, tmp; + int tmp2; + __asm__ __volatile__( BOOST_ATOMIC_ARM_ASM_START(%2) "1: ldrex %0, [%3]\n" // original = *(&i) "add %1, %0, %4\n" // tmp = original + c @@ -206,15 +205,14 @@ class atomic_arm_4 { "r" (c) // %4 : "cc" ); - fence_after(order); - return original; - } - inline T fetch_inc(memory_order order) volatile - { - fence_before(order); - T original, tmp; - int tmp2; - __asm__ __volatile__( + fence_after(order); + return original; + } + inline T fetch_inc(memory_order order) volatile { + fence_before(order); + T original, tmp; + int tmp2; + __asm__ __volatile__( BOOST_ATOMIC_ARM_ASM_START(%2) "1: ldrex %0, [%3]\n" // original = *(&i) "add %1, %0, #1\n" // tmp = original + 1 @@ -229,15 +227,14 @@ class atomic_arm_4 { : "r" (&i) // %3 : "cc" ); - fence_after(order); - return original; - } - inline T fetch_dec(memory_order order) volatile - { - fence_before(order); - T original, tmp; - int tmp2; - __asm__ __volatile__( + fence_after(order); + return original; + } + inline T fetch_dec(memory_order order) volatile { + fence_before(order); + T original, tmp; + int tmp2; + __asm__ __volatile__( BOOST_ATOMIC_ARM_ASM_START(%2) "1: ldrex %0, [%3]\n" // original = *(&i) "sub %1, %0, #1\n" // tmp = original - 1 @@ -252,57 +249,53 @@ class atomic_arm_4 { : "r" (&i) // %3 : "cc" ); - fence_after(order); - return original; - } -private: - T i; -}; + fence_after(order); + return original; + } + private: + T i; +}; // #ifdef _ARM_ARCH_7 // FIXME TODO can add native byte and halfword version here - -template +template class platform_atomic_integral : public build_atomic_from_typical > > { -public: - typedef build_atomic_from_typical > > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + public: + typedef build_atomic_from_typical > > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; - - typedef build_exchange > platform_atomic_address; -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #undef BOOST_ATOMIC_ARM_ASM_START #undef BOOST_ATOMIC_ARM_ASM_END - #endif diff --git a/external/atomic/boost/atomic/detail/gcc-ppc.hpp b/external/atomic/boost/atomic/detail/gcc-ppc.hpp index 712527c6..e211c934 100644 --- a/external/atomic/boost/atomic/detail/gcc-ppc.hpp +++ b/external/atomic/boost/atomic/detail/gcc-ppc.hpp @@ -22,19 +22,18 @@ namespace boost { namespace detail { namespace atomic { -static inline void fence_before(memory_order order) -{ - switch(order) { - case memory_order_release: - case memory_order_acq_rel: +static inline void fence_before(memory_order order) { + switch (order) { + case memory_order_release: + case memory_order_acq_rel: #if defined(__powerpc64__) - __asm__ __volatile__ ("lwsync" ::: "memory"); - break; + __asm__ __volatile__("lwsync" ::: "memory"); + break; #endif - case memory_order_seq_cst: - __asm__ __volatile__ ("sync" ::: "memory"); - default:; - } + case memory_order_seq_cst: + __asm__ __volatile__("sync" ::: "memory"); + default:; + } } /* Note on the barrier instructions used by fence_after and @@ -59,297 +58,282 @@ for this decision: */ -static inline void fence_after(memory_order order) -{ - switch(order) { - case memory_order_acquire: - case memory_order_acq_rel: - case memory_order_seq_cst: - __asm__ __volatile__ ("isync"); - case memory_order_consume: - __asm__ __volatile__ ("" ::: "memory"); - default:; - } +static inline void fence_after(memory_order order) { + switch (order) { + case memory_order_acquire: + case memory_order_acq_rel: + case memory_order_seq_cst: + __asm__ __volatile__("isync"); + case memory_order_consume: + __asm__ __volatile__("" ::: "memory"); + default:; + } } -template<> -inline void platform_atomic_thread_fence(memory_order order) -{ - switch(order) { - case memory_order_acquire: - __asm__ __volatile__ ("isync" ::: "memory"); - break; - case memory_order_release: - case memory_order_acq_rel: +template <> +inline void platform_atomic_thread_fence(memory_order order) { + switch (order) { + case memory_order_acquire: + __asm__ __volatile__("isync" ::: "memory"); + break; + case memory_order_release: + case memory_order_acq_rel: #if defined(__powerpc64__) - __asm__ __volatile__ ("lwsync" ::: "memory"); - break; + __asm__ __volatile__("lwsync" ::: "memory"); + break; #endif - case memory_order_seq_cst: - __asm__ __volatile__ ("sync" ::: "memory"); - default:; - } + case memory_order_seq_cst: + __asm__ __volatile__("sync" ::: "memory"); + default:; + } } - /* note: the __asm__ constraint "b" instructs gcc to use any register except r0; this is required because r0 is not allowed in some places. Since I am sometimes unsure if it is allowed or not just play it safe and avoid r0 entirely -- ppc isn't exactly register-starved, so this really should not matter :) */ -template +template class atomic_ppc_32 { -public: - typedef T integral_type; - explicit atomic_ppc_32(T v) : i(v) {} - atomic_ppc_32() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - __asm__ __volatile__ ( - "cmpw %0, %0\n" - "bne- 1f\n" - "1f:\n" - : "+b"(v)); - fence_after(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - fence_before(order); - *reinterpret_cast(&i)=v; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - int success; - __asm__ __volatile__( - "lwarx %0,0,%2\n" - "cmpw %0, %3\n" - "bne- 2f\n" - "stwcx. %4,0,%2\n" - "bne- 2f\n" - "addi %1,0,1\n" - "1:" - - ".subsection 2\n" - "2: addi %1,0,0\n" - "b 1b\n" - ".previous\n" - : "=&b" (expected), "=&b" (success) - : "b" (&i), "b" (expected), "b" ((int)desired) - ); - if (success) fence_after(success_order); - else fence_after(failure_order); - return success; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - inline T fetch_add_var(T c, memory_order order) volatile - { - fence_before(order); - T original, tmp; - __asm__ __volatile__( - "1: lwarx %0,0,%2\n" - "add %1,%0,%3\n" - "stwcx. %1,0,%2\n" - "bne- 1b\n" - : "=&b" (original), "=&b" (tmp) - : "b" (&i), "b" (c) - : "cc"); - fence_after(order); - return original; - } - inline T fetch_inc(memory_order order) volatile - { - fence_before(order); - T original, tmp; - __asm__ __volatile__( - "1: lwarx %0,0,%2\n" - "addi %1,%0,1\n" - "stwcx. %1,0,%2\n" - "bne- 1b\n" - : "=&b" (original), "=&b" (tmp) - : "b" (&i) - : "cc"); - fence_after(order); - return original; - } - inline T fetch_dec(memory_order order) volatile - { - fence_before(order); - T original, tmp; - __asm__ __volatile__( - "1: lwarx %0,0,%2\n" - "addi %1,%0,-1\n" - "stwcx. %1,0,%2\n" - "bne- 1b\n" - : "=&b" (original), "=&b" (tmp) - : "b" (&i) - : "cc"); - fence_after(order); - return original; - } -private: - T i; + public: + typedef T integral_type; + explicit atomic_ppc_32(T v) : i(v) {} + atomic_ppc_32() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + __asm__ __volatile__( + "cmpw %0, %0\n" + "bne- 1f\n" + "1f:\n" + : "+b"(v)); + fence_after(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + fence_before(order); + *reinterpret_cast(&i) = v; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + int success; + __asm__ __volatile__( + "lwarx %0,0,%2\n" + "cmpw %0, %3\n" + "bne- 2f\n" + "stwcx. %4,0,%2\n" + "bne- 2f\n" + "addi %1,0,1\n" + "1:" + + ".subsection 2\n" + "2: addi %1,0,0\n" + "b 1b\n" + ".previous\n" + : "=&b"(expected), "=&b"(success) + : "b"(&i), "b"(expected), "b"((int)desired)); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + return success; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + inline T fetch_add_var(T c, memory_order order) volatile { + fence_before(order); + T original, tmp; + __asm__ __volatile__( + "1: lwarx %0,0,%2\n" + "add %1,%0,%3\n" + "stwcx. %1,0,%2\n" + "bne- 1b\n" + : "=&b"(original), "=&b"(tmp) + : "b"(&i), "b"(c) + : "cc"); + fence_after(order); + return original; + } + inline T fetch_inc(memory_order order) volatile { + fence_before(order); + T original, tmp; + __asm__ __volatile__( + "1: lwarx %0,0,%2\n" + "addi %1,%0,1\n" + "stwcx. %1,0,%2\n" + "bne- 1b\n" + : "=&b"(original), "=&b"(tmp) + : "b"(&i) + : "cc"); + fence_after(order); + return original; + } + inline T fetch_dec(memory_order order) volatile { + fence_before(order); + T original, tmp; + __asm__ __volatile__( + "1: lwarx %0,0,%2\n" + "addi %1,%0,-1\n" + "stwcx. %1,0,%2\n" + "bne- 1b\n" + : "=&b"(original), "=&b"(tmp) + : "b"(&i) + : "cc"); + fence_after(order); + return original; + } + + private: + T i; }; #if defined(__powerpc64__) #warning Untested code -- please inform me if it works -template +template class atomic_ppc_64 { -public: - typedef T integral_type; - explicit atomic_ppc_64(T v) : i(v) {} - atomic_ppc_64() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - __asm__ __volatile__ ( - "cmpw %0, %0\n" - "bne- 1f\n" - "1f:\n" - : "+b"(v)); - fence_after(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - fence_before(order); - *reinterpret_cast(&i)=v; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - int success; - __asm__ __volatile__( - "ldarx %0,0,%2\n" - "cmpw %0, %3\n" - "bne- 2f\n" - "stdcx. %4,0,%2\n" - "bne- 2f\n" - "addi %1,0,1\n" - "1:" - - ".subsection 2\n" - "2: addi %1,0,0\n" - "b 1b\n" - ".previous\n" - : "=&b" (expected), "=&b" (success) - : "b" (&i), "b" (expected), "b" ((int)desired) - ); - if (success) fence_after(success_order); - else fence_after(failure_order); - fence_after(order); - return success; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - inline T fetch_add_var(T c, memory_order order) volatile - { - fence_before(order); - T original, tmp; - __asm__ __volatile__( - "1: ldarx %0,0,%2\n" - "add %1,%0,%3\n" - "stdcx. %1,0,%2\n" - "bne- 1b\n" - : "=&b" (original), "=&b" (tmp) - : "b" (&i), "b" (c) - : "cc"); - fence_after(order); - return original; - } - inline T fetch_inc(memory_order order) volatile - { - fence_before(order); - T original, tmp; - __asm__ __volatile__( - "1: ldarx %0,0,%2\n" - "addi %1,%0,1\n" - "stdcx. %1,0,%2\n" - "bne- 1b\n" - : "=&b" (original), "=&b" (tmp) - : "b" (&i) - : "cc"); - fence_after(order); - return original; - } - inline T fetch_dec(memory_order order) volatile - { - fence_before(order); - T original, tmp; - __asm__ __volatile__( - "1: ldarx %0,0,%2\n" - "addi %1,%0,-1\n" - "stdcx. %1,0,%2\n" - "bne- 1b\n" - : "=&b" (original), "=&b" (tmp) - : "b" (&i) - : "cc"); - fence_after(order); - return original; - } -private: - T i; + public: + typedef T integral_type; + explicit atomic_ppc_64(T v) : i(v) {} + atomic_ppc_64() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + __asm__ __volatile__( + "cmpw %0, %0\n" + "bne- 1f\n" + "1f:\n" + : "+b"(v)); + fence_after(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + fence_before(order); + *reinterpret_cast(&i) = v; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + int success; + __asm__ __volatile__( + "ldarx %0,0,%2\n" + "cmpw %0, %3\n" + "bne- 2f\n" + "stdcx. %4,0,%2\n" + "bne- 2f\n" + "addi %1,0,1\n" + "1:" + + ".subsection 2\n" + "2: addi %1,0,0\n" + "b 1b\n" + ".previous\n" + : "=&b"(expected), "=&b"(success) + : "b"(&i), "b"(expected), "b"((int)desired)); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + fence_after(order); + return success; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + inline T fetch_add_var(T c, memory_order order) volatile { + fence_before(order); + T original, tmp; + __asm__ __volatile__( + "1: ldarx %0,0,%2\n" + "add %1,%0,%3\n" + "stdcx. %1,0,%2\n" + "bne- 1b\n" + : "=&b"(original), "=&b"(tmp) + : "b"(&i), "b"(c) + : "cc"); + fence_after(order); + return original; + } + inline T fetch_inc(memory_order order) volatile { + fence_before(order); + T original, tmp; + __asm__ __volatile__( + "1: ldarx %0,0,%2\n" + "addi %1,%0,1\n" + "stdcx. %1,0,%2\n" + "bne- 1b\n" + : "=&b"(original), "=&b"(tmp) + : "b"(&i) + : "cc"); + fence_after(order); + return original; + } + inline T fetch_dec(memory_order order) volatile { + fence_before(order); + T original, tmp; + __asm__ __volatile__( + "1: ldarx %0,0,%2\n" + "addi %1,%0,-1\n" + "stdcx. %1,0,%2\n" + "bne- 1b\n" + : "=&b"(original), "=&b"(tmp) + : "b"(&i) + : "cc"); + fence_after(order); + return original; + } + + private: + T i; }; #endif -template +template class platform_atomic_integral : public build_atomic_from_typical > > { -public: - typedef build_atomic_from_typical > > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + public: + typedef build_atomic_from_typical > > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; #if defined(__powerpc64__) -template +template class platform_atomic_integral : public build_atomic_from_typical > > { -public: - typedef build_atomic_from_typical > > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + public: + typedef build_atomic_from_typical > > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; #endif -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/gcc-x86.hpp b/external/atomic/boost/atomic/detail/gcc-x86.hpp index 02e88c86..f8238ff8 100644 --- a/external/atomic/boost/atomic/detail/gcc-x86.hpp +++ b/external/atomic/boost/atomic/detail/gcc-x86.hpp @@ -19,533 +19,522 @@ namespace boost { namespace detail { namespace atomic { -static inline void fence_before(memory_order order) -{ - switch(order) { - case memory_order_consume: - case memory_order_release: - case memory_order_acq_rel: - case memory_order_seq_cst: - __asm__ __volatile__ ("" ::: "memory"); - default:; - } +static inline void fence_before(memory_order order) { + switch (order) { + case memory_order_consume: + case memory_order_release: + case memory_order_acq_rel: + case memory_order_seq_cst: + __asm__ __volatile__("" ::: "memory"); + default:; + } } -static inline void fence_after(memory_order order) -{ - switch(order) { - case memory_order_acquire: - case memory_order_acq_rel: - case memory_order_seq_cst: - __asm__ __volatile__ ("" ::: "memory"); - default:; - } +static inline void fence_after(memory_order order) { + switch (order) { + case memory_order_acquire: + case memory_order_acq_rel: + case memory_order_seq_cst: + __asm__ __volatile__("" ::: "memory"); + default:; + } } -static inline void full_fence(void) -{ +static inline void full_fence(void) { #if __BOOST_AMD_64 - __asm__ __volatile__("mfence" ::: "memory"); + __asm__ __volatile__("mfence" ::: "memory"); #else - /* could use mfence iff i686, but it does not appear to matter much */ - __asm__ __volatile__("lock; addl $0, (%%esp)" ::: "memory"); + /* could use mfence iff i686, but it does not appear to matter much */ + __asm__ __volatile__("lock; addl $0, (%%esp)" ::: "memory"); #endif } -static inline void fence_after_load(memory_order order) -{ - switch(order) { - case memory_order_seq_cst: - full_fence(); - case memory_order_acquire: - case memory_order_acq_rel: - __asm__ __volatile__ ("" ::: "memory"); - default:; - } +static inline void fence_after_load(memory_order order) { + switch (order) { + case memory_order_seq_cst: + full_fence(); + case memory_order_acquire: + case memory_order_acq_rel: + __asm__ __volatile__("" ::: "memory"); + default:; + } } -template<> -inline void platform_atomic_thread_fence(memory_order order) -{ - switch(order) { - case memory_order_seq_cst: - full_fence(); - case memory_order_acquire: - case memory_order_consume: - case memory_order_acq_rel: - case memory_order_release: - __asm__ __volatile__ ("" ::: "memory"); - default:; - } +template <> +inline void platform_atomic_thread_fence(memory_order order) { + switch (order) { + case memory_order_seq_cst: + full_fence(); + case memory_order_acquire: + case memory_order_consume: + case memory_order_acq_rel: + case memory_order_release: + __asm__ __volatile__("" ::: "memory"); + default:; + } } -template +template class atomic_x86_8 { -public: - explicit atomic_x86_8(T v) : i(v) {} - atomic_x86_8() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after_load(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - if (order!=memory_order_seq_cst) { - fence_before(order); - *reinterpret_cast(&i)=v; - } else { - exchange(v); - } - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - T prev=expected; - __asm__ __volatile__("lock; cmpxchgb %1, %2\n" : "=a" (prev) - : "q" (desired), "m" (i), "a" (expected) : "memory"); - bool success=(prev==expected); - if (success) fence_after(success_order); - else fence_after(failure_order); - expected=prev; - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order /*order*/=memory_order_seq_cst) volatile - { - __asm__ __volatile__("xchgb %0, %1\n" : "=q" (r) : "m"(i), "0" (r) : "memory"); - return r; - } - T fetch_add(T c, memory_order /*order*/=memory_order_seq_cst) volatile - { - __asm__ __volatile__("lock; xaddb %0, %1" : "+q" (c), "+m" (i) :: "memory"); - return c; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - typedef T integral_type; -private: - T i; + public: + explicit atomic_x86_8(T v) : i(v) {} + atomic_x86_8() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after_load(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + if (order != memory_order_seq_cst) { + fence_before(order); + *reinterpret_cast(&i) = v; + } else { + exchange(v); + } + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + T prev = expected; + __asm__ __volatile__("lock; cmpxchgb %1, %2\n" + : "=a"(prev) + : "q"(desired), "m"(i), "a"(expected) + : "memory"); + bool success = (prev == expected); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = prev; + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order /*order*/ = memory_order_seq_cst) volatile { + __asm__ __volatile__("xchgb %0, %1\n" + : "=q"(r) + : "m"(i), "0"(r) + : "memory"); + return r; + } + T fetch_add(T c, memory_order /*order*/ = memory_order_seq_cst) volatile { + __asm__ __volatile__("lock; xaddb %0, %1" : "+q"(c), "+m"(i)::"memory"); + return c; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + typedef T integral_type; + + private: + T i; }; -template -class platform_atomic_integral : public build_atomic_from_add > { -public: - typedef build_atomic_from_add > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} +template +class platform_atomic_integral + : public build_atomic_from_add > { + public: + typedef build_atomic_from_add > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class atomic_x86_16 { -public: - explicit atomic_x86_16(T v) : i(v) {} - atomic_x86_16() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after_load(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - if (order!=memory_order_seq_cst) { - fence_before(order); - *reinterpret_cast(&i)=v; - } else { - exchange(v); - } - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - T prev=expected; - __asm__ __volatile__("lock; cmpxchgw %1, %2\n" : "=a" (prev) - : "q" (desired), "m" (i), "a" (expected) : "memory"); - bool success=(prev==expected); - if (success) fence_after(success_order); - else fence_after(failure_order); - expected=prev; - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order order=memory_order_seq_cst) volatile - { - __asm__ __volatile__("xchgw %0, %1\n" : "=r" (r) : "m"(i), "0" (r) : "memory"); - return r; - } - T fetch_add(T c, memory_order order=memory_order_seq_cst) volatile - { - __asm__ __volatile__("lock; xaddw %0, %1" : "+r" (c), "+m" (i) :: "memory"); - return c; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - typedef T integral_type; -private: - T i; + public: + explicit atomic_x86_16(T v) : i(v) {} + atomic_x86_16() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after_load(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + if (order != memory_order_seq_cst) { + fence_before(order); + *reinterpret_cast(&i) = v; + } else { + exchange(v); + } + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + T prev = expected; + __asm__ __volatile__("lock; cmpxchgw %1, %2\n" + : "=a"(prev) + : "q"(desired), "m"(i), "a"(expected) + : "memory"); + bool success = (prev == expected); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = prev; + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order order = memory_order_seq_cst) volatile { + __asm__ __volatile__("xchgw %0, %1\n" + : "=r"(r) + : "m"(i), "0"(r) + : "memory"); + return r; + } + T fetch_add(T c, memory_order order = memory_order_seq_cst) volatile { + __asm__ __volatile__("lock; xaddw %0, %1" : "+r"(c), "+m"(i)::"memory"); + return c; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + typedef T integral_type; + + private: + T i; }; -template -class platform_atomic_integral : public build_atomic_from_add > { -public: - typedef build_atomic_from_add > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} +template +class platform_atomic_integral + : public build_atomic_from_add > { + public: + typedef build_atomic_from_add > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class atomic_x86_32 { -public: - explicit atomic_x86_32(T v) : i(v) {} - atomic_x86_32() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after_load(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - if (order!=memory_order_seq_cst) { - fence_before(order); - *reinterpret_cast(&i)=v; - } else { - exchange(v); - } - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - T prev=expected; - __asm__ __volatile__("lock; cmpxchgl %1, %2\n" : "=a" (prev) - : "q" (desired), "m" (i), "a" (expected) : "memory"); - bool success=(prev==expected); - if (success) fence_after(success_order); - else fence_after(failure_order); - expected=prev; - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order /*order*/=memory_order_seq_cst) volatile - { - __asm__ __volatile__("xchgl %0, %1\n" : "=r" (r) : "m"(i), "0" (r) : "memory"); - return r; - } - T fetch_add(T c, memory_order /*order*/=memory_order_seq_cst) volatile - { - __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (c), "+m" (i) :: "memory"); - return c; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - typedef T integral_type; -private: - T i; + public: + explicit atomic_x86_32(T v) : i(v) {} + atomic_x86_32() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after_load(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + if (order != memory_order_seq_cst) { + fence_before(order); + *reinterpret_cast(&i) = v; + } else { + exchange(v); + } + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + T prev = expected; + __asm__ __volatile__("lock; cmpxchgl %1, %2\n" + : "=a"(prev) + : "q"(desired), "m"(i), "a"(expected) + : "memory"); + bool success = (prev == expected); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = prev; + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order /*order*/ = memory_order_seq_cst) volatile { + __asm__ __volatile__("xchgl %0, %1\n" + : "=r"(r) + : "m"(i), "0"(r) + : "memory"); + return r; + } + T fetch_add(T c, memory_order /*order*/ = memory_order_seq_cst) volatile { + __asm__ __volatile__("lock; xaddl %0, %1" : "+r"(c), "+m"(i)::"memory"); + return c; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + typedef T integral_type; + + private: + T i; }; -template -class platform_atomic_integral : public build_atomic_from_add > { -public: - typedef build_atomic_from_add > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} +template +class platform_atomic_integral + : public build_atomic_from_add > { + public: + typedef build_atomic_from_add > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; #if __BOOST_AMD_64 -template +template class atomic_x86_64 { -public: - explicit atomic_x86_64(T v) : i(v) {} - atomic_x86_64() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after_load(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - if (order!=memory_order_seq_cst) { - fence_before(order); - *reinterpret_cast(&i)=v; - } else { - exchange(v); - } - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - fence_before(success_order); - T prev=expected; - __asm__ __volatile__("lock; cmpxchgq %1, %2\n" : "=a" (prev) - : "q" (desired), "m" (i), "a" (expected) : "memory"); - bool success=(prev==expected); - if (success) fence_after(success_order); - else fence_after(failure_order); - expected=prev; - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order /*order*/=memory_order_seq_cst) volatile - { - __asm__ __volatile__("xchgq %0, %1\n" : "=r" (r) : "m"(i), "0" (r) : "memory"); - return r; - } - T fetch_add(T c, memory_order /*order*/=memory_order_seq_cst) volatile - { - __asm__ __volatile__("lock; xaddq %0, %1" : "+r" (c), "+m" (i) :: "memory"); - return c; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - typedef T integral_type; -private: - T i; + public: + explicit atomic_x86_64(T v) : i(v) {} + atomic_x86_64() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after_load(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + if (order != memory_order_seq_cst) { + fence_before(order); + *reinterpret_cast(&i) = v; + } else { + exchange(v); + } + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + fence_before(success_order); + T prev = expected; + __asm__ __volatile__("lock; cmpxchgq %1, %2\n" + : "=a"(prev) + : "q"(desired), "m"(i), "a"(expected) + : "memory"); + bool success = (prev == expected); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = prev; + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order /*order*/ = memory_order_seq_cst) volatile { + __asm__ __volatile__("xchgq %0, %1\n" + : "=r"(r) + : "m"(i), "0"(r) + : "memory"); + return r; + } + T fetch_add(T c, memory_order /*order*/ = memory_order_seq_cst) volatile { + __asm__ __volatile__("lock; xaddq %0, %1" : "+r"(c), "+m"(i)::"memory"); + return c; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + typedef T integral_type; + + private: + T i; } __attribute__((aligned(8))); #elif defined(__i686__) -template +template class atomic_x86_64 { -private: - typedef atomic_x86_64 this_type; -public: - explicit atomic_x86_64(T v) : i(v) {} - atomic_x86_64() {} - - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { + private: + typedef atomic_x86_64 this_type; + + public: + explicit atomic_x86_64(T v) : i(v) {} + atomic_x86_64() {} + + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { #ifdef __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 - T prev = __sync_val_compare_and_swap_8(&i, expected, desired); - bool success=(prev==expected); - if (success) fence_after(success_order); - else fence_after(failure_order); - expected=prev; - return success; + T prev = __sync_val_compare_and_swap_8(&i, expected, desired); + bool success = (prev == expected); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = prev; + return success; #else - long scratch; - fence_before(success_order); - T prev=expected; - /* Make sure ebx is saved and restored properly in case - this object is compiled as "position independent". Since - programmers on x86 tend to forget specifying -DPIC or - similar, always assume PIC. - - To make this work uniformly even in the non-PIC case, - setup register constraints such that ebx can not be - used by accident e.g. as base address for the variable - to be modified. Accessing "scratch" should always be okay, - as it can only be placed on the stack (and therefore - accessed through ebp or esp only). - - In theory, could push/pop ebx onto/off the stack, but movs - to a prepared stack slot turn out to be faster. */ - __asm__ __volatile__( - "movl %%ebx, %1\n" - "movl %2, %%ebx\n" - "lock; cmpxchg8b 0(%4)\n" - "movl %1, %%ebx\n" - : "=A" (prev), "=m" (scratch) - : "D" ((long)desired), "c" ((long)((std::uint64_t)desired>>32)), - "S" (&i), "0" (prev) - : "memory"); - bool success=(prev==expected); - if (success) fence_after(success_order); - else fence_after(failure_order); - expected=prev; - return success; + long scratch; + fence_before(success_order); + T prev = expected; + /* Make sure ebx is saved and restored properly in case + this object is compiled as "position independent". Since + programmers on x86 tend to forget specifying -DPIC or + similar, always assume PIC. + + To make this work uniformly even in the non-PIC case, + setup register constraints such that ebx can not be + used by accident e.g. as base address for the variable + to be modified. Accessing "scratch" should always be okay, + as it can only be placed on the stack (and therefore + accessed through ebp or esp only). + + In theory, could push/pop ebx onto/off the stack, but movs + to a prepared stack slot turn out to be faster. */ + __asm__ __volatile__( + "movl %%ebx, %1\n" + "movl %2, %%ebx\n" + "lock; cmpxchg8b 0(%4)\n" + "movl %1, %%ebx\n" + : "=A"(prev), "=m"(scratch) + : "D"((long)desired), "c"((long)((std::uint64_t)desired >> 32)), + "S"(&i), "0"(prev) + : "memory"); + bool success = (prev == expected); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = prev; + return success; #endif - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order order=memory_order_seq_cst) volatile - { - T prev=i; - do {} while(!compare_exchange_strong(prev, r, order, memory_order_relaxed)); - return prev; - } - - T load(memory_order order=memory_order_seq_cst) const volatile - { - /* this is a bit problematic -- there is no other - way to atomically load a 64 bit value, but of course - compare_exchange requires write access to the memory - area */ - T expected=i; - do { } while(!const_cast(this)->compare_exchange_strong(expected, - expected, order, memory_order_relaxed)); - return expected; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - exchange(v, order); - } - T fetch_add(T c, memory_order order=memory_order_seq_cst) volatile - { - T expected=i, desired;; - do { - desired=expected+c; - } while(!compare_exchange_strong(expected, desired, - order, memory_order_relaxed)); - return expected; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - typedef T integral_type; -private: - T i; -} __attribute__((aligned(8))) ; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order order = memory_order_seq_cst) volatile { + T prev = i; + do { + } while (!compare_exchange_strong(prev, r, order, memory_order_relaxed)); + return prev; + } + + T load(memory_order order = memory_order_seq_cst) const volatile { + /* this is a bit problematic -- there is no other + way to atomically load a 64 bit value, but of course + compare_exchange requires write access to the memory + area */ + T expected = i; + do { + } while (!const_cast(this)->compare_exchange_strong( + expected, expected, order, memory_order_relaxed)); + return expected; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + exchange(v, order); + } + T fetch_add(T c, memory_order order = memory_order_seq_cst) volatile { + T expected = i, desired; + ; + do { + desired = expected + c; + } while (!compare_exchange_strong(expected, desired, order, + memory_order_relaxed)); + return expected; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + typedef T integral_type; + + private: + T i; +} __attribute__((aligned(8))); #endif #if __BOOST_AMD_64 || defined(__i686__) -template -class platform_atomic_integral : public build_atomic_from_add >{ -public: - typedef build_atomic_from_add > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} +template +class platform_atomic_integral + : public build_atomic_from_add > { + public: + typedef build_atomic_from_add > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; #endif -#if __BOOST_AMD_64 && \ - defined(BOOST_ATOMIC_HAVE_SSE2) && \ - defined(BOOST_ATOMIC_HAVE_GNU_SYNC_16) && \ +#if __BOOST_AMD_64 && defined(BOOST_ATOMIC_HAVE_SSE2) && \ + defined(BOOST_ATOMIC_HAVE_GNU_SYNC_16) && \ defined(BOOST_ATOMIC_HAVE_GNU_ALIGNED_16) -template +template class atomic_x86_128 { -public: - explicit atomic_x86_128(T v) : i(v) {} - atomic_x86_128() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after_load(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - if (order!=memory_order_seq_cst) { - fence_before(order); - *reinterpret_cast(&i)=v; - } else { - exchange(v); - } - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - T prev = __sync_val_compare_and_swap_16(&i, expected, desired); - bool success=(prev==expected); - if (success) fence_after(success_order); - else fence_after(failure_order); - expected=prev; - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order order=memory_order_seq_cst) volatile - { - while (!__sync_bool_compare_and_swap_16(&i, i, r)) - {}; - - return r; - } - T fetch_add(T c, memory_order order=memory_order_seq_cst) volatile - { - __sync_fetch_and_add(&i, c); - return c; - } - - bool is_lock_free(void) const volatile {return true;} -protected: - typedef T integral_type; -private: - T i; + public: + explicit atomic_x86_128(T v) : i(v) {} + atomic_x86_128() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after_load(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + if (order != memory_order_seq_cst) { + fence_before(order); + *reinterpret_cast(&i) = v; + } else { + exchange(v); + } + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + T prev = __sync_val_compare_and_swap_16(&i, expected, desired); + bool success = (prev == expected); + if (success) + fence_after(success_order); + else + fence_after(failure_order); + expected = prev; + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order order = memory_order_seq_cst) volatile { + while (!__sync_bool_compare_and_swap_16(&i, i, r)) { + }; + + return r; + } + T fetch_add(T c, memory_order order = memory_order_seq_cst) volatile { + __sync_fetch_and_add(&i, c); + return c; + } + + bool is_lock_free(void) const volatile { return true; } + + protected: + typedef T integral_type; + + private: + T i; } __attribute__((aligned(16))); -template -class platform_atomic_integral : public build_atomic_from_add >{ -public: - typedef build_atomic_from_add > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} +template +class platform_atomic_integral + : public build_atomic_from_add > { + public: + typedef build_atomic_from_add > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; #endif -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #undef __BOOST_AMD_64 diff --git a/external/atomic/boost/atomic/detail/generic-cas.hpp b/external/atomic/boost/atomic/detail/generic-cas.hpp index 4ef66369..50d1f6a4 100644 --- a/external/atomic/boost/atomic/detail/generic-cas.hpp +++ b/external/atomic/boost/atomic/detail/generic-cas.hpp @@ -19,181 +19,194 @@ are fully fenced (full memory barriers before and after each operation) */ #if defined(__GNUC__) - namespace boost { namespace detail { namespace atomic { - static inline int32_t - fenced_compare_exchange_strong_32(volatile int32_t *ptr, - int32_t expected, int32_t desired) - { - return __sync_val_compare_and_swap_4(ptr, expected, desired); - } - #define BOOST_ATOMIC_HAVE_CAS32 1 - - #if (defined(__amd64__) || defined(__x86_64__)) || defined(__i686__) - static inline int64_t - fenced_compare_exchange_strong_64(int64_t *ptr, int64_t expected, int64_t desired) - { - return __sync_val_compare_and_swap_8(ptr, expected, desired); - } - #define BOOST_ATOMIC_HAVE_CAS64 1 - #endif - }}} +namespace boost { +namespace detail { +namespace atomic { +static inline int32_t fenced_compare_exchange_strong_32(volatile int32_t *ptr, + int32_t expected, + int32_t desired) { + return __sync_val_compare_and_swap_4(ptr, expected, desired); +} +#define BOOST_ATOMIC_HAVE_CAS32 1 + +#if (defined(__amd64__) || defined(__x86_64__)) || defined(__i686__) +static inline int64_t fenced_compare_exchange_strong_64(int64_t *ptr, + int64_t expected, + int64_t desired) { + return __sync_val_compare_and_swap_8(ptr, expected, desired); +} +#define BOOST_ATOMIC_HAVE_CAS64 1 +#endif +} // namespace atomic +} // namespace detail +} // namespace boost #elif defined(__ICL) || defined(_MSC_VER) - #if defined(_MSC_VER) - #include - #include - #endif - - namespace boost { namespace detail { namespace atomic { - static inline int32_t - fenced_compare_exchange_strong(int32_t *ptr, int32_t expected, int32_t desired) - { - return _InterlockedCompareExchange(reinterpret_cast(ptr), - desired, expected); - } - #define BOOST_ATOMIC_HAVE_CAS32 1 - #if defined(_WIN64) - static inline int64_t - fenced_compare_exchange_strong(int64_t *ptr, int64_t expected, int64_t desired) - { - return _InterlockedCompareExchange64(ptr, desired, expected); - } - #define BOOST_ATOMIC_HAVE_CAS64 1 - #endif - }}} +#if defined(_MSC_VER) +#include +#include +#endif + +namespace boost { +namespace detail { +namespace atomic { +static inline int32_t fenced_compare_exchange_strong(int32_t *ptr, + int32_t expected, + int32_t desired) { + return _InterlockedCompareExchange(reinterpret_cast(ptr), + desired, expected); +} +#define BOOST_ATOMIC_HAVE_CAS32 1 +#if defined(_WIN64) +static inline int64_t fenced_compare_exchange_strong(int64_t *ptr, + int64_t expected, + int64_t desired) { + return _InterlockedCompareExchange64(ptr, desired, expected); +} +#define BOOST_ATOMIC_HAVE_CAS64 1 +#endif +} // namespace atomic +} // namespace detail +} // namespace boost #elif (defined(__ICC) || defined(__ECC)) - namespace boost { namespace detail { namespace atomic { - static inline int32_t - fenced_compare_exchange_strong_32(int32_t *ptr, int32_t expected, int32_t desired) - { - return _InterlockedCompareExchange((void*)ptr, desired, expected); - } - #define BOOST_ATOMIC_HAVE_CAS32 1 - #if defined(__x86_64) - static inline int64_t - fenced_compare_exchange_strong(int64_t *ptr, int64_t expected, int64_t desired) - { - return cas64(ptr, expected, desired); - } - #define BOOST_ATOMIC_HAVE_CAS64 1 - #elif defined(__ECC) //IA-64 version - static inline int64_t - fenced_compare_exchange_strong(int64_t *ptr, int64_t expected, int64_t desired) - { - return _InterlockedCompareExchange64((void*)ptr, desired, expected); - } - #define BOOST_ATOMIC_HAVE_CAS64 1 - #endif - }}} +namespace boost { +namespace detail { +namespace atomic { +static inline int32_t fenced_compare_exchange_strong_32(int32_t *ptr, + int32_t expected, + int32_t desired) { + return _InterlockedCompareExchange((void *)ptr, desired, expected); +} +#define BOOST_ATOMIC_HAVE_CAS32 1 +#if defined(__x86_64) +static inline int64_t fenced_compare_exchange_strong(int64_t *ptr, + int64_t expected, + int64_t desired) { + return cas64(ptr, expected, desired); +} +#define BOOST_ATOMIC_HAVE_CAS64 1 +#elif defined(__ECC) // IA-64 version +static inline int64_t fenced_compare_exchange_strong(int64_t *ptr, + int64_t expected, + int64_t desired) { + return _InterlockedCompareExchange64((void *)ptr, desired, expected); +} +#define BOOST_ATOMIC_HAVE_CAS64 1 +#endif +} // namespace atomic +} // namespace detail +} // namespace boost #elif (defined(__SUNPRO_CC) && defined(__sparc)) - #include - namespace boost { namespace detail { namespace atomic { - static inline int32_t - fenced_compare_exchange_strong_32(int32_t *ptr, int32_t expected, int32_t desired) - { - return atomic_cas_32((volatile unsigned int*)ptr, expected, desired); - } - #define BOOST_ATOMIC_HAVE_CAS32 1 - - /* FIXME: check for 64 bit mode */ - static inline int64_t - fenced_compare_exchange_strong_64(int64_t *ptr, int64_t expected, int64_t desired) - { - return atomic_cas_64((volatile unsigned long long*)ptr, expected, desired); - } - #define BOOST_ATOMIC_HAVE_CAS64 1 - }}} +#include +namespace boost { +namespace detail { +namespace atomic { +static inline int32_t fenced_compare_exchange_strong_32(int32_t *ptr, + int32_t expected, + int32_t desired) { + return atomic_cas_32((volatile unsigned int *)ptr, expected, desired); +} +#define BOOST_ATOMIC_HAVE_CAS32 1 + +/* FIXME: check for 64 bit mode */ +static inline int64_t fenced_compare_exchange_strong_64(int64_t *ptr, + int64_t expected, + int64_t desired) { + return atomic_cas_64((volatile unsigned long long *)ptr, expected, desired); +} +#define BOOST_ATOMIC_HAVE_CAS64 1 +} // namespace atomic +} // namespace detail +} // namespace boost #endif - -namespace boost { namespace detail { namespace atomic { +namespace boost { +namespace detail { +namespace atomic { #ifdef BOOST_ATOMIC_HAVE_CAS32 -template +template class atomic_generic_cas32 { -private: - typedef atomic_generic_cas32 this_type; -public: - explicit atomic_generic_cas32(T v) : i((int32_t)v) {} - atomic_generic_cas32() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T expected=(T)i; - do { } while(!const_cast(this)->compare_exchange_weak(expected, - expected, order, memory_order_relaxed)); - return expected; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - exchange(v); - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - T found; - found=(T)fenced_compare_exchange_strong_32(&i, (int32_t)expected, - (int32_t)desired); - bool success=(found==expected); - expected=found; - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order order=memory_order_seq_cst) volatile - { - T expected=(T)i; - do { } while(!compare_exchange_weak(expected, r, order, memory_order_relaxed)); - return expected; - } - - bool is_lock_free(void) const volatile {return true;} - typedef T integral_type; -private: - mutable int32_t i; + private: + typedef atomic_generic_cas32 this_type; + + public: + explicit atomic_generic_cas32(T v) : i((int32_t)v) {} + atomic_generic_cas32() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T expected = (T)i; + do { + } while (!const_cast(this)->compare_exchange_weak( + expected, expected, order, memory_order_relaxed)); + return expected; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + exchange(v); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + T found; + found = (T)fenced_compare_exchange_strong_32(&i, (int32_t)expected, + (int32_t)desired); + bool success = (found == expected); + expected = found; + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order order = memory_order_seq_cst) volatile { + T expected = (T)i; + do { + } while (!compare_exchange_weak(expected, r, order, memory_order_relaxed)); + return expected; + } + + bool is_lock_free(void) const volatile { return true; } + typedef T integral_type; + + private: + mutable int32_t i; }; -template +template class platform_atomic_integral : public build_atomic_from_exchange > { -public: - typedef build_atomic_from_exchange > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + public: + typedef build_atomic_from_exchange > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; #endif -} } } +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/integral-casts.hpp b/external/atomic/boost/atomic/detail/integral-casts.hpp index e17c3cf7..fb574978 100644 --- a/external/atomic/boost/atomic/detail/integral-casts.hpp +++ b/external/atomic/boost/atomic/detail/integral-casts.hpp @@ -12,473 +12,444 @@ #include -namespace boost { namespace detail { namespace atomic { +namespace boost { +namespace detail { +namespace atomic { -template +template class platform_atomic : private platform_atomic_integral { -public: - typedef platform_atomic_integral super; + public: + typedef platform_atomic_integral super; #if defined(BOOST_ATOMIC_ENFORCE_PODNESS) - typedef union { T e; std::uint8_t i;} conv; + typedef union { + T e; + std::uint8_t i; + } conv; #endif - platform_atomic() {} - explicit platform_atomic(T t) : super(to_integral(t)) - { - } - - void store(T t, memory_order order=memory_order_seq_cst) volatile - { - super::store(to_integral(t), order); - } - T load(memory_order order=memory_order_seq_cst) volatile const - { - return from_integral(super::load(order)); - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::uint8_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_strong(_expected, _desired, success_order, - failure_order); - expected=from_integral(_expected); - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::uint8_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_weak(_expected, _desired, success_order, - failure_order); - expected=from_integral(_expected); - return success; - } - - T exchange(T replacement, memory_order order=memory_order_seq_cst) volatile - { - return from_integral(super::exchange(to_integral(replacement), order)); - } - - operator T(void) const volatile {return load();} - T operator=(T v) volatile {store(v); return v;} - - using super::is_lock_free; -protected: - static inline std::uint8_t to_integral(T &t) - { - std::uint8_t tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } - static inline T from_integral(std::uint8_t t) - { - T tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } + platform_atomic() {} + explicit platform_atomic(T t) : super(to_integral(t)) {} + + void store(T t, memory_order order = memory_order_seq_cst) volatile { + super::store(to_integral(t), order); + } + T load(memory_order order = memory_order_seq_cst) volatile const { + return from_integral(super::load(order)); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + std::uint8_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_strong(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + std::uint8_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_weak(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + + T exchange(T replacement, + memory_order order = memory_order_seq_cst) volatile { + return from_integral(super::exchange(to_integral(replacement), order)); + } + + operator T(void) const volatile { return load(); } + T operator=(T v) volatile { + store(v); + return v; + } + + using super::is_lock_free; + + protected: + static inline std::uint8_t to_integral(T &t) { + std::uint8_t tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } + static inline T from_integral(std::uint8_t t) { + T tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } }; -template +template class platform_atomic : private platform_atomic_integral { -public: - typedef platform_atomic_integral super; + public: + typedef platform_atomic_integral super; #if defined(BOOST_ATOMIC_ENFORCE_PODNESS) - typedef union { T e; std::uint16_t i;} conv; + typedef union { + T e; + std::uint16_t i; + } conv; #endif - platform_atomic() {} - explicit platform_atomic(T t) : super(to_integral(t)) - { - } - - void store(T t, memory_order order=memory_order_seq_cst) volatile - { - super::store(to_integral(t), order); - } - T load(memory_order order=memory_order_seq_cst) volatile const - { - return from_integral(super::load(order)); - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::uint16_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_strong(_expected, _desired, success_order, - failure_order); - expected=from_integral(_expected); - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::uint16_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_weak(_expected, _desired, success_order, - failure_order); - expected=from_integral(_expected); - return success; - } - - T exchange(T replacement, memory_order order=memory_order_seq_cst) volatile - { - return from_integral(super::exchange(to_integral(replacement), order)); - } - - operator T(void) const volatile {return load();} - T operator=(T v) volatile {store(v); return v;} - - using super::is_lock_free; -protected: - static inline std::uint16_t to_integral(T &t) - { - std::uint16_t tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } - static inline T from_integral(std::uint16_t t) - { - T tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } + platform_atomic() {} + explicit platform_atomic(T t) : super(to_integral(t)) {} + + void store(T t, memory_order order = memory_order_seq_cst) volatile { + super::store(to_integral(t), order); + } + T load(memory_order order = memory_order_seq_cst) volatile const { + return from_integral(super::load(order)); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + std::uint16_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_strong(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + std::uint16_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_weak(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + + T exchange(T replacement, + memory_order order = memory_order_seq_cst) volatile { + return from_integral(super::exchange(to_integral(replacement), order)); + } + + operator T(void) const volatile { return load(); } + T operator=(T v) volatile { + store(v); + return v; + } + + using super::is_lock_free; + + protected: + static inline std::uint16_t to_integral(T &t) { + std::uint16_t tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } + static inline T from_integral(std::uint16_t t) { + T tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } }; -template +template class platform_atomic : private platform_atomic_integral { -public: - typedef platform_atomic_integral super; + public: + typedef platform_atomic_integral super; #if defined(BOOST_ATOMIC_ENFORCE_PODNESS) - typedef union { T e; std::uint32_t i;} conv; + typedef union { + T e; + std::uint32_t i; + } conv; #endif - platform_atomic() {} - explicit platform_atomic(T t) : super(to_integral(t)) - { - } - - void store(T t, memory_order order=memory_order_seq_cst) volatile - { - super::store(to_integral(t), order); - } - T load(memory_order order=memory_order_seq_cst) volatile const - { - return from_integral(super::load(order)); - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::uint32_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_strong(_expected, _desired, success_order, - failure_order); - expected=from_integral(_expected); - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::uint32_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_weak(_expected, _desired, success_order, - failure_order); - expected=from_integral(_expected); - return success; - } - - T exchange(T replacement, memory_order order=memory_order_seq_cst) volatile - { - return from_integral(super::exchange(to_integral(replacement), order)); - } - - operator T(void) const volatile {return load();} - T operator=(T v) volatile {store(v); return v;} - - using super::is_lock_free; -protected: - static inline std::uint32_t to_integral(T &t) - { - std::uint32_t tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } - static inline T from_integral(std::uint32_t t) - { - T tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } + platform_atomic() {} + explicit platform_atomic(T t) : super(to_integral(t)) {} + + void store(T t, memory_order order = memory_order_seq_cst) volatile { + super::store(to_integral(t), order); + } + T load(memory_order order = memory_order_seq_cst) volatile const { + return from_integral(super::load(order)); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + std::uint32_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_strong(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + std::uint32_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_weak(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + + T exchange(T replacement, + memory_order order = memory_order_seq_cst) volatile { + return from_integral(super::exchange(to_integral(replacement), order)); + } + + operator T(void) const volatile { return load(); } + T operator=(T v) volatile { + store(v); + return v; + } + + using super::is_lock_free; + + protected: + static inline std::uint32_t to_integral(T &t) { + std::uint32_t tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } + static inline T from_integral(std::uint32_t t) { + T tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } }; -template +template class platform_atomic : private platform_atomic_integral { -public: - typedef platform_atomic_integral super; + public: + typedef platform_atomic_integral super; #if defined(BOOST_ATOMIC_ENFORCE_PODNESS) - typedef union { T e; std::uint64_t i;} conv; + typedef union { + T e; + std::uint64_t i; + } conv; #endif - platform_atomic() {} - explicit platform_atomic(T t) : super(to_integral(t)) - { - } - - void store(T t, memory_order order=memory_order_seq_cst) volatile - { - super::store(to_integral(t), order); - } - T load(memory_order order=memory_order_seq_cst) volatile const - { - return from_integral(super::load(order)); - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::uint64_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_strong(_expected, _desired, success_order, - failure_order); - expected=from_integral(_expected); - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::uint64_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_weak(_expected, _desired, success_order, - failure_order); - expected=from_integral(_expected); - return success; - } - - T exchange(T replacement, memory_order order=memory_order_seq_cst) volatile - { - return from_integral(super::exchange(to_integral(replacement), order)); - } - - operator T(void) const volatile {return load();} - T operator=(T v) volatile {store(v); return v;} - - using super::is_lock_free; -protected: - static inline std::uint64_t to_integral(T &t) - { - std::uint64_t tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } - static inline T from_integral(std::uint64_t t) - { - T tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } + platform_atomic() {} + explicit platform_atomic(T t) : super(to_integral(t)) {} + + void store(T t, memory_order order = memory_order_seq_cst) volatile { + super::store(to_integral(t), order); + } + T load(memory_order order = memory_order_seq_cst) volatile const { + return from_integral(super::load(order)); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + std::uint64_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_strong(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + std::uint64_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_weak(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + + T exchange(T replacement, + memory_order order = memory_order_seq_cst) volatile { + return from_integral(super::exchange(to_integral(replacement), order)); + } + + operator T(void) const volatile { return load(); } + T operator=(T v) volatile { + store(v); + return v; + } + + using super::is_lock_free; + + protected: + static inline std::uint64_t to_integral(T &t) { + std::uint64_t tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } + static inline T from_integral(std::uint64_t t) { + T tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } }; #if (defined(__amd64__) || defined(__x86_64__)) && \ - defined(BOOST_ATOMIC_HAVE_SSE2) && \ - defined(BOOST_ATOMIC_HAVE_GNU_SYNC_16) && \ - defined(BOOST_ATOMIC_HAVE_GNU_ALIGNED_16) && \ + defined(BOOST_ATOMIC_HAVE_SSE2) && \ + defined(BOOST_ATOMIC_HAVE_GNU_SYNC_16) && \ + defined(BOOST_ATOMIC_HAVE_GNU_ALIGNED_16) && \ defined(BOOST_ATOMIC_HAVE_GNU_128BIT_INTEGERS) #define BOOST_ATOMIC_HAVE_128BIT_SUPPORT -template +template class platform_atomic : private platform_atomic_integral<__uint128_t> { -public: - typedef platform_atomic_integral<__uint128_t> super; + public: + typedef platform_atomic_integral<__uint128_t> super; #if defined(BOOST_ATOMIC_ENFORCE_PODNESS) - typedef union { T e; __uint128_t i;} conv; + typedef union { + T e; + __uint128_t i; + } conv; #endif - platform_atomic() {} - explicit platform_atomic(T t) : super(to_integral(t)) - { - } - - void store(T t, memory_order order=memory_order_seq_cst) volatile - { - super::store(to_integral(t), order); - } - T load(memory_order order=memory_order_seq_cst) volatile const - { - return from_integral(super::load(order)); - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - __uint128_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_strong(_expected, _desired, - success_order, failure_order); - expected=from_integral(_expected); - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - __uint128_t _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_weak(_expected, _desired, - success_order, failure_order); - expected=from_integral(_expected); - return success; - } - - T exchange(T replacement, memory_order order=memory_order_seq_cst) volatile - { - return from_integral(super::exchange(to_integral(replacement), order)); - } - - operator T(void) const volatile {return load();} - T operator=(T v) volatile {store(v); return v;} - - using super::is_lock_free; -protected: - static inline __uint128_t to_integral(T &t) - { - __uint128_t tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } - static inline T from_integral(__uint128_t t) - { - T tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } + platform_atomic() {} + explicit platform_atomic(T t) : super(to_integral(t)) {} + + void store(T t, memory_order order = memory_order_seq_cst) volatile { + super::store(to_integral(t), order); + } + T load(memory_order order = memory_order_seq_cst) volatile const { + return from_integral(super::load(order)); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + __uint128_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_strong(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + __uint128_t _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_weak(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + + T exchange(T replacement, + memory_order order = memory_order_seq_cst) volatile { + return from_integral(super::exchange(to_integral(replacement), order)); + } + + operator T(void) const volatile { return load(); } + T operator=(T v) volatile { + store(v); + return v; + } + + using super::is_lock_free; + + protected: + static inline __uint128_t to_integral(T &t) { + __uint128_t tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } + static inline T from_integral(__uint128_t t) { + T tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } }; #elif BOOST_MSVC >= 1500 && (defined(_M_IA64) || defined(_M_AMD64)) && \ - defined(BOOST_ATOMIC_HAVE_SSE2) + defined(BOOST_ATOMIC_HAVE_SSE2) #define BOOST_ATOMIC_HAVE_128BIT_SUPPORT - -}}} +} +} +} #include -namespace boost { namespace detail { namespace atomic { +namespace boost { +namespace detail { +namespace atomic { -template +template class platform_atomic : private platform_atomic_integral<__m128i> { -public: - typedef platform_atomic_integral<__m128i> super; + public: + typedef platform_atomic_integral<__m128i> super; #if defined(BOOST_ATOMIC_ENFORCE_PODNESS) - typedef union { T e; __m128i i;} conv; + typedef union { + T e; + __m128i i; + } conv; #endif - platform_atomic() {} - explicit platform_atomic(T t) : super(to_integral(t)) - { - } - - void store(T t, memory_order order=memory_order_seq_cst) volatile - { - super::store(to_integral(t), order); - } - T load(memory_order order=memory_order_seq_cst) volatile const - { - return from_integral(super::load(order)); - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - __m128i _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_strong(_expected, _desired, - success_order, failure_order); - expected=from_integral(_expected); - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - __m128i _expected, _desired; - _expected=to_integral(expected); - _desired=to_integral(desired); - bool success=super::compare_exchange_weak(_expected, _desired, - success_order, failure_order); - expected=from_integral(_expected); - return success; - } - - T exchange(T replacement, memory_order order=memory_order_seq_cst) volatile - { - return from_integral(super::exchange(to_integral(replacement), order)); - } - - operator T(void) const volatile {return load();} - T operator=(T v) volatile {store(v); return v;} - - using super::is_lock_free; -protected: - static inline __m128i to_integral(T &t) - { - __m128i tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } - static inline T from_integral(__m128i t) - { - T tmp; - memcpy(&tmp, &t, sizeof(t)); - return tmp; - } + platform_atomic() {} + explicit platform_atomic(T t) : super(to_integral(t)) {} + + void store(T t, memory_order order = memory_order_seq_cst) volatile { + super::store(to_integral(t), order); + } + T load(memory_order order = memory_order_seq_cst) volatile const { + return from_integral(super::load(order)); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + __m128i _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_strong(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + __m128i _expected, _desired; + _expected = to_integral(expected); + _desired = to_integral(desired); + bool success = super::compare_exchange_weak(_expected, _desired, + success_order, failure_order); + expected = from_integral(_expected); + return success; + } + + T exchange(T replacement, + memory_order order = memory_order_seq_cst) volatile { + return from_integral(super::exchange(to_integral(replacement), order)); + } + + operator T(void) const volatile { return load(); } + T operator=(T v) volatile { + store(v); + return v; + } + + using super::is_lock_free; + + protected: + static inline __m128i to_integral(T &t) { + __m128i tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } + static inline T from_integral(__m128i t) { + T tmp; + memcpy(&tmp, &t, sizeof(t)); + return tmp; + } }; #endif -} } } +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/interlocked.hpp b/external/atomic/boost/atomic/detail/interlocked.hpp index 9196f190..3acc73e9 100644 --- a/external/atomic/boost/atomic/detail/interlocked.hpp +++ b/external/atomic/boost/atomic/detail/interlocked.hpp @@ -16,358 +16,338 @@ #include -namespace boost { namespace detail { namespace atomic { +namespace boost { +namespace detail { +namespace atomic { -static inline void full_fence(void) -{ - long tmp; - BOOST_INTERLOCKED_EXCHANGE(&tmp, 0); +static inline void full_fence(void) { + long tmp; + BOOST_INTERLOCKED_EXCHANGE(&tmp, 0); } -template<> -inline void platform_atomic_thread_fence(memory_order order) -{ - switch(order) { - case memory_order_seq_cst: - full_fence(); - default:; - } +template <> +inline void platform_atomic_thread_fence(memory_order order) { + switch (order) { + case memory_order_seq_cst: + full_fence(); + default:; + } } -static inline void fence_after_load(memory_order order) -{ - switch(order) { - case memory_order_seq_cst: - full_fence(); - case memory_order_acquire: - case memory_order_acq_rel: - default:; - } +static inline void fence_after_load(memory_order order) { + switch (order) { + case memory_order_seq_cst: + full_fence(); + case memory_order_acquire: + case memory_order_acq_rel: + default:; + } } - -template +template class atomic_interlocked_32 { -public: - explicit atomic_interlocked_32(T v) : i(v) {} - atomic_interlocked_32() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after_load(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - if (order!=memory_order_seq_cst) { - *reinterpret_cast(&i)=v; - } else { - exchange(v); - } - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - T prev=expected; - expected=(T)BOOST_INTERLOCKED_COMPARE_EXCHANGE((long *)(&i), - (long)desired, (long)expected); - bool success=(prev==expected); - return success; + public: + explicit atomic_interlocked_32(T v) : i(v) {} + atomic_interlocked_32() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after_load(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + if (order != memory_order_seq_cst) { + *reinterpret_cast(&i) = v; + } else { + exchange(v); } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order order=memory_order_seq_cst) volatile - { - return (T)BOOST_INTERLOCKED_EXCHANGE((long *)&i, (long)r); - } - T fetch_add(T c, memory_order order=memory_order_seq_cst) volatile - { - return (T)BOOST_INTERLOCKED_EXCHANGE_ADD((long *)&i, c); - } - - bool is_lock_free(void) const volatile {return true;} - - typedef T integral_type; -private: - T i; + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + T prev = expected; + expected = (T)BOOST_INTERLOCKED_COMPARE_EXCHANGE( + (long *)(&i), (long)desired, (long)expected); + bool success = (prev == expected); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order order = memory_order_seq_cst) volatile { + return (T)BOOST_INTERLOCKED_EXCHANGE((long *)&i, (long)r); + } + T fetch_add(T c, memory_order order = memory_order_seq_cst) volatile { + return (T)BOOST_INTERLOCKED_EXCHANGE_ADD((long *)&i, c); + } + + bool is_lock_free(void) const volatile { return true; } + + typedef T integral_type; + + private: + T i; }; -}}} +} // namespace atomic +} // namespace detail +} // namespace boost -# if defined(_M_IA64) || defined(_M_AMD64) +#if defined(_M_IA64) || defined(_M_AMD64) -#if defined( BOOST_USE_WINDOWS_H ) +#if defined(BOOST_USE_WINDOWS_H) -# include +#include -# define BOOST_INTERLOCKED_EXCHANGE_ADD64 InterlockedExchangeAdd64 -# define BOOST_INTERLOCKED_EXCHANGE64 InterlockedExchange64 -# define BOOST_INTERLOCKED_COMPARE_EXCHANGE64 InterlockedCompareExchange64 +#define BOOST_INTERLOCKED_EXCHANGE_ADD64 InterlockedExchangeAdd64 +#define BOOST_INTERLOCKED_EXCHANGE64 InterlockedExchange64 +#define BOOST_INTERLOCKED_COMPARE_EXCHANGE64 InterlockedCompareExchange64 #else -extern "C" std::int64_t __cdecl _InterlockedExchangeAdd64(std::int64_t volatile *, - std::int64_t); +extern "C" std::int64_t __cdecl _InterlockedExchangeAdd64( + std::int64_t volatile *, std::int64_t); extern "C" std::int64_t __cdecl _InterlockedExchange64(std::int64_t volatile *, - std::int64_t); + std::int64_t); extern "C" std::int64_t __cdecl _InterlockedCompareExchange64( std::int64_t volatile *, std::int64_t, std::int64_t); -# pragma intrinsic( _InterlockedExchangeAdd64 ) -# pragma intrinsic( _InterlockedExchange64 ) -# pragma intrinsic( _InterlockedCompareExchange64 ) +#pragma intrinsic(_InterlockedExchangeAdd64) +#pragma intrinsic(_InterlockedExchange64) +#pragma intrinsic(_InterlockedCompareExchange64) -# define BOOST_INTERLOCKED_EXCHANGE_ADD64 _InterlockedExchangeAdd64 -# define BOOST_INTERLOCKED_EXCHANGE64 _InterlockedExchange64 -# define BOOST_INTERLOCKED_COMPARE_EXCHANGE64 _InterlockedCompareExchange64 +#define BOOST_INTERLOCKED_EXCHANGE_ADD64 _InterlockedExchangeAdd64 +#define BOOST_INTERLOCKED_EXCHANGE64 _InterlockedExchange64 +#define BOOST_INTERLOCKED_COMPARE_EXCHANGE64 _InterlockedCompareExchange64 #endif -namespace boost { namespace detail { namespace atomic { +namespace boost { +namespace detail { +namespace atomic { -template +template class __declspec(align(8)) atomic_interlocked_64 { -public: - explicit atomic_interlocked_64(T v) : i(v) {} - atomic_interlocked_64() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=*reinterpret_cast(&i); - fence_after_load(order); - return v; + public: + explicit atomic_interlocked_64(T v) : i(v) {} + atomic_interlocked_64() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = *reinterpret_cast(&i); + fence_after_load(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + if (order != memory_order_seq_cst) { + *reinterpret_cast(&i) = v; + } else { + exchange(v); } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - if (order!=memory_order_seq_cst) { - *reinterpret_cast(&i)=v; - } else { - exchange(v); - } - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - T prev=expected; - expected=(T)BOOST_INTERLOCKED_COMPARE_EXCHANGE64((std::int64_t *)(&i), - (std::int64_t)desired, (std::int64_t)expected); - bool success=(prev==expected); - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); - } - T exchange(T r, memory_order order=memory_order_seq_cst) volatile - { - return (T)BOOST_INTERLOCKED_EXCHANGE64((std::int64_t *)&i, (std::int64_t)r); - } - T fetch_add(T c, memory_order order=memory_order_seq_cst) volatile - { - return (T)BOOST_INTERLOCKED_EXCHANGE_ADD64((std::int64_t *)&i, c); - } - - bool is_lock_free(void) const volatile {return true;} - - typedef T integral_type; -private: - T i; + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + T prev = expected; + expected = (T)BOOST_INTERLOCKED_COMPARE_EXCHANGE64( + (std::int64_t *)(&i), (std::int64_t)desired, (std::int64_t)expected); + bool success = (prev == expected); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order order = memory_order_seq_cst) volatile { + return (T)BOOST_INTERLOCKED_EXCHANGE64((std::int64_t *)&i, (std::int64_t)r); + } + T fetch_add(T c, memory_order order = memory_order_seq_cst) volatile { + return (T)BOOST_INTERLOCKED_EXCHANGE_ADD64((std::int64_t *)&i, c); + } + + bool is_lock_free(void) const volatile { return true; } + + typedef T integral_type; + + private: + T i; }; -}}} +} // namespace atomic +} // namespace detail +} // namespace boost // _InterlockedCompareExchange128 is available only starting with VS2008 #if BOOST_MSVC >= 1500 && defined(BOOST_ATOMIC_HAVE_SSE2) -# include +#include extern "C" unsigned char __cdecl _InterlockedCompareExchange128( - std::int64_t volatile *Destination, - std::int64_t ExchangeHigh, std::int64_t ExchangeLow, - std::int64_t *Comparand); -extern "C" __m128i _mm_load_si128(__m128i const*_P); + std::int64_t volatile *Destination, std::int64_t ExchangeHigh, + std::int64_t ExchangeLow, std::int64_t *Comparand); +extern "C" __m128i _mm_load_si128(__m128i const *_P); extern "C" void _mm_store_si128(__m128i *_P, __m128i _B); -# pragma intrinsic( _InterlockedCompareExchange128 ) -# pragma intrinsic( _mm_load_si128 ) -# pragma intrinsic( _mm_store_si128 ) +#pragma intrinsic(_InterlockedCompareExchange128) +#pragma intrinsic(_mm_load_si128) +#pragma intrinsic(_mm_store_si128) -# define BOOST_INTERLOCKED_COMPARE_EXCHANGE128 _InterlockedCompareExchange128 +#define BOOST_INTERLOCKED_COMPARE_EXCHANGE128 _InterlockedCompareExchange128 -namespace boost { namespace detail { namespace atomic { +namespace boost { +namespace detail { +namespace atomic { -template +template class __declspec(align(16)) atomic_interlocked_128 { -public: - explicit atomic_interlocked_128(T v) : i(v) {} - atomic_interlocked_128() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v; - if (order!=memory_order_seq_cst) { - v = *(T const*)(&i); - } - else { - v = _mm_load_si128((__m128i const*)(&i)); - } - fence_after_load(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - if (order!=memory_order_seq_cst) { - *(T*)(&i)=v; - } - else { - _mm_store_si128(*(__m128i*)(&i), v); - } + public: + explicit atomic_interlocked_128(T v) : i(v) {} + atomic_interlocked_128() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v; + if (order != memory_order_seq_cst) { + v = *(T const *)(&i); + } else { + v = _mm_load_si128((__m128i const *)(&i)); } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - std::int64_t* desired_raw = (std::int64_t*)&desired; - T prev = *(__m128i*)(&i); - bool success = BOOST_INTERLOCKED_COMPARE_EXCHANGE128( - (std::int64_t volatile *)(&i), - desired_raw[1], desired_raw[0], (std::int64_t*)&expected) != 0; - if (!success) - expected = prev; - return success; + fence_after_load(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + if (order != memory_order_seq_cst) { + *(T *)(&i) = v; + } else { + _mm_store_si128(*(__m128i *)(&i), v); } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, failure_order); + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + std::int64_t *desired_raw = (std::int64_t *)&desired; + T prev = *(__m128i *)(&i); + bool success = BOOST_INTERLOCKED_COMPARE_EXCHANGE128( + (std::int64_t volatile *)(&i), desired_raw[1], + desired_raw[0], (std::int64_t *)&expected) != 0; + if (!success) expected = prev; + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T r, memory_order order = memory_order_seq_cst) volatile { + std::int64_t *desired_raw = (std::int64_t *)&r; + T prev = i; + + while (!BOOST_INTERLOCKED_COMPARE_EXCHANGE128( + (std::int64_t volatile *)&i, desired_raw[1], desired_raw[0], + (std::int64_t *)&i)) { } - T exchange(T r, memory_order order=memory_order_seq_cst) volatile - { - std::int64_t* desired_raw = (std::int64_t*)&r; - T prev = i; - while (!BOOST_INTERLOCKED_COMPARE_EXCHANGE128( - (std::int64_t volatile*)&i, desired_raw[1], desired_raw[0], - (std::int64_t*)&i)) - {} + return prev; + } + T fetch_add(T c, memory_order order = memory_order_seq_cst) volatile { + T expected = i; + __m128i desired; - return prev; - } - T fetch_add(T c, memory_order order=memory_order_seq_cst) volatile - { - T expected = i; - __m128i desired; + do { + desired = _mm_add_epi32(*(__m128i *)(&expected), *(__m128i *)(&c)); + } while (!compare_exchange_strong(expected, *(T *)(&desired), order, + memory_order_relaxed)); - do { - desired = _mm_add_epi32(*(__m128i*)(&expected), *(__m128i*)(&c)); - } while (!compare_exchange_strong(expected, *(T*)(&desired), - order, memory_order_relaxed)); + return expected; + } - return expected; - } + bool is_lock_free(void) const volatile { return true; } - bool is_lock_free(void) const volatile {return true;} + typedef T integral_type; - typedef T integral_type; -private: - T i; + private: + T i; }; -}}} +} // namespace atomic +} // namespace detail +} // namespace boost #endif #endif -namespace boost { namespace detail { namespace atomic { +namespace boost { +namespace detail { +namespace atomic { -template +template class platform_atomic_integral : public build_atomic_from_add > { -public: - typedef build_atomic_from_add > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + public: + typedef build_atomic_from_add > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> + super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template +template class platform_atomic_integral : public build_atomic_from_larger_type, T> { -public: - typedef build_atomic_from_larger_type, T> super; + public: + typedef build_atomic_from_larger_type, T> + super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -# if defined(_M_IA64) || defined(_M_AMD64) -template +#if defined(_M_IA64) || defined(_M_AMD64) +template class platform_atomic_integral - : public build_atomic_from_add > -{ -public: - typedef build_atomic_from_add > super; + : public build_atomic_from_add > { + public: + typedef build_atomic_from_add > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; -template<> -class platform_atomic_integral - : public build_atomic_from_add > -{ -public: - typedef build_atomic_from_add > super; +template <> +class platform_atomic_integral + : public build_atomic_from_add > { + public: + typedef build_atomic_from_add > super; - explicit platform_atomic_integral(void* v) : super(v) {} - platform_atomic_integral(void) {} + explicit platform_atomic_integral(void *v) : super(v) {} + platform_atomic_integral(void) {} }; #if BOOST_MSVC >= 1500 && defined(BOOST_ATOMIC_HAVE_SSE2) -template +template class platform_atomic_integral - : public build_atomic_from_add > -{ -public: - typedef build_atomic_from_add > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + : public build_atomic_from_add > { + public: + typedef build_atomic_from_add > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; #endif #endif -}}} +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/linux-arm.hpp b/external/atomic/boost/atomic/detail/linux-arm.hpp index d6eb068d..2df045b8 100644 --- a/external/atomic/boost/atomic/detail/linux-arm.hpp +++ b/external/atomic/boost/atomic/detail/linux-arm.hpp @@ -17,7 +17,6 @@ namespace boost { namespace detail { namespace atomic { - // Different ARM processors have different atomic instructions. In particular, // architecture versions before v6 (which are still in widespread use, e.g. the // Intel/Marvell XScale chips like the one in the NSLU2) have only atomic swap. @@ -30,145 +29,128 @@ namespace atomic { // For documentation, see arch/arm/kernel/entry-armv.S in the kernel source // (search for "User Helpers"). - -typedef void (kernel_dmb_t)(void); +typedef void(kernel_dmb_t)(void); #define BOOST_ATOMIC_KERNEL_DMB (*(kernel_dmb_t *)0xffff0fa0) -static inline void fence_before(memory_order order) -{ - switch(order) { - // FIXME I really don't know which of these cases should call - // kernel_dmb() and which shouldn't... - case memory_order_consume: - case memory_order_release: - case memory_order_acq_rel: - case memory_order_seq_cst: - BOOST_ATOMIC_KERNEL_DMB(); - default:; - } +static inline void fence_before(memory_order order) { + switch (order) { + // FIXME I really don't know which of these cases should call + // kernel_dmb() and which shouldn't... + case memory_order_consume: + case memory_order_release: + case memory_order_acq_rel: + case memory_order_seq_cst: + BOOST_ATOMIC_KERNEL_DMB(); + default:; + } } -static inline void fence_after(memory_order order) -{ - switch(order) { - // FIXME I really don't know which of these cases should call - // kernel_dmb() and which shouldn't... - case memory_order_acquire: - case memory_order_acq_rel: - case memory_order_seq_cst: - BOOST_ATOMIC_KERNEL_DMB(); - default:; - } +static inline void fence_after(memory_order order) { + switch (order) { + // FIXME I really don't know which of these cases should call + // kernel_dmb() and which shouldn't... + case memory_order_acquire: + case memory_order_acq_rel: + case memory_order_seq_cst: + BOOST_ATOMIC_KERNEL_DMB(); + default:; + } } #undef BOOST_ATOMIC_KERNEL_DMB - -template +template class atomic_linux_arm_4 { - - typedef int (kernel_cmpxchg_t)(T oldval, T newval, volatile T *ptr); -# define BOOST_ATOMIC_KERNEL_CMPXCHG (*(kernel_cmpxchg_t *)0xffff0fc0) - // Returns 0 if *ptr was changed. - -public: - explicit atomic_linux_arm_4(T v) : i(v) {} - atomic_linux_arm_4() {} - T load(memory_order order=memory_order_seq_cst) const volatile - { - T v=const_cast(i); - fence_after(order); - return v; - } - void store(T v, memory_order order=memory_order_seq_cst) volatile - { - fence_before(order); - const_cast(i)=v; - } - bool compare_exchange_strong( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - // Aparently we can consider kernel_cmpxchg to be strong if it is retried - // by the kernel after being interrupted, which I think it is. - // Also it seems that when an ll/sc implementation is used the kernel - // loops until the store succeeds. - bool success = BOOST_ATOMIC_KERNEL_CMPXCHG(expected,desired,&i)==0; - if (!success) expected = load(memory_order_relaxed); - return success; - } - bool compare_exchange_weak( - T &expected, - T desired, - memory_order success_order, - memory_order failure_order) volatile - { - return compare_exchange_strong(expected, desired, success_order, - failure_order); - } - T exchange(T replacement, memory_order order=memory_order_seq_cst) volatile - { - // Copied from build_exchange. - T o=load(memory_order_relaxed); - do {} while(!compare_exchange_weak(o, replacement, order, order)); - return o; - // Note that ARM has an atomic swap instruction that we could use here: - // T oldval; - // asm volatile ("swp\t%0, %1, [%2]" : "=&r"(oldval) : "r" (replacement), - // "r" (&i) : "memory"); - // return oldval; - // This instruction is deprecated in architecture >= 6. I'm unsure - // how inefficient its implementation is on those newer architectures. - // I don't think this would gain much since exchange() is not used often. - } - - bool is_lock_free(void) const volatile {return true;} - - typedef T integral_type; -private: - T i; - -# undef BOOST_ATOMIC_KERNEL_CMPXCHG - + typedef int(kernel_cmpxchg_t)(T oldval, T newval, volatile T *ptr); +#define BOOST_ATOMIC_KERNEL_CMPXCHG (*(kernel_cmpxchg_t *)0xffff0fc0) + // Returns 0 if *ptr was changed. + + public: + explicit atomic_linux_arm_4(T v) : i(v) {} + atomic_linux_arm_4() {} + T load(memory_order order = memory_order_seq_cst) const volatile { + T v = const_cast(i); + fence_after(order); + return v; + } + void store(T v, memory_order order = memory_order_seq_cst) volatile { + fence_before(order); + const_cast(i) = v; + } + bool compare_exchange_strong(T &expected, T desired, + memory_order success_order, + memory_order failure_order) volatile { + // Aparently we can consider kernel_cmpxchg to be strong if it is retried + // by the kernel after being interrupted, which I think it is. + // Also it seems that when an ll/sc implementation is used the kernel + // loops until the store succeeds. + bool success = BOOST_ATOMIC_KERNEL_CMPXCHG(expected, desired, &i) == 0; + if (!success) expected = load(memory_order_relaxed); + return success; + } + bool compare_exchange_weak(T &expected, T desired, memory_order success_order, + memory_order failure_order) volatile { + return compare_exchange_strong(expected, desired, success_order, + failure_order); + } + T exchange(T replacement, + memory_order order = memory_order_seq_cst) volatile { + // Copied from build_exchange. + T o = load(memory_order_relaxed); + do { + } while (!compare_exchange_weak(o, replacement, order, order)); + return o; + // Note that ARM has an atomic swap instruction that we could use here: + // T oldval; + // asm volatile ("swp\t%0, %1, [%2]" : "=&r"(oldval) : "r" (replacement), + // "r" (&i) : "memory"); + // return oldval; + // This instruction is deprecated in architecture >= 6. I'm unsure + // how inefficient its implementation is on those newer architectures. + // I don't think this would gain much since exchange() is not used often. + } + + bool is_lock_free(void) const volatile { return true; } + + typedef T integral_type; + + private: + T i; + +#undef BOOST_ATOMIC_KERNEL_CMPXCHG }; -template +template class platform_atomic_integral : public build_atomic_from_exchange > { -public: - typedef build_atomic_from_exchange > super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + public: + typedef build_atomic_from_exchange > super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; - -template +template class platform_atomic_integral - : public build_atomic_from_larger_type, T > { -public: - typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + : public build_atomic_from_larger_type, T> { + public: + typedef build_atomic_from_larger_type, T> super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; - -template +template class platform_atomic_integral - : public build_atomic_from_larger_type, T > { -public: - typedef build_atomic_from_larger_type, T> super; - explicit platform_atomic_integral(T v) : super(v) {} - platform_atomic_integral(void) {} + : public build_atomic_from_larger_type, T> { + public: + typedef build_atomic_from_larger_type, T> super; + explicit platform_atomic_integral(T v) : super(v) {} + platform_atomic_integral(void) {} }; - typedef atomic_linux_arm_4 platform_atomic_address; - -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/detail/valid_integral_types.hpp b/external/atomic/boost/atomic/detail/valid_integral_types.hpp index 2a62614a..d2b12926 100644 --- a/external/atomic/boost/atomic/detail/valid_integral_types.hpp +++ b/external/atomic/boost/atomic/detail/valid_integral_types.hpp @@ -13,33 +13,78 @@ namespace boost { namespace detail { namespace atomic { -template struct is_integral_type {typedef void test;}; +template +struct is_integral_type { + typedef void test; +}; -template<> struct is_integral_type {typedef int test;}; +template <> +struct is_integral_type { + typedef int test; +}; -template<> struct is_integral_type {typedef int test;}; -template<> struct is_integral_type {typedef int test;}; -template<> struct is_integral_type {typedef int test;}; -template<> struct is_integral_type {typedef int test;}; -template<> struct is_integral_type {typedef int test;}; -template<> struct is_integral_type {typedef int test;}; -template<> struct is_integral_type {typedef int test;}; -template<> struct is_integral_type {typedef int test;}; +template <> +struct is_integral_type { + typedef int test; +}; +template <> +struct is_integral_type { + typedef int test; +}; +template <> +struct is_integral_type { + typedef int test; +}; +template <> +struct is_integral_type { + typedef int test; +}; +template <> +struct is_integral_type { + typedef int test; +}; +template <> +struct is_integral_type { + typedef int test; +}; +template <> +struct is_integral_type { + typedef int test; +}; +template <> +struct is_integral_type { + typedef int test; +}; #ifdef BOOST_HAS_LONG_LONG -template<> struct is_integral_type {typedef int test;}; -template<> struct is_integral_type {typedef int test;}; +template <> +struct is_integral_type { + typedef int test; +}; +template <> +struct is_integral_type { + typedef int test; +}; #endif #ifdef BOOST_ATOMIC_HAVE_GNU_128BIT_INTEGERS -template<> struct is_integral_type<__uint128_t> {typedef int test;}; -template<> struct is_integral_type<__int128_t> {typedef int test;}; +template <> +struct is_integral_type<__uint128_t> { + typedef int test; +}; +template <> +struct is_integral_type<__int128_t> { + typedef int test; +}; #endif #if BOOST_MSVC >= 1500 && (defined(_M_IA64) || defined(_M_AMD64)) && \ defined(BOOST_ATOMIC_HAVE_SSE2) #include -template<> struct is_integral_type<__m128i> {typedef int test;}; +template <> +struct is_integral_type<__m128i> { + typedef int test; +}; #endif -} -} -} +} // namespace atomic +} // namespace detail +} // namespace boost #endif diff --git a/external/atomic/boost/atomic/platform.hpp b/external/atomic/boost/atomic/platform.hpp index d0068fd1..911c9fd7 100644 --- a/external/atomic/boost/atomic/platform.hpp +++ b/external/atomic/boost/atomic/platform.hpp @@ -9,37 +9,38 @@ #if (defined(__GNUC__) || defined(__INTEL_COMPILER)) && \ (defined(__i386__) || defined(__amd64__) || defined(__x86_64__)) - #include +#include #elif defined(__GNUC__) && defined(__alpha__) - #include +#include #elif defined(__GNUC__) && (defined(__POWERPC__) || defined(__PPC__)) - #include +#include // This list of ARM architecture versions comes from Apple's arm/arch.h header. // I don't know how complete it is. -#elif defined(__GNUC__) && (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) \ - || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \ - || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_7A__)) +#elif defined(__GNUC__) && \ + (defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \ + defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || \ + defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_7A__)) - #include +#include #elif defined(__linux__) && defined(__arm__) - #include +#include -#elif defined(BOOST_USE_WINDOWS_H) || defined(_WIN32_CE) || defined(BOOST_MSVC) \ - || defined(BOOST_INTEL_WIN) || defined(WIN32) || defined(_WIN32) \ - || defined(__WIN32__) || defined(__CYGWIN__) +#elif defined(BOOST_USE_WINDOWS_H) || defined(_WIN32_CE) || \ + defined(BOOST_MSVC) || defined(BOOST_INTEL_WIN) || defined(WIN32) || \ + defined(_WIN32) || defined(__WIN32__) || defined(__CYGWIN__) - #include +#include #else - #warning "Using slow fallback atomic implementation" - #include +#warning "Using slow fallback atomic implementation" +#include #endif diff --git a/external/lockfree/boost/lockfree/detail/atomic.hpp b/external/lockfree/boost/lockfree/detail/atomic.hpp index 8e990647..7b6a0e1b 100644 --- a/external/lockfree/boost/lockfree/detail/atomic.hpp +++ b/external/lockfree/boost/lockfree/detail/atomic.hpp @@ -9,12 +9,12 @@ #if !defined(BOOST_NO_0X_HDR_ATOMIC) #ifdef __GNUC__ -# if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6) - || !defined(__GXX_EXPERIMENTAL_CXX0X__) -# define BOOST_NO_0X_HDR_ATOMIC -# endif +#if __GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6) +|| !defined(__GXX_EXPERIMENTAL_CXX0X__) +#define BOOST_NO_0X_HDR_ATOMIC +#endif #else -# define BOOST_NO_0X_HDR_ATOMIC +#define BOOST_NO_0X_HDR_ATOMIC #endif #endif @@ -24,16 +24,16 @@ #include #endif -namespace boost { -namespace lockfree { -namespace detail { + namespace boost { + namespace lockfree { + namespace detail { #ifdef BOOST_NO_0X_HDR_ATOMIC -using boost::atomic; -using boost::memory_order_acquire; -using boost::memory_order_consume; -using boost::memory_order_relaxed; -using boost::memory_order_release; + using boost::atomic; + using boost::memory_order_acquire; + using boost::memory_order_consume; + using boost::memory_order_relaxed; + using boost::memory_order_release; #else using std::atomic; using std::memory_order_acquire; @@ -42,13 +42,14 @@ using std::memory_order_relaxed; using std::memory_order_release; #endif -} -using detail::atomic; -using detail::memory_order_acquire; -using detail::memory_order_consume; -using detail::memory_order_relaxed; -using detail::memory_order_release; + } // namespace detail + using detail::atomic; + using detail::memory_order_acquire; + using detail::memory_order_consume; + using detail::memory_order_relaxed; + using detail::memory_order_release; -}} + } // namespace lockfree +} // namespace boost #endif /* BOOST_LOCKFREE_DETAIL_ATOMIC_HPP */ diff --git a/external/lockfree/boost/lockfree/detail/branch_hints.hpp b/external/lockfree/boost/lockfree/detail/branch_hints.hpp index 7b5ed24d..8b3dbb8e 100644 --- a/external/lockfree/boost/lockfree/detail/branch_hints.hpp +++ b/external/lockfree/boost/lockfree/detail/branch_hints.hpp @@ -10,29 +10,25 @@ #ifndef BOOST_LOCKFREE_BRANCH_HINTS_HPP_INCLUDED #define BOOST_LOCKFREE_BRANCH_HINTS_HPP_INCLUDED -namespace boost -{ -namespace lockfree -{ - /** \brief hint for the branch prediction */ - inline bool likely(bool expr) - { +namespace boost { +namespace lockfree { +/** \brief hint for the branch prediction */ +inline bool likely(bool expr) { #ifdef __GNUC__ - return __builtin_expect(expr, true); + return __builtin_expect(expr, true); #else - return expr; + return expr; #endif - } +} - /** \brief hint for the branch prediction */ - inline bool unlikely(bool expr) - { +/** \brief hint for the branch prediction */ +inline bool unlikely(bool expr) { #ifdef __GNUC__ - return __builtin_expect(expr, false); + return __builtin_expect(expr, false); #else - return expr; + return expr; #endif - } +} } /* namespace lockfree */ } /* namespace boost */ diff --git a/external/lockfree/boost/lockfree/detail/freelist.hpp b/external/lockfree/boost/lockfree/detail/freelist.hpp index 6083c827..5efd3bad 100644 --- a/external/lockfree/boost/lockfree/detail/freelist.hpp +++ b/external/lockfree/boost/lockfree/detail/freelist.hpp @@ -21,218 +21,177 @@ #include #include -#include /* for std::min */ +#include /* for std::min */ #include -namespace boost -{ -namespace lockfree -{ -namespace detail -{ +namespace boost { +namespace lockfree { +namespace detail { -struct freelist_node -{ - tagged_ptr next; +struct freelist_node { + tagged_ptr next; }; -template - > -class freelist_stack: - Alloc -{ - typedef tagged_ptr tagged_node_ptr; - -public: - freelist_stack (std::size_t n = 0): - pool_(tagged_node_ptr(nullptr)) - { - reserve_unsafe(n); - } - - void reserve (std::size_t count) - { - for (std::size_t i = 0; i != count; ++i) { - T * node = Alloc::allocate(1); - deallocate(node); - } - } +template > +class freelist_stack : Alloc { + typedef tagged_ptr tagged_node_ptr; - void reserve_unsafe (std::size_t count) - { - for (std::size_t i = 0; i != count; ++i) { - T * node = Alloc::allocate(1); - deallocate_unsafe(node); - } - } - - T * construct (void) - { - T * node = allocate(); - if (node) - new(node) T(); - return node; - } + public: + freelist_stack(std::size_t n = 0) : pool_(tagged_node_ptr(nullptr)) { + reserve_unsafe(n); + } - template - T * construct (ArgumentType const & arg) - { - T * node = allocate(); - if (node) - new(node) T(arg); - return node; + void reserve(std::size_t count) { + for (std::size_t i = 0; i != count; ++i) { + T* node = Alloc::allocate(1); + deallocate(node); } + } - T * construct_unsafe (void) - { - T * node = allocate_unsafe(); - if (node) - new(node) T(); - return node; + void reserve_unsafe(std::size_t count) { + for (std::size_t i = 0; i != count; ++i) { + T* node = Alloc::allocate(1); + deallocate_unsafe(node); } - - template - T * construct_unsafe (ArgumentType const & arg) - { - T * node = allocate_unsafe(); - if (node) - new(node) T(arg); - return node; - } - - - void destruct (T * n) - { - n->~T(); - deallocate(n); + } + + T* construct(void) { + T* node = allocate(); + if (node) new (node) T(); + return node; + } + + template + T* construct(ArgumentType const& arg) { + T* node = allocate(); + if (node) new (node) T(arg); + return node; + } + + T* construct_unsafe(void) { + T* node = allocate_unsafe(); + if (node) new (node) T(); + return node; + } + + template + T* construct_unsafe(ArgumentType const& arg) { + T* node = allocate_unsafe(); + if (node) new (node) T(arg); + return node; + } + + void destruct(T* n) { + n->~T(); + deallocate(n); + } + + void destruct_unsafe(T* n) { + n->~T(); + deallocate_unsafe(n); + } + + T* allocate(void) { + tagged_node_ptr old_pool = pool_.load(memory_order_consume); + + for (;;) { + if (!old_pool.get_ptr()) { + if (allocate_may_allocate) + return Alloc::allocate(1); + else + return 0; + } + + freelist_node* new_pool_ptr = old_pool->next.get_ptr(); + tagged_node_ptr new_pool(new_pool_ptr, old_pool.get_tag() + 1); + + if (pool_.compare_exchange_weak(old_pool, new_pool)) { + void* ptr = old_pool.get_ptr(); + return reinterpret_cast(ptr); + } } + } - void destruct_unsafe (T * n) - { - n->~T(); - deallocate_unsafe(n); - } + T* allocate_unsafe(void) { + tagged_node_ptr old_pool = pool_.load(memory_order_relaxed); - T * allocate (void) - { - tagged_node_ptr old_pool = pool_.load(memory_order_consume); - - for(;;) { - if (!old_pool.get_ptr()) { - if (allocate_may_allocate) - return Alloc::allocate(1); - else - return 0; - } - - freelist_node * new_pool_ptr = old_pool->next.get_ptr(); - tagged_node_ptr new_pool (new_pool_ptr, old_pool.get_tag() + 1); - - if (pool_.compare_exchange_weak(old_pool, new_pool)) { - void * ptr = old_pool.get_ptr(); - return reinterpret_cast(ptr); - } - } + if (!old_pool.get_ptr()) { + if (allocate_may_allocate) + return Alloc::allocate(1); + else + return 0; } - T * allocate_unsafe (void) - { - tagged_node_ptr old_pool = pool_.load(memory_order_relaxed); - - if (!old_pool.get_ptr()) { - if (allocate_may_allocate) - return Alloc::allocate(1); - else - return 0; - } - - freelist_node * new_pool_ptr = old_pool->next.get_ptr(); - tagged_node_ptr new_pool (new_pool_ptr, old_pool.get_tag() + 1); + freelist_node* new_pool_ptr = old_pool->next.get_ptr(); + tagged_node_ptr new_pool(new_pool_ptr, old_pool.get_tag() + 1); - pool_.store(new_pool, memory_order_relaxed); - void * ptr = old_pool.get_ptr(); - return reinterpret_cast(ptr); - } + pool_.store(new_pool, memory_order_relaxed); + void* ptr = old_pool.get_ptr(); + return reinterpret_cast(ptr); + } - void deallocate (T * n) - { - void * node = n; - tagged_node_ptr old_pool = pool_.load(memory_order_consume); - freelist_node * new_pool_ptr = reinterpret_cast(node); + void deallocate(T* n) { + void* node = n; + tagged_node_ptr old_pool = pool_.load(memory_order_consume); + freelist_node* new_pool_ptr = reinterpret_cast(node); - for(;;) { - tagged_node_ptr new_pool (new_pool_ptr, old_pool.get_tag()); - new_pool->next.set_ptr(old_pool.get_ptr()); + for (;;) { + tagged_node_ptr new_pool(new_pool_ptr, old_pool.get_tag()); + new_pool->next.set_ptr(old_pool.get_ptr()); - if (pool_.compare_exchange_weak(old_pool, new_pool)) - return; - } + if (pool_.compare_exchange_weak(old_pool, new_pool)) return; } + } - void deallocate_unsafe (T * n) - { - void * node = n; - tagged_node_ptr old_pool = pool_.load(memory_order_relaxed); - freelist_node * new_pool_ptr = reinterpret_cast(node); + void deallocate_unsafe(T* n) { + void* node = n; + tagged_node_ptr old_pool = pool_.load(memory_order_relaxed); + freelist_node* new_pool_ptr = reinterpret_cast(node); - tagged_node_ptr new_pool (new_pool_ptr, old_pool.get_tag()); - new_pool->next.set_ptr(old_pool.get_ptr()); + tagged_node_ptr new_pool(new_pool_ptr, old_pool.get_tag()); + new_pool->next.set_ptr(old_pool.get_ptr()); - pool_.store(new_pool, memory_order_relaxed); - } + pool_.store(new_pool, memory_order_relaxed); + } - ~freelist_stack(void) - { - tagged_node_ptr current (pool_); + ~freelist_stack(void) { + tagged_node_ptr current(pool_); - while (current) { - freelist_node * current_ptr = current.get_ptr(); - if (current_ptr) - current = current_ptr->next; - Alloc::deallocate(reinterpret_cast(current_ptr), 1); - } + while (current) { + freelist_node* current_ptr = current.get_ptr(); + if (current_ptr) current = current_ptr->next; + Alloc::deallocate(reinterpret_cast(current_ptr), 1); } + } - bool is_lock_free(void) const - { - return pool_.is_lock_free(); - } + bool is_lock_free(void) const { return pool_.is_lock_free(); } -private: - atomic pool_; + private: + atomic pool_; }; } /* namespace detail */ - ////////////////////////////////////////////////////////////////////////////// // backwards compatibility template > -class caching_freelist : public detail::freelist_stack -{ -public: - caching_freelist (std::size_t n = 0) - : detail::freelist_stack(n) - {} +class caching_freelist : public detail::freelist_stack { + public: + caching_freelist(std::size_t n = 0) + : detail::freelist_stack(n) {} }; template > -class static_freelist : public detail::freelist_stack -{ -public: - static_freelist (std::size_t n = 0) - : detail::freelist_stack(n) - {} +class static_freelist : public detail::freelist_stack { + public: + static_freelist(std::size_t n = 0) + : detail::freelist_stack(n) {} }; - struct caching_freelist_t {}; struct static_freelist_t {}; - - } /* namespace lockfree */ } /* namespace boost */ diff --git a/external/lockfree/boost/lockfree/detail/prefix.hpp b/external/lockfree/boost/lockfree/detail/prefix.hpp index c7d5bcb4..f5b84ad6 100644 --- a/external/lockfree/boost/lockfree/detail/prefix.hpp +++ b/external/lockfree/boost/lockfree/detail/prefix.hpp @@ -12,9 +12,9 @@ /* this file defines the following macros: BOOST_LOCKFREE_CACHELINE_BYTES: size of a cache line BOOST_LOCKFREE_PTR_COMPRESSION: use tag/pointer compression to utilize parts - of the virtual address space as tag (at least 16bit) - BOOST_LOCKFREE_DCAS_ALIGNMENT: symbol used for aligning structs at cache line - boundaries + of the virtual address space as tag (at least + 16bit) BOOST_LOCKFREE_DCAS_ALIGNMENT: symbol used for aligning structs at + cache line boundaries */ #define BOOST_LOCKFREE_CACHELINE_BYTES 64 @@ -22,13 +22,13 @@ #ifdef _MSC_VER #define BOOST_LOCKFREE_CACHELINE_ALIGNMENT \ - __declspec(align(BOOST_LOCKFREE_CACHELINE_BYTES)) + __declspec(align(BOOST_LOCKFREE_CACHELINE_BYTES)) #if defined(_M_IX86) - #define BOOST_LOCKFREE_DCAS_ALIGNMENT +#define BOOST_LOCKFREE_DCAS_ALIGNMENT #elif defined(_M_X64) || defined(_M_IA64) - #define BOOST_LOCKFREE_PTR_COMPRESSION 1 - #define BOOST_LOCKFREE_DCAS_ALIGNMENT __declspec(align(16)) +#define BOOST_LOCKFREE_PTR_COMPRESSION 1 +#define BOOST_LOCKFREE_DCAS_ALIGNMENT __declspec(align(16)) #endif #endif /* _MSC_VER */ @@ -36,17 +36,17 @@ #ifdef __GNUC__ #define BOOST_LOCKFREE_CACHELINE_ALIGNMENT \ - __attribute__((aligned(BOOST_LOCKFREE_CACHELINE_BYTES))) + __attribute__((aligned(BOOST_LOCKFREE_CACHELINE_BYTES))) #if defined(__i386__) || defined(__ppc__) - #define BOOST_LOCKFREE_DCAS_ALIGNMENT +#define BOOST_LOCKFREE_DCAS_ALIGNMENT #elif defined(__x86_64__) - #define BOOST_LOCKFREE_PTR_COMPRESSION 1 - #define BOOST_LOCKFREE_DCAS_ALIGNMENT __attribute__((aligned(16))) +#define BOOST_LOCKFREE_PTR_COMPRESSION 1 +#define BOOST_LOCKFREE_DCAS_ALIGNMENT __attribute__((aligned(16))) #elif defined(__alpha__) - // LATER: alpha may benefit from pointer compression. - // but what is the maximum size of the address space? - #define BOOST_LOCKFREE_DCAS_ALIGNMENT +// LATER: alpha may benefit from pointer compression. +// but what is the maximum size of the address space? +#define BOOST_LOCKFREE_DCAS_ALIGNMENT #endif #endif /* __GNUC__ */ diff --git a/external/lockfree/boost/lockfree/detail/tagged_ptr_dcas.hpp b/external/lockfree/boost/lockfree/detail/tagged_ptr_dcas.hpp index 8bc68386..9a695931 100644 --- a/external/lockfree/boost/lockfree/detail/tagged_ptr_dcas.hpp +++ b/external/lockfree/boost/lockfree/detail/tagged_ptr_dcas.hpp @@ -9,7 +9,7 @@ #ifndef BOOST_LOCKFREE_TAGGED_PTR_DCAS_HPP_INCLUDED #define BOOST_LOCKFREE_TAGGED_PTR_DCAS_HPP_INCLUDED -#include /* for std::size_t */ +#include /* for std::size_t */ #include #include @@ -19,112 +19,79 @@ namespace lockfree { namespace detail { template -class BOOST_LOCKFREE_DCAS_ALIGNMENT tagged_ptr -{ -public: - typedef std::size_t tag_t; +class BOOST_LOCKFREE_DCAS_ALIGNMENT tagged_ptr { + public: + typedef std::size_t tag_t; - /** uninitialized constructor */ - tagged_ptr(void) BOOST_NOEXCEPT//: ptr(0), tag(0) - {} + /** uninitialized constructor */ + tagged_ptr(void) BOOST_NOEXCEPT //: ptr(0), tag(0) + {} #ifdef BOOST_NO_CXX11_DEFAULTED_FUNCTIONS - tagged_ptr(tagged_ptr const & p): - ptr(p.ptr), tag(p.tag) - {} + tagged_ptr(tagged_ptr const& p) : ptr(p.ptr), tag(p.tag) {} #else - tagged_ptr(tagged_ptr const & p) = default; + tagged_ptr(tagged_ptr const& p) = default; #endif - explicit tagged_ptr(T * p, tag_t t = 0): - ptr(p), tag(t) - {} + explicit tagged_ptr(T* p, tag_t t = 0) : ptr(p), tag(t) {} - /** unsafe set operation */ - /* @{ */ + /** unsafe set operation */ + /* @{ */ #ifdef BOOST_NO_CXX11_DEFAULTED_FUNCTIONS - tagged_ptr & operator= (tagged_ptr const & p) - { - set(p.ptr, p.tag); - return *this; - } + tagged_ptr& operator=(tagged_ptr const& p) { + set(p.ptr, p.tag); + return *this; + } #else - tagged_ptr & operator= (tagged_ptr const & p) = default; + tagged_ptr& operator=(tagged_ptr const& p) = default; #endif - void set(T * p, tag_t t) - { - ptr = p; - tag = t; - } - /* @} */ - - /** comparing semantics */ - /* @{ */ - bool operator== (volatile tagged_ptr const & p) const - { - return (ptr == p.ptr) && (tag == p.tag); - } - - bool operator!= (volatile tagged_ptr const & p) const - { - return !operator==(p); - } - /* @} */ - - /** pointer access */ - /* @{ */ - T * get_ptr(void) const - { - return ptr; - } - - void set_ptr(T * p) - { - ptr = p; - } - /* @} */ - - /** tag access */ - /* @{ */ - tag_t get_tag() const - { - return tag; - } - - tag_t get_next_tag() const - { - tag_t next = (get_tag() + 1) & (std::numeric_limits::max)(); - return next; - } - - void set_tag(tag_t t) - { - tag = t; - } - /* @} */ - - /** smart pointer support */ - /* @{ */ - T & operator*() const - { - return *ptr; - } - - T * operator->() const - { - return ptr; - } - - operator bool(void) const - { - return ptr != 0; - } - /* @} */ - -protected: - T * ptr; - tag_t tag; + void set(T* p, tag_t t) { + ptr = p; + tag = t; + } + /* @} */ + + /** comparing semantics */ + /* @{ */ + bool operator==(volatile tagged_ptr const& p) const { + return (ptr == p.ptr) && (tag == p.tag); + } + + bool operator!=(volatile tagged_ptr const& p) const { return !operator==(p); } + /* @} */ + + /** pointer access */ + /* @{ */ + T* get_ptr(void) const { return ptr; } + + void set_ptr(T* p) { ptr = p; } + /* @} */ + + /** tag access */ + /* @{ */ + tag_t get_tag() const { return tag; } + + tag_t get_next_tag() const { + tag_t next = (get_tag() + 1) & (std::numeric_limits::max)(); + return next; + } + + void set_tag(tag_t t) { tag = t; } + /* @} */ + + /** smart pointer support */ + /* @{ */ + T& operator*() const { return *ptr; } + + T* operator->() const { return ptr; } + + operator bool(void) const { return ptr != 0; } + /* @} */ + + protected: + T* ptr; + tag_t tag; }; } /* namespace detail */ diff --git a/external/lockfree/boost/lockfree/detail/tagged_ptr_ptrcompression.hpp b/external/lockfree/boost/lockfree/detail/tagged_ptr_ptrcompression.hpp index 668abed8..f2f69c53 100644 --- a/external/lockfree/boost/lockfree/detail/tagged_ptr_ptrcompression.hpp +++ b/external/lockfree/boost/lockfree/detail/tagged_ptr_ptrcompression.hpp @@ -9,7 +9,7 @@ #ifndef BOOST_LOCKFREE_TAGGED_PTR_PTRCOMPRESSION_HPP_INCLUDED #define BOOST_LOCKFREE_TAGGED_PTR_PTRCOMPRESSION_HPP_INCLUDED -#include /* for std::size_t */ +#include /* for std::size_t */ #include #include @@ -19,149 +19,112 @@ namespace boost { namespace lockfree { namespace detail { -#if defined (__x86_64__) || defined (_M_X64) +#if defined(__x86_64__) || defined(_M_X64) template -class tagged_ptr -{ - typedef std::uint64_t compressed_ptr_t; - -public: - typedef std::uint16_t tag_t; - -private: - union cast_unit - { - compressed_ptr_t value; - tag_t tag[4]; - }; - - static const int tag_index = 3; - static const compressed_ptr_t ptr_mask = 0xffffffffffffUL; //(1L<<48L)-1; - - static T* extract_ptr(volatile compressed_ptr_t const & i) - { - return reinterpret_cast(i & ptr_mask); - } - - static tag_t extract_tag(volatile compressed_ptr_t const & i) - { - cast_unit cu; - cu.value = i; - return cu.tag[tag_index]; - } - - static compressed_ptr_t pack_ptr(T * ptr, int tag) - { - cast_unit ret; - ret.value = compressed_ptr_t(ptr); - ret.tag[tag_index] = tag; - return ret.value; - } - -public: - /** uninitialized constructor */ - tagged_ptr(void) BOOST_NOEXCEPT//: ptr(0), tag(0) - {} - - /** copy constructor */ +class tagged_ptr { + typedef std::uint64_t compressed_ptr_t; + + public: + typedef std::uint16_t tag_t; + + private: + union cast_unit { + compressed_ptr_t value; + tag_t tag[4]; + }; + + static const int tag_index = 3; + static const compressed_ptr_t ptr_mask = 0xffffffffffffUL; //(1L<<48L)-1; + + static T* extract_ptr(volatile compressed_ptr_t const& i) { + return reinterpret_cast(i & ptr_mask); + } + + static tag_t extract_tag(volatile compressed_ptr_t const& i) { + cast_unit cu; + cu.value = i; + return cu.tag[tag_index]; + } + + static compressed_ptr_t pack_ptr(T* ptr, int tag) { + cast_unit ret; + ret.value = compressed_ptr_t(ptr); + ret.tag[tag_index] = tag; + return ret.value; + } + + public: + /** uninitialized constructor */ + tagged_ptr(void) BOOST_NOEXCEPT //: ptr(0), tag(0) + {} + + /** copy constructor */ #ifdef BOOST_NO_CXX11_DEFAULTED_FUNCTIONS - tagged_ptr(tagged_ptr const & p): - ptr(p.ptr) - {} + tagged_ptr(tagged_ptr const& p) : ptr(p.ptr) {} #else - tagged_ptr(tagged_ptr const & p) = default; + tagged_ptr(tagged_ptr const& p) = default; #endif - explicit tagged_ptr(T * p, tag_t t = 0): - ptr(pack_ptr(p, t)) - {} + explicit tagged_ptr(T* p, tag_t t = 0) : ptr(pack_ptr(p, t)) {} - /** unsafe set operation */ - /* @{ */ + /** unsafe set operation */ + /* @{ */ #ifdef BOOST_NO_CXX11_DEFAULTED_FUNCTIONS - tagged_ptr & operator= (tagged_ptr const & p) - { - ptr = p.ptr; - return *this; - } + tagged_ptr& operator=(tagged_ptr const& p) { + ptr = p.ptr; + return *this; + } #else - tagged_ptr & operator= (tagged_ptr const & p) = default; + tagged_ptr& operator=(tagged_ptr const& p) = default; #endif - void set(T * p, tag_t t) - { - ptr = pack_ptr(p, t); - } - /* @} */ - - /** comparing semantics */ - /* @{ */ - bool operator== (volatile tagged_ptr const & p) const - { - return (ptr == p.ptr); - } - - bool operator!= (volatile tagged_ptr const & p) const - { - return !operator==(p); - } - /* @} */ - - /** pointer access */ - /* @{ */ - T * get_ptr() const - { - return extract_ptr(ptr); - } - - void set_ptr(T * p) - { - tag_t tag = get_tag(); - ptr = pack_ptr(p, tag); - } - /* @} */ - - /** tag access */ - /* @{ */ - tag_t get_tag() const - { - return extract_tag(ptr); - } - - tag_t get_next_tag() const - { - tag_t next = (get_tag() + 1) & (std::numeric_limits::max)(); - return next; - } - - void set_tag(tag_t t) - { - T * p = get_ptr(); - ptr = pack_ptr(p, t); - } - /* @} */ - - /** smart pointer support */ - /* @{ */ - T & operator*() const - { - return *get_ptr(); - } - - T * operator->() const - { - return get_ptr(); - } - - operator bool(void) const - { - return get_ptr() != 0; - } - /* @} */ - -protected: - compressed_ptr_t ptr; + void set(T* p, tag_t t) { ptr = pack_ptr(p, t); } + /* @} */ + + /** comparing semantics */ + /* @{ */ + bool operator==(volatile tagged_ptr const& p) const { return (ptr == p.ptr); } + + bool operator!=(volatile tagged_ptr const& p) const { return !operator==(p); } + /* @} */ + + /** pointer access */ + /* @{ */ + T* get_ptr() const { return extract_ptr(ptr); } + + void set_ptr(T* p) { + tag_t tag = get_tag(); + ptr = pack_ptr(p, tag); + } + /* @} */ + + /** tag access */ + /* @{ */ + tag_t get_tag() const { return extract_tag(ptr); } + + tag_t get_next_tag() const { + tag_t next = (get_tag() + 1) & (std::numeric_limits::max)(); + return next; + } + + void set_tag(tag_t t) { + T* p = get_ptr(); + ptr = pack_ptr(p, t); + } + /* @} */ + + /** smart pointer support */ + /* @{ */ + T& operator*() const { return *get_ptr(); } + + T* operator->() const { return get_ptr(); } + + operator bool(void) const { return get_ptr() != 0; } + /* @} */ + + protected: + compressed_ptr_t ptr; }; #else #error unsupported platform diff --git a/external/lockfree/boost/lockfree/queue.hpp b/external/lockfree/boost/lockfree/queue.hpp index 16f2b346..9346c1f7 100644 --- a/external/lockfree/boost/lockfree/queue.hpp +++ b/external/lockfree/boost/lockfree/queue.hpp @@ -1,6 +1,7 @@ // lock-free fifo queue from // Michael, M. M. and Scott, M. L., -// "simple, fast and practical non-blocking and blocking concurrent queue algorithms" +// "simple, fast and practical non-blocking and blocking concurrent queue +// algorithms" // // implementation for c++ // @@ -26,297 +27,265 @@ #include #include -#include /* std::auto_ptr */ +#include /* std::auto_ptr */ namespace boost { namespace lockfree { namespace detail { template -class queue: - boost::noncopyable -{ -private: +class queue : boost::noncopyable { + private: #ifndef BOOST_DOXYGEN_INVOKED - BOOST_STATIC_ASSERT(boost::is_pod::value); - - struct BOOST_LOCKFREE_CACHELINE_ALIGNMENT node - { - typedef tagged_ptr tagged_node_ptr; - - node(T const & v): - data(v) - { - /* increment tag to avoid ABA problem */ - tagged_node_ptr old_next = next.load(memory_order_relaxed); - tagged_node_ptr new_next (nullptr, old_next.get_tag()+1); - next.store(new_next, memory_order_release); - } - - node (void): - next(tagged_node_ptr(nullptr, 0)) - {} - - atomic next; - T data; - }; + BOOST_STATIC_ASSERT(boost::is_pod::value); + struct BOOST_LOCKFREE_CACHELINE_ALIGNMENT node { typedef tagged_ptr tagged_node_ptr; - typedef typename Alloc::template rebind::other node_allocator; - - typedef typename boost::mpl::if_, - detail::freelist_stack, - detail::freelist_stack - >::type pool_t; - - void initialize(void) - { - node * n = pool.construct(); - tagged_node_ptr dummy_node(n, 0); - head_.store(dummy_node, memory_order_relaxed); - tail_.store(dummy_node, memory_order_release); + node(T const& v) : data(v) { + /* increment tag to avoid ABA problem */ + tagged_node_ptr old_next = next.load(memory_order_relaxed); + tagged_node_ptr new_next(nullptr, old_next.get_tag() + 1); + next.store(new_next, memory_order_release); } -#endif -public: - /** - * \return true, if implementation is lock-free. - * - * \warning \b Warning: It only checks, if the queue head node is lockfree. - * On most platforms, the whole implementation is - * lockfree, if this is true. Using c++0x-style atomics, there - * is no possibility to provide a completely - * accurate implementation, because one would need to test every - * internal node, which is impossible - * if further nodes will be allocated from the operating system. - * */ - bool is_lock_free (void) const - { - return head_.is_lock_free() && pool.is_lock_free(); - } + node(void) : next(tagged_node_ptr(nullptr, 0)) {} - //! Construct queue. - queue(void) - { - pool.reserve_unsafe(1); - initialize(); - } + atomic next; + T data; + }; - //! Construct queue, allocate n nodes for the freelist. - explicit queue(std::size_t n) - { - pool.reserve_unsafe(n+1); - initialize(); - } + typedef tagged_ptr tagged_node_ptr; - //! \copydoc boost::lockfree::stack::reserve - void reserve(std::size_t n) - { - pool.reserve(n); - } + typedef typename Alloc::template rebind::other node_allocator; - //! \copydoc boost::lockfree::stack::reserve_unsafe - void reserve_unsafe(std::size_t n) - { - pool.reserve_unsafe(n); - } + typedef typename boost::mpl::if_< + boost::is_same, + detail::freelist_stack, + detail::freelist_stack >::type pool_t; - /** Destroys queue, free all nodes from freelist. - * */ - ~queue(void) - { - if (!empty()) { - T dummy; - while(pop_unsafe(dummy)) - ; - } - pool.destruct(head_.load(memory_order_relaxed).get_ptr()); - } + void initialize(void) { + node* n = pool.construct(); + tagged_node_ptr dummy_node(n, 0); + head_.store(dummy_node, memory_order_relaxed); + tail_.store(dummy_node, memory_order_release); + } +#endif - /** Check if the ringbuffer is empty - * - * \warning Not thread-safe, use for debugging purposes only - * */ - bool empty(void) - { - return head_.load().get_ptr() == tail_.load().get_ptr(); + public: + /** + * \return true, if implementation is lock-free. + * + * \warning \b Warning: It only checks, if the queue head node is lockfree. + * On most platforms, the whole implementation is + * lockfree, if this is true. Using c++0x-style atomics, + * there is no possibility to provide a completely accurate implementation, + * because one would need to test every internal node, which is impossible if + * further nodes will be allocated from the operating system. + * */ + bool is_lock_free(void) const { + return head_.is_lock_free() && pool.is_lock_free(); + } + + //! Construct queue. + queue(void) { + pool.reserve_unsafe(1); + initialize(); + } + + //! Construct queue, allocate n nodes for the freelist. + explicit queue(std::size_t n) { + pool.reserve_unsafe(n + 1); + initialize(); + } + + //! \copydoc boost::lockfree::stack::reserve + void reserve(std::size_t n) { pool.reserve(n); } + + //! \copydoc boost::lockfree::stack::reserve_unsafe + void reserve_unsafe(std::size_t n) { pool.reserve_unsafe(n); } + + /** Destroys queue, free all nodes from freelist. + * */ + ~queue(void) { + if (!empty()) { + T dummy; + while (pop_unsafe(dummy)) + ; } - - /** Enqueues object t to the queue. Enqueueing may fail, - * if the freelist is not able to allocate a new queue node. - * - * \returns true, if the push operation is successful. - * - * \note Thread-safe and non-blocking - * \warning \b Warning: - * May block if node needs to be allocated from the operating system - * */ - bool push(T const & t) - { - node * n = pool.construct(t); - - if (n == nullptr) - return false; - - for (;;) { - tagged_node_ptr tail = tail_.load(memory_order_acquire); - tagged_node_ptr next = tail->next.load(memory_order_acquire); - node * next_ptr = next.get_ptr(); - - tagged_node_ptr tail2 = tail_.load(memory_order_acquire); - if (likely(tail == tail2)) { - if (next_ptr == 0) { - if ( tail->next.compare_exchange_weak(next, tagged_node_ptr(n, - next.get_tag() + 1)) ) { - tail_.compare_exchange_strong(tail, tagged_node_ptr(n, - tail.get_tag() + 1)); - return true; - } - } - else - tail_.compare_exchange_strong(tail, tagged_node_ptr(next_ptr, - tail.get_tag() + 1)); - } - } + pool.destruct(head_.load(memory_order_relaxed).get_ptr()); + } + + /** Check if the ringbuffer is empty + * + * \warning Not thread-safe, use for debugging purposes only + * */ + bool empty(void) { return head_.load().get_ptr() == tail_.load().get_ptr(); } + + /** Enqueues object t to the queue. Enqueueing may fail, + * if the freelist is not able to allocate a new queue node. + * + * \returns true, if the push operation is successful. + * + * \note Thread-safe and non-blocking + * \warning \b Warning: + * May block if node needs to be allocated from the operating system + * */ + bool push(T const& t) { + node* n = pool.construct(t); + + if (n == nullptr) return false; + + for (;;) { + tagged_node_ptr tail = tail_.load(memory_order_acquire); + tagged_node_ptr next = tail->next.load(memory_order_acquire); + node* next_ptr = next.get_ptr(); + + tagged_node_ptr tail2 = tail_.load(memory_order_acquire); + if (likely(tail == tail2)) { + if (next_ptr == 0) { + if (tail->next.compare_exchange_weak( + next, tagged_node_ptr(n, next.get_tag() + 1))) { + tail_.compare_exchange_strong( + tail, tagged_node_ptr(n, tail.get_tag() + 1)); + return true; + } + } else + tail_.compare_exchange_strong( + tail, tagged_node_ptr(next_ptr, tail.get_tag() + 1)); + } } - - /** Enqueues object t to the queue. Enqueueing may fail, - * if the freelist is not able to allocate a new queue node. - * - * \returns true, if the push operation is successful. - * - * \note Not thread-safe - * \warning \b Warning: May block if node needs to be - * allocated from the operating system - * */ - bool push_unsafe(T const & t) - { - node * n = pool.construct_unsafe(t); - - if (n == nullptr) - return false; - - for (;;) - { - tagged_node_ptr tail = tail_.load(memory_order_relaxed); - tagged_node_ptr next = tail->next.load(memory_order_relaxed); - node * next_ptr = next.get_ptr(); - - if (next_ptr == 0) { - tail->next.store(tagged_node_ptr(n, next.get_tag() + 1), + } + + /** Enqueues object t to the queue. Enqueueing may fail, + * if the freelist is not able to allocate a new queue node. + * + * \returns true, if the push operation is successful. + * + * \note Not thread-safe + * \warning \b Warning: May block if node needs to be + * allocated from the operating system + * */ + bool push_unsafe(T const& t) { + node* n = pool.construct_unsafe(t); + + if (n == nullptr) return false; + + for (;;) { + tagged_node_ptr tail = tail_.load(memory_order_relaxed); + tagged_node_ptr next = tail->next.load(memory_order_relaxed); + node* next_ptr = next.get_ptr(); + + if (next_ptr == 0) { + tail->next.store(tagged_node_ptr(n, next.get_tag() + 1), + memory_order_relaxed); + tail_.store(tagged_node_ptr(n, tail.get_tag() + 1), memory_order_relaxed); - tail_.store(tagged_node_ptr(n, tail.get_tag() + 1), + return true; + } else + tail_.store(tagged_node_ptr(next_ptr, tail.get_tag() + 1), memory_order_relaxed); - return true; - } - else - tail_.store(tagged_node_ptr(next_ptr, tail.get_tag() + 1), - memory_order_relaxed); - } } - - /** Dequeue object from queue. - * - * if pop operation is successful, - * object is written to memory location denoted by ret. - * - * \returns true, if the pop operation is successful, false if queue was empty. - * - * \note Thread-safe and non-blocking - * - * */ - bool pop (T & ret) - { - for (;;) { - tagged_node_ptr head = head_.load(memory_order_acquire); - tagged_node_ptr tail = tail_.load(memory_order_acquire); - tagged_node_ptr next = head->next.load(memory_order_acquire); - node * next_ptr = next.get_ptr(); - -// tagged_node_ptr head2 = head_.load(memory_order_acquire); -// if (likely(head == head2)) - { - if (head.get_ptr() == tail.get_ptr()) { - if (next_ptr == 0) - return false; - tail_.compare_exchange_strong(tail, tagged_node_ptr(next_ptr, - tail.get_tag() + 1)); - } else { - if (next_ptr == 0) - /* this check is not part of the original algorithm - * as published by michael and scott - * - * however we reuse the tagged_ptr part for the and - * clear the next part during node - * allocation. we can observe a null-pointer here. - * */ - continue; - ret = next_ptr->data; - if (head_.compare_exchange_weak(head, tagged_node_ptr(next_ptr, - head.get_tag() + 1))) { - pool.destruct(head.get_ptr()); - return true; - } - } - } + } + + /** Dequeue object from queue. + * + * if pop operation is successful, + * object is written to memory location denoted by ret. + * + * \returns true, if the pop operation is successful, false if queue was + * empty. + * + * \note Thread-safe and non-blocking + * + * */ + bool pop(T& ret) { + for (;;) { + tagged_node_ptr head = head_.load(memory_order_acquire); + tagged_node_ptr tail = tail_.load(memory_order_acquire); + tagged_node_ptr next = head->next.load(memory_order_acquire); + node* next_ptr = next.get_ptr(); + + // tagged_node_ptr head2 = head_.load(memory_order_acquire); + // if (likely(head == head2)) + { + if (head.get_ptr() == tail.get_ptr()) { + if (next_ptr == 0) return false; + tail_.compare_exchange_strong( + tail, tagged_node_ptr(next_ptr, tail.get_tag() + 1)); + } else { + if (next_ptr == 0) + /* this check is not part of the original algorithm + * as published by michael and scott + * + * however we reuse the tagged_ptr part for the and + * clear the next part during node + * allocation. we can observe a null-pointer here. + * */ + continue; + ret = next_ptr->data; + if (head_.compare_exchange_weak( + head, tagged_node_ptr(next_ptr, head.get_tag() + 1))) { + pool.destruct(head.get_ptr()); + return true; + } } + } } - - /** Dequeue object from queue. - * - * if pop operation is successful, - * object is written to memory location denoted by ret. - * - * \returns true, if the pop operation is successful, false if queue was empty. - * - * \note Not thread-safe - * - * */ - bool pop_unsafe (T & ret) - { - for (;;) { - tagged_node_ptr head = head_.load(memory_order_relaxed); - tagged_node_ptr tail = tail_.load(memory_order_relaxed); - tagged_node_ptr next = head->next.load(memory_order_relaxed); - node * next_ptr = next.get_ptr(); - -// tagged_node_ptr head2 = head_.load(memory_order_relaxed); - if (head.get_ptr() == tail.get_ptr()) { - if (next_ptr == 0) - return false; - tail_.store(tagged_node_ptr(next_ptr, tail.get_tag() + 1), + } + + /** Dequeue object from queue. + * + * if pop operation is successful, + * object is written to memory location denoted by ret. + * + * \returns true, if the pop operation is successful, false if queue was + * empty. + * + * \note Not thread-safe + * + * */ + bool pop_unsafe(T& ret) { + for (;;) { + tagged_node_ptr head = head_.load(memory_order_relaxed); + tagged_node_ptr tail = tail_.load(memory_order_relaxed); + tagged_node_ptr next = head->next.load(memory_order_relaxed); + node* next_ptr = next.get_ptr(); + + // tagged_node_ptr head2 = head_.load(memory_order_relaxed); + if (head.get_ptr() == tail.get_ptr()) { + if (next_ptr == 0) return false; + tail_.store(tagged_node_ptr(next_ptr, tail.get_tag() + 1), memory_order_relaxed); - } else { - if (next_ptr == 0) - /* this check is not part of the original algorithm as - * published by michael and scott - * - * however we reuse the tagged_ptr part for the and clear - * the next part during node - * allocation. we can observe a null-pointer here. - * */ - continue; - ret = next_ptr->data; - head_.store(tagged_node_ptr(next_ptr, head.get_tag() + 1), + } else { + if (next_ptr == 0) + /* this check is not part of the original algorithm as + * published by michael and scott + * + * however we reuse the tagged_ptr part for the and clear + * the next part during node + * allocation. we can observe a null-pointer here. + * */ + continue; + ret = next_ptr->data; + head_.store(tagged_node_ptr(next_ptr, head.get_tag() + 1), memory_order_relaxed); - pool.destruct_unsafe(head.get_ptr()); - return true; - } - } + pool.destruct_unsafe(head.get_ptr()); + return true; + } } + } - -private: + private: #ifndef BOOST_DOXYGEN_INVOKED - atomic head_; - static const int padding_size = BOOST_LOCKFREE_CACHELINE_BYTES - - sizeof(tagged_node_ptr); - char padding1[padding_size]; - atomic tail_; - char padding2[padding_size]; - - pool_t pool; + atomic head_; + static const int padding_size = + BOOST_LOCKFREE_CACHELINE_BYTES - sizeof(tagged_node_ptr); + char padding1[padding_size]; + atomic tail_; + char padding2[padding_size]; + + pool_t pool; #endif }; @@ -333,123 +302,99 @@ class queue: * freelist_t template argument. Two different * freelists can be used. struct caching_freelist_t selects a caching freelist, * which can allocate more nodes - * from the operating system, and struct static_freelist_t uses a fixed-sized freelist. - * With a fixed-sized - * freelist, the push operation may fail, while with a caching freelist, - * the push operation may block. + * from the operating system, and struct static_freelist_t uses a fixed-sized + * freelist. With a fixed-sized freelist, the push operation may fail, while + * with a caching freelist, the push operation may block. * - * \b Limitation: The class T is required to have a trivial assignment operator. + * \b Limitation: The class T is required to have a trivial assignment + * operator. * * */ -template - > -class queue: - public detail::queue -{ - BOOST_STATIC_ASSERT(boost::has_trivial_assign::value); - -public: - //! Construct queue. - queue(void) - {} - - //! Construct queue, allocate n nodes for the freelist. - explicit queue(std::size_t n): - detail::queue(n) - {} -}; +template > +class queue : public detail::queue { + BOOST_STATIC_ASSERT(boost::has_trivial_assign::value); + public: + //! Construct queue. + queue(void) {} + + //! Construct queue, allocate n nodes for the freelist. + explicit queue(std::size_t n) : detail::queue(n) {} +}; /** Template specialization of the queue class for pointer arguments, * that supports pop operations to * stl/boost-style smart pointers * * */ -template -class queue: - public detail::queue -{ +template +class queue + : public detail::queue { #ifndef BOOST_DOXYGEN_INVOKED - typedef detail::queue queue_t; + typedef detail::queue queue_t; - template - bool pop_smart_ptr(smart_ptr & ptr) - { - T * result = nullptr; - bool success = queue_t::pop(result); + template + bool pop_smart_ptr(smart_ptr& ptr) { + T* result = nullptr; + bool success = queue_t::pop(result); - if (success) - ptr.reset(result); - return success; - } + if (success) ptr.reset(result); + return success; + } #endif -public: - //! Construct queue. - queue(void) - {} - - //! Construct queue, allocate n nodes for the freelist. - explicit queue(std::size_t n): - queue_t(n) - {} - - //! \copydoc detail::queue::pop - bool pop (T * & ret) - { - return queue_t::pop(ret); - } - - /** Dequeue object from queue to std::auto_ptr - * - * if pop operation is successful, - * object is written to memory location denoted by ret. - * - * \returns true, if the pop operation is successful, false if queue was empty. - * - * \note Thread-safe and non-blocking - * - * */ - bool pop (std::auto_ptr & ret) - { - return pop_smart_ptr(ret); - } - - /** Dequeue object from queue to boost::scoped_ptr - * - * if pop operation is successful, - * object is written to memory location denoted by ret. - * - * \returns true, if the pop operation is successful, false if queue was empty. - * - * \note Thread-safe and non-blocking - * - * */ - bool pop (boost::scoped_ptr & ret) - { - BOOST_STATIC_ASSERT(sizeof(boost::scoped_ptr) == sizeof(T*)); - return pop(reinterpret_cast(ret)); - } - - /** Dequeue object from queue to std::shared_ptr - * - * if pop operation is successful, - * object is written to memory location denoted by ret. - * - * \returns true, if the pop operation is successful, false if queue was empty. - * - * \note Thread-safe and non-blocking - * - * */ - bool pop (std::shared_ptr & ret) - { - return pop_smart_ptr(ret); - } + public: + //! Construct queue. + queue(void) {} + + //! Construct queue, allocate n nodes for the freelist. + explicit queue(std::size_t n) : queue_t(n) {} + + //! \copydoc detail::queue::pop + bool pop(T*& ret) { return queue_t::pop(ret); } + + /** Dequeue object from queue to std::auto_ptr + * + * if pop operation is successful, + * object is written to memory location denoted by ret. + * + * \returns true, if the pop operation is successful, false if queue was + * empty. + * + * \note Thread-safe and non-blocking + * + * */ + bool pop(std::auto_ptr& ret) { return pop_smart_ptr(ret); } + + /** Dequeue object from queue to boost::scoped_ptr + * + * if pop operation is successful, + * object is written to memory location denoted by ret. + * + * \returns true, if the pop operation is successful, false if queue was + * empty. + * + * \note Thread-safe and non-blocking + * + * */ + bool pop(boost::scoped_ptr& ret) { + BOOST_STATIC_ASSERT(sizeof(boost::scoped_ptr) == sizeof(T*)); + return pop(reinterpret_cast(ret)); + } + + /** Dequeue object from queue to std::shared_ptr + * + * if pop operation is successful, + * object is written to memory location denoted by ret. + * + * \returns true, if the pop operation is successful, false if queue was + * empty. + * + * \note Thread-safe and non-blocking + * + * */ + bool pop(std::shared_ptr& ret) { return pop_smart_ptr(ret); } }; } /* namespace lockfree */ diff --git a/external/lockfree/boost/lockfree/ringbuffer.hpp b/external/lockfree/boost/lockfree/ringbuffer.hpp index 62355732..63a5d16d 100644 --- a/external/lockfree/boost/lockfree/ringbuffer.hpp +++ b/external/lockfree/boost/lockfree/ringbuffer.hpp @@ -24,521 +24,473 @@ #include #include -namespace boost -{ -namespace lockfree -{ +namespace boost { +namespace lockfree { -namespace detail -{ +namespace detail { template -class ringbuffer_base: - boost::noncopyable -{ +class ringbuffer_base : boost::noncopyable { #ifndef BOOST_DOXYGEN_INVOKED - typedef std::size_t size_t; - static const int padding_size = BOOST_LOCKFREE_CACHELINE_BYTES - sizeof(size_t); - atomic write_index_; - char padding1[padding_size]; - /* force read_index and write_index to different cache lines */ - atomic read_index_; - -protected: - ringbuffer_base(void): - write_index_(0), read_index_(0) - {} - - static size_t next_index(size_t arg, size_t max_size) - { - size_t ret = arg + 1; - while (unlikely(ret >= max_size)) - ret -= max_size; - return ret; - } + typedef std::size_t size_t; + static const int padding_size = + BOOST_LOCKFREE_CACHELINE_BYTES - sizeof(size_t); + atomic write_index_; + char padding1[padding_size]; + /* force read_index and write_index to different cache lines */ + atomic read_index_; - static size_t read_available(size_t write_index, size_t read_index, - size_t max_size) - { - if (write_index >= read_index) - return write_index - read_index; + protected: + ringbuffer_base(void) : write_index_(0), read_index_(0) {} - size_t ret = write_index + max_size - read_index; - return ret; - } + static size_t next_index(size_t arg, size_t max_size) { + size_t ret = arg + 1; + while (unlikely(ret >= max_size)) ret -= max_size; + return ret; + } - static size_t write_available(size_t write_index, size_t read_index, - size_t max_size) - { - size_t ret = read_index - write_index - 1; - if (write_index >= read_index) - ret += max_size; - return ret; - } + static size_t read_available(size_t write_index, size_t read_index, + size_t max_size) { + if (write_index >= read_index) return write_index - read_index; - bool enqueue(T const & t, T * buffer, size_t max_size) - { - size_t write_index = write_index_.load(memory_order_relaxed); - // only written from enqueue thread - size_t next = next_index(write_index, max_size); + size_t ret = write_index + max_size - read_index; + return ret; + } - if (next == read_index_.load(memory_order_acquire)) - return false; /* ringbuffer is full */ + static size_t write_available(size_t write_index, size_t read_index, + size_t max_size) { + size_t ret = read_index - write_index - 1; + if (write_index >= read_index) ret += max_size; + return ret; + } - buffer[write_index] = t; + bool enqueue(T const& t, T* buffer, size_t max_size) { + size_t write_index = write_index_.load(memory_order_relaxed); + // only written from enqueue thread + size_t next = next_index(write_index, max_size); - write_index_.store(next, memory_order_release); + if (next == read_index_.load(memory_order_acquire)) + return false; /* ringbuffer is full */ - return true; - } + buffer[write_index] = t; - size_t enqueue(const T * input_buffer, size_t input_count, - T * internal_buffer, size_t max_size) - { - size_t write_index = write_index_.load(memory_order_relaxed); - // only written from enqueue thread - const size_t read_index = read_index_.load(memory_order_acquire); - const size_t avail = write_available(write_index, read_index, max_size); + write_index_.store(next, memory_order_release); - if (avail == 0) - return 0; + return true; + } - input_count = (std::min)(input_count, avail); + size_t enqueue(const T* input_buffer, size_t input_count, T* internal_buffer, + size_t max_size) { + size_t write_index = write_index_.load(memory_order_relaxed); + // only written from enqueue thread + const size_t read_index = read_index_.load(memory_order_acquire); + const size_t avail = write_available(write_index, read_index, max_size); - size_t new_write_index = write_index + input_count; + if (avail == 0) return 0; - if (write_index + input_count > max_size) { - /* copy data in two sections */ - size_t count0 = max_size - write_index; + input_count = (std::min)(input_count, avail); - std::copy(input_buffer, input_buffer + count0, + size_t new_write_index = write_index + input_count; + + if (write_index + input_count > max_size) { + /* copy data in two sections */ + size_t count0 = max_size - write_index; + + std::copy(input_buffer, input_buffer + count0, internal_buffer + write_index); - std::copy(input_buffer + count0, input_buffer + input_count, + std::copy(input_buffer + count0, input_buffer + input_count, internal_buffer); - new_write_index -= max_size; - } else { - std::copy(input_buffer, input_buffer + input_count, + new_write_index -= max_size; + } else { + std::copy(input_buffer, input_buffer + input_count, internal_buffer + write_index); - if (new_write_index == max_size) - new_write_index = 0; - } - - write_index_.store(new_write_index, memory_order_release); - return input_count; + if (new_write_index == max_size) new_write_index = 0; } - template - ConstIterator enqueue(ConstIterator begin, ConstIterator end, - T * internal_buffer, size_t max_size) - { - // FIXME: avoid std::distance and std::advance + write_index_.store(new_write_index, memory_order_release); + return input_count; + } - size_t write_index = write_index_.load(memory_order_relaxed); - // only written from enqueue thread - const size_t read_index = read_index_.load(memory_order_acquire); - const size_t avail = write_available(write_index, read_index, max_size); + template + ConstIterator enqueue(ConstIterator begin, ConstIterator end, + T* internal_buffer, size_t max_size) { + // FIXME: avoid std::distance and std::advance - if (avail == 0) - return begin; + size_t write_index = write_index_.load(memory_order_relaxed); + // only written from enqueue thread + const size_t read_index = read_index_.load(memory_order_acquire); + const size_t avail = write_available(write_index, read_index, max_size); - size_t input_count = std::distance(begin, end); - input_count = (std::min)(input_count, avail); + if (avail == 0) return begin; - size_t new_write_index = write_index + input_count; + size_t input_count = std::distance(begin, end); + input_count = (std::min)(input_count, avail); - ConstIterator last = begin; - std::advance(last, input_count); + size_t new_write_index = write_index + input_count; - if (write_index + input_count > max_size) { - /* copy data in two sections */ - size_t count0 = max_size - write_index; - ConstIterator midpoint = begin; - std::advance(midpoint, count0); + ConstIterator last = begin; + std::advance(last, input_count); - std::copy(begin, midpoint, internal_buffer + write_index); - std::copy(midpoint, last, internal_buffer); - new_write_index -= max_size; - } else { - std::copy(begin, last, internal_buffer + write_index); + if (write_index + input_count > max_size) { + /* copy data in two sections */ + size_t count0 = max_size - write_index; + ConstIterator midpoint = begin; + std::advance(midpoint, count0); - if (new_write_index == max_size) - new_write_index = 0; - } + std::copy(begin, midpoint, internal_buffer + write_index); + std::copy(midpoint, last, internal_buffer); + new_write_index -= max_size; + } else { + std::copy(begin, last, internal_buffer + write_index); - write_index_.store(new_write_index, memory_order_release); - return last; + if (new_write_index == max_size) new_write_index = 0; } - bool dequeue (T & ret, T * buffer, size_t max_size) - { - size_t write_index = write_index_.load(memory_order_acquire); - size_t read_index = read_index_.load(memory_order_relaxed); - // only written from dequeue thread - if (empty(write_index, read_index)) - return false; - - ret = buffer[read_index]; - size_t next = next_index(read_index, max_size); - read_index_.store(next, memory_order_release); - return true; - } + write_index_.store(new_write_index, memory_order_release); + return last; + } - size_t dequeue (T * output_buffer, size_t output_count, - const T * internal_buffer, size_t max_size) - { - const size_t write_index = write_index_.load(memory_order_acquire); - size_t read_index = read_index_.load(memory_order_relaxed); - // only written from dequeue thread + bool dequeue(T& ret, T* buffer, size_t max_size) { + size_t write_index = write_index_.load(memory_order_acquire); + size_t read_index = read_index_.load(memory_order_relaxed); + // only written from dequeue thread + if (empty(write_index, read_index)) return false; - const size_t avail = read_available(write_index, read_index, max_size); + ret = buffer[read_index]; + size_t next = next_index(read_index, max_size); + read_index_.store(next, memory_order_release); + return true; + } - if (avail == 0) - return 0; + size_t dequeue(T* output_buffer, size_t output_count, + const T* internal_buffer, size_t max_size) { + const size_t write_index = write_index_.load(memory_order_acquire); + size_t read_index = read_index_.load(memory_order_relaxed); + // only written from dequeue thread - output_count = (std::min)(output_count, avail); + const size_t avail = read_available(write_index, read_index, max_size); - size_t new_read_index = read_index + output_count; + if (avail == 0) return 0; - if (read_index + output_count > max_size) { - /* copy data in two sections */ - size_t count0 = max_size - read_index; - size_t count1 = output_count - count0; + output_count = (std::min)(output_count, avail); - std::copy(internal_buffer + read_index, - internal_buffer + max_size, output_buffer); - std::copy(internal_buffer, internal_buffer + count1, output_buffer + count0); + size_t new_read_index = read_index + output_count; - new_read_index -= max_size; - } else { - std::copy(internal_buffer + read_index, - internal_buffer + read_index + output_count, output_buffer); - if (new_read_index == max_size) - new_read_index = 0; - } + if (read_index + output_count > max_size) { + /* copy data in two sections */ + size_t count0 = max_size - read_index; + size_t count1 = output_count - count0; - read_index_.store(new_read_index, memory_order_release); - return output_count; + std::copy(internal_buffer + read_index, internal_buffer + max_size, + output_buffer); + std::copy(internal_buffer, internal_buffer + count1, + output_buffer + count0); + + new_read_index -= max_size; + } else { + std::copy(internal_buffer + read_index, + internal_buffer + read_index + output_count, output_buffer); + if (new_read_index == max_size) new_read_index = 0; } - template - size_t dequeue (OutputIterator it, const T * internal_buffer, size_t max_size) - { - const size_t write_index = write_index_.load(memory_order_acquire); - size_t read_index = read_index_.load(memory_order_relaxed); - // only written from dequeue thread + read_index_.store(new_read_index, memory_order_release); + return output_count; + } - const size_t avail = read_available(write_index, read_index, max_size); - if (avail == 0) - return 0; + template + size_t dequeue(OutputIterator it, const T* internal_buffer, size_t max_size) { + const size_t write_index = write_index_.load(memory_order_acquire); + size_t read_index = read_index_.load(memory_order_relaxed); + // only written from dequeue thread - size_t new_read_index = read_index + avail; + const size_t avail = read_available(write_index, read_index, max_size); + if (avail == 0) return 0; - if (read_index + avail > max_size) { - /* copy data in two sections */ - size_t count0 = max_size - read_index; - size_t count1 = avail - count0; + size_t new_read_index = read_index + avail; - std::copy(internal_buffer + read_index, internal_buffer + max_size, it); - std::copy(internal_buffer, internal_buffer + count1, it); + if (read_index + avail > max_size) { + /* copy data in two sections */ + size_t count0 = max_size - read_index; + size_t count1 = avail - count0; - new_read_index -= max_size; - } else { - std::copy(internal_buffer + read_index, - internal_buffer + read_index + avail, it); - if (new_read_index == max_size) - new_read_index = 0; - } + std::copy(internal_buffer + read_index, internal_buffer + max_size, it); + std::copy(internal_buffer, internal_buffer + count1, it); - read_index_.store(new_read_index, memory_order_release); - return avail; - } -#endif - - -public: - /** reset the ringbuffer - * - * \warning Not thread-safe, use for debugging purposes only - * */ - void reset(void) - { - write_index_.store(0, memory_order_relaxed); - read_index_.store(0, memory_order_release); - } - - /** Check if the ringbuffer is empty - * - * \warning Not thread-safe, use for debugging purposes only - * */ - bool empty(void) - { - return empty(write_index_.load(memory_order_relaxed), - read_index_.load(memory_order_relaxed)); + new_read_index -= max_size; + } else { + std::copy(internal_buffer + read_index, + internal_buffer + read_index + avail, it); + if (new_read_index == max_size) new_read_index = 0; } - //! \copydoc boost::lockfree::stack::is_lock_free - bool is_lock_free(void) const - { - return write_index_.is_lock_free() && read_index_.is_lock_free(); - } + read_index_.store(new_read_index, memory_order_release); + return avail; + } +#endif -private: - bool empty(size_t write_index, size_t read_index) - { - return write_index == read_index; - } + public: + /** reset the ringbuffer + * + * \warning Not thread-safe, use for debugging purposes only + * */ + void reset(void) { + write_index_.store(0, memory_order_relaxed); + read_index_.store(0, memory_order_release); + } + + /** Check if the ringbuffer is empty + * + * \warning Not thread-safe, use for debugging purposes only + * */ + bool empty(void) { + return empty(write_index_.load(memory_order_relaxed), + read_index_.load(memory_order_relaxed)); + } + + //! \copydoc boost::lockfree::stack::is_lock_free + bool is_lock_free(void) const { + return write_index_.is_lock_free() && read_index_.is_lock_free(); + } + + private: + bool empty(size_t write_index, size_t read_index) { + return write_index == read_index; + } }; } /* namespace detail */ template -class ringbuffer: - public detail::ringbuffer_base -{ - typedef std::size_t size_t; - boost::array array_; - -public: - /** Enqueues object t to the ringbuffer. - * Enqueueing may fail, if the ringbuffer is full. - * - * \return true, if the enqueue operation is successful. - * - * \note Thread-safe and non-blocking - * */ - bool enqueue(T const & t) - { - return detail::ringbuffer_base::enqueue(t, array_.c_array(), max_size); - } - - /** Dequeue object from ringbuffer. - * - * If dequeue operation is successful, - * object is written to memory location denoted by ret. - * - * \return true, if the dequeue operation is successful, - * false if ringbuffer was empty. - * - * \note Thread-safe and non-blocking - */ - bool dequeue(T & ret) - { - return detail::ringbuffer_base::dequeue(ret, array_.c_array(), max_size); - } - - /** Enqueues size objects from the array t to the ringbuffer. - * - * Will enqueue as many objects as there is space available - * - * \Returns number of enqueued items - * - * \note Thread-safe and non-blocking - */ - size_t enqueue(T const * t, size_t size) - { - return detail::ringbuffer_base::enqueue(t, size, array_.c_array(), max_size); - } - - /** Enqueues all objects from the array t to the ringbuffer. - * - * Will enqueue as many objects as there is space available - * - * \Returns number of enqueued items - * - * \note Thread-safe and non-blocking - */ - template - size_t enqueue(T const (&t)[size]) - { - return enqueue(t, size); - } - - /** Enqueues size objects from the iterator range [begin, end[ to the ringbuffer. - * - * Enqueueing may fail, if the ringbuffer is full. - * - * \return iterator to the first element, which has not been enqueued - * - * \note Thread-safe and non-blocking - */ - template - ConstIterator enqueue(ConstIterator begin, ConstIterator end) - { - return detail::ringbuffer_base::enqueue(begin, end, - array_.c_array(), max_size); - } - - /** Dequeue a maximum of size objects from ringbuffer. - * - * If dequeue operation is successful, - * object is written to memory location denoted by ret. - * - * \return number of dequeued items - * - * \note Thread-safe and non-blocking - * */ - /* @{ */ - size_t dequeue(T * ret, size_t size) - { - return detail::ringbuffer_base::dequeue(ret, size, - array_.c_array(), max_size); - } - - /** Enqueues all objects from the array t to the ringbuffer. - * - * Will enqueue as many objects as there is space available - * - * \Returns number of enqueued items - * - * \note Thread-safe and non-blocking - */ - template - size_t dequeue(T (&t)[size]) - { - return dequeue(t, size); - } - - /** Dequeue objects to the output iterator it - * - * \return number of dequeued items - * - * \note Thread-safe and non-blocking - * */ - template - size_t dequeue(OutputIterator it) - { - return detail::ringbuffer_base::dequeue(it, array_.c_array(), max_size); - } +class ringbuffer : public detail::ringbuffer_base { + typedef std::size_t size_t; + boost::array array_; + + public: + /** Enqueues object t to the ringbuffer. + * Enqueueing may fail, if the ringbuffer is full. + * + * \return true, if the enqueue operation is successful. + * + * \note Thread-safe and non-blocking + * */ + bool enqueue(T const& t) { + return detail::ringbuffer_base::enqueue(t, array_.c_array(), max_size); + } + + /** Dequeue object from ringbuffer. + * + * If dequeue operation is successful, + * object is written to memory location denoted by ret. + * + * \return true, if the dequeue operation is successful, + * false if ringbuffer was empty. + * + * \note Thread-safe and non-blocking + */ + bool dequeue(T& ret) { + return detail::ringbuffer_base::dequeue(ret, array_.c_array(), max_size); + } + + /** Enqueues size objects from the array t to the ringbuffer. + * + * Will enqueue as many objects as there is space available + * + * \Returns number of enqueued items + * + * \note Thread-safe and non-blocking + */ + size_t enqueue(T const* t, size_t size) { + return detail::ringbuffer_base::enqueue(t, size, array_.c_array(), + max_size); + } + + /** Enqueues all objects from the array t to the ringbuffer. + * + * Will enqueue as many objects as there is space available + * + * \Returns number of enqueued items + * + * \note Thread-safe and non-blocking + */ + template + size_t enqueue(T const (&t)[size]) { + return enqueue(t, size); + } + + /** Enqueues size objects from the iterator range [begin, end[ to the + * ringbuffer. + * + * Enqueueing may fail, if the ringbuffer is full. + * + * \return iterator to the first element, which has not been enqueued + * + * \note Thread-safe and non-blocking + */ + template + ConstIterator enqueue(ConstIterator begin, ConstIterator end) { + return detail::ringbuffer_base::enqueue(begin, end, array_.c_array(), + max_size); + } + + /** Dequeue a maximum of size objects from ringbuffer. + * + * If dequeue operation is successful, + * object is written to memory location denoted by ret. + * + * \return number of dequeued items + * + * \note Thread-safe and non-blocking + * */ + /* @{ */ + size_t dequeue(T* ret, size_t size) { + return detail::ringbuffer_base::dequeue(ret, size, array_.c_array(), + max_size); + } + + /** Enqueues all objects from the array t to the ringbuffer. + * + * Will enqueue as many objects as there is space available + * + * \Returns number of enqueued items + * + * \note Thread-safe and non-blocking + */ + template + size_t dequeue(T (&t)[size]) { + return dequeue(t, size); + } + + /** Dequeue objects to the output iterator it + * + * \return number of dequeued items + * + * \note Thread-safe and non-blocking + * */ + template + size_t dequeue(OutputIterator it) { + return detail::ringbuffer_base::dequeue(it, array_.c_array(), max_size); + } }; template -class ringbuffer: - public detail::ringbuffer_base -{ - typedef std::size_t size_t; - size_t max_size_; - scoped_array array_; - -public: - //! Constructs a ringbuffer for max_size elements - explicit ringbuffer(size_t max_size): - max_size_(max_size), array_(new T[max_size]) - {} - - /** Enqueues object t to the ringbuffer. - * Enqueueing may fail, if the ringbuffer is full. - * - * \return true, if the enqueue operation is successful. - * - * \note Thread-safe and non-blocking - * */ - bool enqueue(T const & t) - { - return detail::ringbuffer_base::enqueue(t, array_.get(), max_size_); - } - - /** Dequeue object from ringbuffer. - * - * If dequeue operation is successful, - * object is written to memory location denoted by ret. - * - * \return true, if the dequeue operation is successful, - * false if ringbuffer was empty. - * - * \note Thread-safe and non-blocking - */ - bool dequeue(T & ret) - { - return detail::ringbuffer_base::dequeue(ret, array_.get(), max_size_); - } - - /** Enqueues size objects from the array t to the ringbuffer. - * - * Will enqueue as many objects as there is space available - * - * \Returns number of enqueued items - * - * \note Thread-safe and non-blocking - */ - size_t enqueue(T const * t, size_t size) - { - return detail::ringbuffer_base::enqueue(t, size, array_.get(), max_size_); - } - - /** Enqueues all objects from the array t to the ringbuffer. - * - * Will enqueue as many objects as there is space available - * - * \Returns number of enqueued items - * - * \note Thread-safe and non-blocking - */ - template - size_t enqueue(T const (&t)[size]) - { - return enqueue(t, size); - } - - /** Enqueues size objects from the iterator range [begin, end[ to the ringbuffer. - * - * Enqueueing may fail, if the ringbuffer is full. - * - * \return iterator to the first element, which has not been enqueued - * - * \note Thread-safe and non-blocking - */ - template - ConstIterator enqueue(ConstIterator begin, ConstIterator end) - { - return detail::ringbuffer_base::enqueue(begin, end, array_.get(), max_size_); - } - - /** Dequeue a maximum of size objects from ringbuffer. - * - * If dequeue operation is successful, object is written to memory - * location denoted by ret. - * - * \return number of dequeued items - * - * \note Thread-safe and non-blocking - * */ - size_t dequeue(T * ret, size_t size) - { - return detail::ringbuffer_base::dequeue(ret, size, array_.get(), max_size_); - } - - /** Dequeue objects from ringbuffer. - * - * If dequeue operation is successful, - object is written to memory location denoted by ret. - * - * \return number of dequeued items - * - * \note Thread-safe and non-blocking - * */ - template - size_t dequeue(T (&t)[size]) - { - return dequeue(t, size); - } - - /** Dequeue objects to the output iterator it - * - * \return number of dequeued items - * - * \note Thread-safe and non-blocking - * */ - template - size_t dequeue(OutputIterator it) - { - return detail::ringbuffer_base::dequeue(it, array_.get(), max_size_); - } +class ringbuffer : public detail::ringbuffer_base { + typedef std::size_t size_t; + size_t max_size_; + scoped_array array_; + + public: + //! Constructs a ringbuffer for max_size elements + explicit ringbuffer(size_t max_size) + : max_size_(max_size), array_(new T[max_size]) {} + + /** Enqueues object t to the ringbuffer. + * Enqueueing may fail, if the ringbuffer is full. + * + * \return true, if the enqueue operation is successful. + * + * \note Thread-safe and non-blocking + * */ + bool enqueue(T const& t) { + return detail::ringbuffer_base::enqueue(t, array_.get(), max_size_); + } + + /** Dequeue object from ringbuffer. + * + * If dequeue operation is successful, + * object is written to memory location denoted by ret. + * + * \return true, if the dequeue operation is successful, + * false if ringbuffer was empty. + * + * \note Thread-safe and non-blocking + */ + bool dequeue(T& ret) { + return detail::ringbuffer_base::dequeue(ret, array_.get(), max_size_); + } + + /** Enqueues size objects from the array t to the ringbuffer. + * + * Will enqueue as many objects as there is space available + * + * \Returns number of enqueued items + * + * \note Thread-safe and non-blocking + */ + size_t enqueue(T const* t, size_t size) { + return detail::ringbuffer_base::enqueue(t, size, array_.get(), + max_size_); + } + + /** Enqueues all objects from the array t to the ringbuffer. + * + * Will enqueue as many objects as there is space available + * + * \Returns number of enqueued items + * + * \note Thread-safe and non-blocking + */ + template + size_t enqueue(T const (&t)[size]) { + return enqueue(t, size); + } + + /** Enqueues size objects from the iterator range [begin, end[ to the + * ringbuffer. + * + * Enqueueing may fail, if the ringbuffer is full. + * + * \return iterator to the first element, which has not been enqueued + * + * \note Thread-safe and non-blocking + */ + template + ConstIterator enqueue(ConstIterator begin, ConstIterator end) { + return detail::ringbuffer_base::enqueue(begin, end, array_.get(), + max_size_); + } + + /** Dequeue a maximum of size objects from ringbuffer. + * + * If dequeue operation is successful, object is written to memory + * location denoted by ret. + * + * \return number of dequeued items + * + * \note Thread-safe and non-blocking + * */ + size_t dequeue(T* ret, size_t size) { + return detail::ringbuffer_base::dequeue(ret, size, array_.get(), + max_size_); + } + + /** Dequeue objects from ringbuffer. + * + * If dequeue operation is successful, + object is written to memory location denoted by ret. + * + * \return number of dequeued items + * + * \note Thread-safe and non-blocking + * */ + template + size_t dequeue(T (&t)[size]) { + return dequeue(t, size); + } + + /** Dequeue objects to the output iterator it + * + * \return number of dequeued items + * + * \note Thread-safe and non-blocking + * */ + template + size_t dequeue(OutputIterator it) { + return detail::ringbuffer_base::dequeue(it, array_.get(), max_size_); + } }; - } /* namespace lockfree */ } /* namespace boost */ - #endif /* BOOST_LOCKFREE_RINGBUFFER_HPP_INCLUDED */ diff --git a/external/lockfree/boost/lockfree/stack.hpp b/external/lockfree/boost/lockfree/stack.hpp index e0a8881f..30923413 100644 --- a/external/lockfree/boost/lockfree/stack.hpp +++ b/external/lockfree/boost/lockfree/stack.hpp @@ -35,228 +35,198 @@ namespace lockfree { * its freelist_t template argument. Two different * freelists can be used. struct caching_freelist_t selects a caching freelist, * which can allocate more nodes - * from the operating system, and struct static_freelist_t uses a fixed-sized freelist. - * With a fixed-sized - * freelist, the push operation may fail, while with a caching freelist, - * the push operation may block. + * from the operating system, and struct static_freelist_t uses a fixed-sized + * freelist. With a fixed-sized freelist, the push operation may fail, while + * with a caching freelist, the push operation may block. * - * \b Limitation: The class T is required to have a trivial assignment operator. + * \b Limitation: The class T is required to have a trivial assignment + * operator. * */ -template - > -class stack: - boost::noncopyable -{ -private: - BOOST_STATIC_ASSERT(boost::has_trivial_assign::value); +template > +class stack : boost::noncopyable { + private: + BOOST_STATIC_ASSERT(boost::has_trivial_assign::value); #ifndef BOOST_DOXYGEN_INVOKED - struct node - { - typedef detail::tagged_ptr tagged_node_ptr; - - node(T const & v): - v(v) - {} - - tagged_node_ptr next; - T v; - }; -#endif - + struct node { typedef detail::tagged_ptr tagged_node_ptr; - typedef typename Alloc::template rebind::other node_allocator; - - typedef typename boost::mpl::if_, - detail::freelist_stack, - detail::freelist_stack - >::type pool_t; - -public: - /** - * \return true, if implementation is lock-free. - * */ - bool is_lock_free (void) const - { - return tos.is_lock_free() && pool.is_lock_free(); - } - - //! Construct stack. - stack(void): - tos(tagged_node_ptr(nullptr, 0)) - {} - - //! Construct stack, allocate n nodes for the freelist - explicit stack(std::size_t n): - tos(tagged_node_ptr(nullptr, 0)) - { - pool.reserve_unsafe(n); - } - - //! Allocate n nodes for freelist - void reserve(std::size_t n) - { - pool.reserve(n); - } - - /** \copydoc boost::lockfree::stack::reserve - * - * \note not thread-safe - * - * */ - void reserve_unsafe(std::size_t n) - { - pool.reserve_unsafe(n); - } + node(T const& v) : v(v) {} - /** Destroys stack, free all nodes from freelist. - * - * \note not thread-safe - * - * */ - ~stack(void) - { - if (!empty()) { - T dummy; - while(pop_unsafe(dummy)) - ; - } - } + tagged_node_ptr next; + T v; + }; +#endif - /** Pushes object t to the queue. - * May fail, if the freelist is not able to allocate a new queue node. - * - * \returns true, if the push operation is successful. - * - * \note Thread-safe and non-blocking - * \warning \b Warning: - * May block if node needs to be allocated from the operating system - * */ - bool push(T const & v) - { - node * newnode = pool.construct(v); - - if (newnode == 0) - return false; - - tagged_node_ptr old_tos = tos.load(detail::memory_order_relaxed); - - for (;;) { - tagged_node_ptr new_tos (newnode, old_tos.get_tag()); - newnode->next.set_ptr(old_tos.get_ptr()); - - if (tos.compare_exchange_weak(old_tos, new_tos)) - return true; - } + typedef detail::tagged_ptr tagged_node_ptr; + + typedef typename Alloc::template rebind::other node_allocator; + + typedef typename boost::mpl::if_< + boost::is_same, + detail::freelist_stack, + detail::freelist_stack >::type pool_t; + + public: + /** + * \return true, if implementation is lock-free. + * */ + bool is_lock_free(void) const { + return tos.is_lock_free() && pool.is_lock_free(); + } + + //! Construct stack. + stack(void) : tos(tagged_node_ptr(nullptr, 0)) {} + + //! Construct stack, allocate n nodes for the freelist + explicit stack(std::size_t n) : tos(tagged_node_ptr(nullptr, 0)) { + pool.reserve_unsafe(n); + } + + //! Allocate n nodes for freelist + void reserve(std::size_t n) { pool.reserve(n); } + + /** \copydoc boost::lockfree::stack::reserve + * + * \note not thread-safe + * + * */ + void reserve_unsafe(std::size_t n) { pool.reserve_unsafe(n); } + + /** Destroys stack, free all nodes from freelist. + * + * \note not thread-safe + * + * */ + ~stack(void) { + if (!empty()) { + T dummy; + while (pop_unsafe(dummy)) + ; } + } - /** Pushes object t to the queue. - * May fail, if the freelist is not able to allocate a new queue node. - * - * \returns true, if the push operation is successful. - * - * \note Not thread-safe - * \warning \b Warning: - * May block if node needs to be allocated from the operating system - * */ - bool push_unsafe(T const & v) - { - node * newnode = pool.construct_unsafe(v); - - if (newnode == 0) - return false; - - tagged_node_ptr old_tos = tos.load(detail::memory_order_relaxed); + /** Pushes object t to the queue. + * May fail, if the freelist is not able to allocate a new queue node. + * + * \returns true, if the push operation is successful. + * + * \note Thread-safe and non-blocking + * \warning \b Warning: + * May block if node needs to be allocated from the operating system + * */ + bool push(T const& v) { + node* newnode = pool.construct(v); - tagged_node_ptr new_tos (newnode, old_tos.get_tag()); - newnode->next.set_ptr(old_tos.get_ptr()); + if (newnode == 0) return false; - tos.store(new_tos, memory_order_relaxed); - return true; - } + tagged_node_ptr old_tos = tos.load(detail::memory_order_relaxed); + for (;;) { + tagged_node_ptr new_tos(newnode, old_tos.get_tag()); + newnode->next.set_ptr(old_tos.get_ptr()); - /** Pops object from stack. - * - * If pop operation is successful, - * object is written to memory location denoted by ret. - * - * \returns true, if the pop operation is successful, false if stack was empty. - * - * \note Thread-safe and non-blocking - * - * */ - bool pop(T & ret) - { - tagged_node_ptr old_tos = tos.load(detail::memory_order_consume); - - for (;;) { - if (!old_tos.get_ptr()) - return false; - - node * new_tos_ptr = old_tos->next.get_ptr(); - tagged_node_ptr new_tos(new_tos_ptr, old_tos.get_tag() + 1); - - if (tos.compare_exchange_weak(old_tos, new_tos)) { - ret = old_tos->v; - pool.destruct(old_tos.get_ptr()); - return true; - } - } + if (tos.compare_exchange_weak(old_tos, new_tos)) return true; } - - /** Pops object from stack. - * - * If pop operation is successful, - * object is written to memory location denoted by ret. - * - * \returns true, if the pop operation is successful, false if stack was empty. - * - * \note Not thread-safe - * - * */ - bool pop_unsafe(T & ret) - { - tagged_node_ptr old_tos = tos.load(detail::memory_order_relaxed); - - if (!old_tos.get_ptr()) - return false; - - node * new_tos_ptr = old_tos->next.get_ptr(); - tagged_node_ptr new_tos(new_tos_ptr, old_tos.get_tag() + 1); - - tos.store(new_tos, memory_order_relaxed); + } + + /** Pushes object t to the queue. + * May fail, if the freelist is not able to allocate a new queue node. + * + * \returns true, if the push operation is successful. + * + * \note Not thread-safe + * \warning \b Warning: + * May block if node needs to be allocated from the operating system + * */ + bool push_unsafe(T const& v) { + node* newnode = pool.construct_unsafe(v); + + if (newnode == 0) return false; + + tagged_node_ptr old_tos = tos.load(detail::memory_order_relaxed); + + tagged_node_ptr new_tos(newnode, old_tos.get_tag()); + newnode->next.set_ptr(old_tos.get_ptr()); + + tos.store(new_tos, memory_order_relaxed); + return true; + } + + /** Pops object from stack. + * + * If pop operation is successful, + * object is written to memory location denoted by ret. + * + * \returns true, if the pop operation is successful, false if stack was + * empty. + * + * \note Thread-safe and non-blocking + * + * */ + bool pop(T& ret) { + tagged_node_ptr old_tos = tos.load(detail::memory_order_consume); + + for (;;) { + if (!old_tos.get_ptr()) return false; + + node* new_tos_ptr = old_tos->next.get_ptr(); + tagged_node_ptr new_tos(new_tos_ptr, old_tos.get_tag() + 1); + + if (tos.compare_exchange_weak(old_tos, new_tos)) { ret = old_tos->v; - pool.destruct_unsafe(old_tos.get_ptr()); + pool.destruct(old_tos.get_ptr()); return true; + } } - - /** - * \return true, if stack is empty. - * - * \warning The state of the stack can be modified by other threads - * - * \note While this function is thread-safe, - * it only guarantees that at some point during the execution of the function the - * stack has been empty - * */ - bool empty(void) const - { - return tos.load().get_ptr() == nullptr; - } - -private: + } + + /** Pops object from stack. + * + * If pop operation is successful, + * object is written to memory location denoted by ret. + * + * \returns true, if the pop operation is successful, false if stack was + * empty. + * + * \note Not thread-safe + * + * */ + bool pop_unsafe(T& ret) { + tagged_node_ptr old_tos = tos.load(detail::memory_order_relaxed); + + if (!old_tos.get_ptr()) return false; + + node* new_tos_ptr = old_tos->next.get_ptr(); + tagged_node_ptr new_tos(new_tos_ptr, old_tos.get_tag() + 1); + + tos.store(new_tos, memory_order_relaxed); + ret = old_tos->v; + pool.destruct_unsafe(old_tos.get_ptr()); + return true; + } + + /** + * \return true, if stack is empty. + * + * \warning The state of the stack can be modified by other threads + * + * \note While this function is thread-safe, + * it only guarantees that at some point during the execution of the function + * the stack has been empty + * */ + bool empty(void) const { return tos.load().get_ptr() == nullptr; } + + private: #ifndef BOOST_DOXYGEN_INVOKED - detail::atomic tos; + detail::atomic tos; - static const int padding_size = BOOST_LOCKFREE_CACHELINE_BYTES - - sizeof(tagged_node_ptr); - char padding[padding_size]; + static const int padding_size = + BOOST_LOCKFREE_CACHELINE_BYTES - sizeof(tagged_node_ptr); + char padding[padding_size]; - pool_t pool; + pool_t pool; #endif }; diff --git a/format.sh b/format.sh new file mode 100755 index 00000000..cd4f1ad4 --- /dev/null +++ b/format.sh @@ -0,0 +1,22 @@ +#!/bin/bash + +############################################################################## +# Copyright (c) 2019 Prashant K. Jha +# Copyright (c) 2019 Patrick Diehl +# +# Distributed under the Boost Software License, Version 1.0. (See accompanying +# file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +############################################################################## + +if [ ! -f ".clang-format" ]; then + + echo "Generate clang format configuration file" + clang-format -style=Google -dump-config > .clang-format + sed -i 's/SortIncludes: true/SortIncludes: false/g' .clang-format +fi + +echo "Formating header files" +find -name "*.hpp" -exec clang-format -i "{}" ";" + +echo "Formating source files" +find -name "*.cpp" -exec clang-format -i "{}" ";" diff --git a/hpxcl/cuda.hpp b/hpxcl/cuda.hpp index 3bec4d9f..1a6aefb3 100644 --- a/hpxcl/cuda.hpp +++ b/hpxcl/cuda.hpp @@ -2,7 +2,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #ifndef HPX_CUDA_HPP_ #define HPX_CUDA_HPP_ #include "cuda/device.hpp" @@ -21,7 +20,6 @@ * Example for building a kernel */ - /** * \example build_kernel_from_file.cpp * Example for for building a kernel from a file diff --git a/hpxcl/opencl.hpp b/hpxcl/opencl.hpp index 57e50459..c1b3b85d 100644 --- a/hpxcl/opencl.hpp +++ b/hpxcl/opencl.hpp @@ -7,11 +7,11 @@ #ifndef HPX_OPENCL_HPP_ #define HPX_OPENCL_HPP_ - #include "opencl/device.hpp" - #include "opencl/create_devices.hpp" - #include "opencl/buffer.hpp" - #include "opencl/program.hpp" - #include "opencl/kernel.hpp" +#include "opencl/device.hpp" +#include "opencl/create_devices.hpp" +#include "opencl/buffer.hpp" +#include "opencl/program.hpp" +#include "opencl/kernel.hpp" #endif diff --git a/opencl/buffer.cpp b/opencl/buffer.cpp index 8f87d6ae..c2c23e35 100644 --- a/opencl/buffer.cpp +++ b/opencl/buffer.cpp @@ -14,120 +14,91 @@ using hpx::opencl::buffer; -hpx::future -buffer::size() const -{ +hpx::future buffer::size() const { + HPX_ASSERT(this->get_id()); + typedef hpx::opencl::server::buffer::size_action func; - HPX_ASSERT(this->get_id()); - typedef hpx::opencl::server::buffer::size_action func; - - return hpx::async(this->get_id()); + return hpx::async(this->get_id()); +} +void buffer::ensure_device_id() { + if (!device_gid) { + typedef hpx::opencl::server::buffer::get_parent_device_id_action + action_type; + HPX_ASSERT(this->get_id()); + device_gid = async(this->get_id()).get(); + } } -void buffer::ensure_device_id() -{ - if (!device_gid) - { - typedef - hpx::opencl::server::buffer::get_parent_device_id_action - action_type; - HPX_ASSERT(this->get_id()); - device_gid = async(this->get_id()).get(); - } +buffer::send_result buffer::enqueue_send_impl( + const hpx::opencl::buffer &dst, std::size_t &&src_offset, + std::size_t &&dst_offset, std::size_t &&size, + hpx::opencl::util::resolved_events &&dependencies) { + ensure_device_id(); + + using hpx::opencl::lcos::event; + HPX_ASSERT(this->get_id()); + HPX_ASSERT(dependencies.are_from_devices(device_gid, dst.device_gid)); + + // create events + event src_event(device_gid); + event dst_event(dst.device_gid); + + // send command to server class + typedef hpx::opencl::server::buffer::enqueue_send_action func; + hpx::apply(this->get_id(), dst.get_id(), src_event.get_event_id(), + dst_event.get_event_id(), src_offset, dst_offset, size, + std::move(dependencies.event_ids), + std::move(dependencies.device_ids)); + + // return futures + return send_result(std::move(src_event.get_future()), + std::move(dst_event.get_future())); } -buffer::send_result -buffer::enqueue_send_impl( - const hpx::opencl::buffer &dst, - std::size_t && src_offset, - std::size_t && dst_offset, - std::size_t && size, - hpx::opencl::util::resolved_events && dependencies ) -{ - ensure_device_id(); - - using hpx::opencl::lcos::event; - HPX_ASSERT(this->get_id()); - HPX_ASSERT(dependencies.are_from_devices(device_gid, dst.device_gid)); - - // create events - event src_event( device_gid ); - event dst_event( dst.device_gid ); - - // send command to server class - typedef hpx::opencl::server::buffer::enqueue_send_action func; - hpx::apply( this->get_id(), - dst.get_id(), - src_event.get_event_id(), - dst_event.get_event_id(), - src_offset, - dst_offset, - size, - std::move(dependencies.event_ids), - std::move(dependencies.device_ids) ); - - // return futures - return send_result( std::move(src_event.get_future()), - std::move(dst_event.get_future()) ); +buffer::send_result buffer::enqueue_send_rect_impl( + const hpx::opencl::buffer &dst, hpx::opencl::rect_props &&rect_properties, + hpx::opencl::util::resolved_events &&dependencies) { + ensure_device_id(); + + using hpx::opencl::lcos::event; + HPX_ASSERT(this->get_id()); + HPX_ASSERT(dependencies.are_from_devices(device_gid, dst.device_gid)); + + // create events + event src_event(device_gid); + event dst_event(dst.device_gid); + + // send command to server class + typedef hpx::opencl::server::buffer::enqueue_send_rect_action func; + hpx::apply(this->get_id(), dst.get_id(), src_event.get_event_id(), + dst_event.get_event_id(), rect_properties, + std::move(dependencies.event_ids), + std::move(dependencies.device_ids)); + + // return futures + return send_result(std::move(src_event.get_future()), + std::move(dst_event.get_future())); } -buffer::send_result -buffer::enqueue_send_rect_impl( - const hpx::opencl::buffer &dst, - hpx::opencl::rect_props && rect_properties, - hpx::opencl::util::resolved_events && dependencies ) -{ - ensure_device_id(); +hpx::future > +buffer::enqueue_read_impl(std::size_t &&offset, std::size_t &&size, + hpx::opencl::util::resolved_events &&dependencies) { + ensure_device_id(); - using hpx::opencl::lcos::event; - HPX_ASSERT(this->get_id()); - HPX_ASSERT(dependencies.are_from_devices(device_gid, dst.device_gid)); - - // create events - event src_event( device_gid ); - event dst_event( dst.device_gid ); - - // send command to server class - typedef hpx::opencl::server::buffer::enqueue_send_rect_action func; - hpx::apply( this->get_id(), - dst.get_id(), - src_event.get_event_id(), - dst_event.get_event_id(), - rect_properties, - std::move(dependencies.event_ids), - std::move(dependencies.device_ids) ); - - // return futures - return send_result( std::move(src_event.get_future()), - std::move(dst_event.get_future()) ); -} + using hpx::opencl::lcos::event; + typedef hpx::serialization::serialize_buffer buffer_type; + HPX_ASSERT(dependencies.are_from_device(device_gid)); -hpx::future > -buffer::enqueue_read_impl( - std::size_t && offset, - std::size_t && size, - hpx::opencl::util::resolved_events && dependencies ) -{ - ensure_device_id(); - - using hpx::opencl::lcos::event; - typedef hpx::serialization::serialize_buffer buffer_type; - - HPX_ASSERT(dependencies.are_from_device(device_gid)); - - // create local event - event ev( device_gid ); - - // send command to server class - typedef hpx::opencl::server::buffer::enqueue_read_action func; - hpx::apply( this->get_id(), - ev.get_event_id(), - offset, - size, - std::move(dependencies.event_ids) ); - - // return future connected to event - return ev.get_future(); + // create local event + event ev(device_gid); + + // send command to server class + typedef hpx::opencl::server::buffer::enqueue_read_action func; + hpx::apply(this->get_id(), ev.get_event_id(), offset, size, + std::move(dependencies.event_ids)); + + // return future connected to event + return ev.get_future(); } diff --git a/opencl/buffer.hpp b/opencl/buffer.hpp index e5bb7627..e93562f1 100644 --- a/opencl/buffer.hpp +++ b/opencl/buffer.hpp @@ -26,463 +26,415 @@ namespace hpx { namespace opencl { - ////////////////////////////////////// - /// @brief Device memory. - /// - /// Every buffer belongs to one \ref device. - /// - class HPX_OPENCL_EXPORT buffer - : public hpx::components::client_base - { - - typedef hpx::components::client_base base_type; - - public: - // the result struct for enqueue_send - struct send_result{ - public: - send_result( hpx::future&& fut1, - hpx::future&& fut2 ) - : src_future(std::move(fut1)), - dst_future(std::move(fut2)){}; - - hpx::future< void > src_future; - hpx::future< void > dst_future; - }; - - - public: - // Empty constructor, necessary for hpx purposes - buffer() {} - - // Constructor - buffer(hpx::shared_future const& gid, - hpx::naming::id_type device_gid_) - : base_type(gid), device_gid(std::move(device_gid_)) - { - is_local = - (hpx::get_colocation_id(hpx::launch::sync, get_id()) == hpx::find_here()); - } - - buffer(hpx::future && gid) - : base_type(std::move(gid)), device_gid() - { - is_local = - (hpx::get_colocation_id(hpx::launch::sync, get_id()) == hpx::find_here()); - } - - // initialization - - - // /////////////////////////////////////////////// - // Exposed Component functionality - // - - /** - * @brief Get the size of the buffer - * - * @return The size of the buffer - */ - hpx::future - size() const; - - /** - * @brief Writes data to the buffer - * - * @param offset The start position of the area to write to. - * @param data The data to be written. - * @return An future that can be used for synchronization or - * dependency for other calls. - */ - template - hpx::future - enqueue_write( std::size_t offset, - const hpx::serialization::serialize_buffer data, - Deps &&... dependencies ); - - /** - * @brief Writes data to the buffer in a rectangular region - * - * @param rect_properties The parameters like size, offset, stride - * @param data The data to be written. - * - * @return An future that can be used for synchronization or - * dependency for other calls. - */ - template - hpx::future - enqueue_write_rect( - rect_props rect_properties, - const hpx::serialization::serialize_buffer data, - Deps &&... dependencies ); - - /** - * @brief Reads data from the buffer - * - * @param offset The start position of the area to read. - * @param size The size of the area to read. - * @return A future that can be used for synchronization or - * dependency for other calls. - * Contains the result buffer of the call. - */ - template - hpx::future > - enqueue_read( std::size_t offset, - std::size_t size, - Deps &&... dependencies ); - - /** - * @brief Reads data from the buffer - * - * @param offset The start position of the area to read. - * @param data The buffer the result will get written to. - * The buffer also contains information about the - * size of the data to read. - * The buffer will get returned and kept alive - * through the future. - * @return A future that can be used for synchronization or - * dependency for other calls. - * Contains the 'data' parameter with the result - * written to. - */ - template - hpx::future > - enqueue_read( std::size_t offset, - hpx::serialization::serialize_buffer data, - Deps &&... dependencies ); - - /** - * @brief Reads data from the buffer - * - * @param rect_properties Parameters of the rectangle to read. - * @param data The buffer the result will get written to. - * The buffer will get returned and kept alive - * through the future. - * @return A future that can be used for synchronization or - * dependency for other calls. - * Contains the 'data' parameter with the result - * written to. - */ - template - hpx::future > - enqueue_read_rect( rect_props rect_properties, - hpx::serialization::serialize_buffer data, - Deps &&... dependencies ); - - /* - * @name Copies data to another buffer. - * - * The buffers do NOT need to be from the same device, - * neither do they have to be on the same node. - * - * @param dst The source buffer. - * @param src_offset The offset on the source buffer. - * @param dst_offset The offset on the destination buffer. - * @param size The size of the area to copy. - * @return A future that can be used for synchronization - * or dependency for other calls. - * - * @see event - */ - template - send_result enqueue_send( const hpx::opencl::buffer& dst, - std::size_t src_offset, - std::size_t dst_offset, - std::size_t size, - Deps &&... dependencies ); - - /* - * @name Copies data to another buffer. - * - * The buffers do NOT need to be from the same device, - * neither do they have to be on the same node. - * - * @param dst The source buffer. - * @param rect_properties Parameters of the rectangle to send. - * @return A future that can be used for synchronization - * or dependency for other calls. - * - * @see event - */ - template - send_result enqueue_send_rect( const hpx::opencl::buffer& dst, - rect_props rect_properties, - Deps &&... dependencies ); - - //////////////////////////////////////////////////////////////////// - // Proxied functions - // - private: - hpx::future > - enqueue_read_impl( std::size_t && offset, - std::size_t && size, - hpx::opencl::util::resolved_events && deps ); - - send_result - enqueue_send_impl( const hpx::opencl::buffer& dst, - std::size_t && src_offset, - std::size_t && dst_offset, - std::size_t && size, - hpx::opencl::util::resolved_events && deps ); - - send_result - enqueue_send_rect_impl( const hpx::opencl::buffer& dst, - rect_props && rect_properties, - hpx::opencl::util::resolved_events && deps ); - - void ensure_device_id(); - - private: - mutable hpx::naming::id_type device_gid; - bool is_local; - - private: - // serialization support - friend class hpx::serialization::access; - - template - void load(Archive & ar, unsigned) - { - ar >> hpx::serialization::base_object(*this); - ar >> device_gid; - is_local = - (hpx::get_colocation_id(hpx::launch::sync, get_id()) == hpx::find_here()); - } - - template - void save(Archive & ar, unsigned) const - { - HPX_ASSERT(device_gid); - ar << hpx::serialization::base_object(*this); - ar << device_gid; - } - - HPX_SERIALIZATION_SPLIT_MEMBER() - - }; - -}} - +////////////////////////////////////// +/// @brief Device memory. +/// +/// Every buffer belongs to one \ref device. +/// +class HPX_OPENCL_EXPORT buffer + : public hpx::components::client_base { + typedef hpx::components::client_base base_type; + + public: + // the result struct for enqueue_send + struct send_result { + public: + send_result(hpx::future &&fut1, hpx::future &&fut2) + : src_future(std::move(fut1)), dst_future(std::move(fut2)){}; + + hpx::future src_future; + hpx::future dst_future; + }; + + public: + // Empty constructor, necessary for hpx purposes + buffer() {} + + // Constructor + buffer(hpx::shared_future const &gid, + hpx::naming::id_type device_gid_) + : base_type(gid), device_gid(std::move(device_gid_)) { + is_local = (hpx::get_colocation_id(hpx::launch::sync, get_id()) == + hpx::find_here()); + } + + buffer(hpx::future &&gid) + : base_type(std::move(gid)), device_gid() { + is_local = (hpx::get_colocation_id(hpx::launch::sync, get_id()) == + hpx::find_here()); + } + + // initialization + + // /////////////////////////////////////////////// + // Exposed Component functionality + // + + /** + * @brief Get the size of the buffer + * + * @return The size of the buffer + */ + hpx::future size() const; + + /** + * @brief Writes data to the buffer + * + * @param offset The start position of the area to write to. + * @param data The data to be written. + * @return An future that can be used for synchronization or + * dependency for other calls. + */ + template + hpx::future enqueue_write( + std::size_t offset, const hpx::serialization::serialize_buffer data, + Deps &&...dependencies); + + /** + * @brief Writes data to the buffer in a rectangular region + * + * @param rect_properties The parameters like size, offset, stride + * @param data The data to be written. + * + * @return An future that can be used for synchronization or + * dependency for other calls. + */ + template + hpx::future enqueue_write_rect( + rect_props rect_properties, + const hpx::serialization::serialize_buffer data, + Deps &&...dependencies); + + /** + * @brief Reads data from the buffer + * + * @param offset The start position of the area to read. + * @param size The size of the area to read. + * @return A future that can be used for synchronization or + * dependency for other calls. + * Contains the result buffer of the call. + */ + template + hpx::future > enqueue_read( + std::size_t offset, std::size_t size, Deps &&...dependencies); + + /** + * @brief Reads data from the buffer + * + * @param offset The start position of the area to read. + * @param data The buffer the result will get written to. + * The buffer also contains information about the + * size of the data to read. + * The buffer will get returned and kept alive + * through the future. + * @return A future that can be used for synchronization or + * dependency for other calls. + * Contains the 'data' parameter with the result + * written to. + */ + template + hpx::future > enqueue_read( + std::size_t offset, hpx::serialization::serialize_buffer data, + Deps &&...dependencies); + + /** + * @brief Reads data from the buffer + * + * @param rect_properties Parameters of the rectangle to read. + * @param data The buffer the result will get written to. + * The buffer will get returned and kept alive + * through the future. + * @return A future that can be used for synchronization or + * dependency for other calls. + * Contains the 'data' parameter with the result + * written to. + */ + template + hpx::future > enqueue_read_rect( + rect_props rect_properties, hpx::serialization::serialize_buffer data, + Deps &&...dependencies); + + /* + * @name Copies data to another buffer. + * + * The buffers do NOT need to be from the same device, + * neither do they have to be on the same node. + * + * @param dst The source buffer. + * @param src_offset The offset on the source buffer. + * @param dst_offset The offset on the destination buffer. + * @param size The size of the area to copy. + * @return A future that can be used for synchronization + * or dependency for other calls. + * + * @see event + */ + template + send_result enqueue_send(const hpx::opencl::buffer &dst, + std::size_t src_offset, std::size_t dst_offset, + std::size_t size, Deps &&...dependencies); + + /* + * @name Copies data to another buffer. + * + * The buffers do NOT need to be from the same device, + * neither do they have to be on the same node. + * + * @param dst The source buffer. + * @param rect_properties Parameters of the rectangle to send. + * @return A future that can be used for synchronization + * or dependency for other calls. + * + * @see event + */ + template + send_result enqueue_send_rect(const hpx::opencl::buffer &dst, + rect_props rect_properties, + Deps &&...dependencies); + + //////////////////////////////////////////////////////////////////// + // Proxied functions + // + private: + hpx::future > enqueue_read_impl( + std::size_t &&offset, std::size_t &&size, + hpx::opencl::util::resolved_events &&deps); + + send_result enqueue_send_impl(const hpx::opencl::buffer &dst, + std::size_t &&src_offset, + std::size_t &&dst_offset, std::size_t &&size, + hpx::opencl::util::resolved_events &&deps); + + send_result enqueue_send_rect_impl(const hpx::opencl::buffer &dst, + rect_props &&rect_properties, + hpx::opencl::util::resolved_events &&deps); + + void ensure_device_id(); + + private: + mutable hpx::naming::id_type device_gid; + bool is_local; + + private: + // serialization support + friend class hpx::serialization::access; + + template + void load(Archive &ar, unsigned) { + ar >> hpx::serialization::base_object(*this); + ar >> device_gid; + is_local = (hpx::get_colocation_id(hpx::launch::sync, get_id()) == + hpx::find_here()); + } + + template + void save(Archive &ar, unsigned) const { + HPX_ASSERT(device_gid); + ar << hpx::serialization::base_object(*this); + ar << device_gid; + } + + HPX_SERIALIZATION_SPLIT_MEMBER() +}; + +} // namespace opencl +} // namespace hpx //////////////////////////////////////////////////////////////////////////////// // IMPLEMENTATIONS // -template +template hpx::future > -hpx::opencl::buffer::enqueue_read( std::size_t offset, - hpx::serialization::serialize_buffer data, - Deps &&... dependencies ) -{ - ensure_device_id(); - - typedef hpx::serialization::serialize_buffer buffer_type; - - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_gid.get_gid(),std::forward(dependencies)...); - HPX_ASSERT(deps.are_from_device(device_gid)); - - // create local event - using hpx::opencl::lcos::event; - event ev( device_gid ); - - // send command to server class - if(!is_local) { - // is remote call - - typedef hpx::opencl::server::buffer - ::enqueue_read_to_userbuffer_remote_action func_remote; - hpx::apply( std::move(get_id()), - std::move(ev.get_event_id()), - offset, - data.size() * sizeof(T), - reinterpret_cast(data.data()), - std::move(deps.event_ids) ); - - auto f = ev.get_future(); - - hpx::traits::detail::get_shared_state(f)->set_on_completed( - [data]() { /* just keep data alive */ }); - - return f; - } - - // is local call, send direct reference to buffer - typedef hpx::opencl::server::buffer - ::enqueue_read_to_userbuffer_local_action func_local; - hpx::apply( std::move(get_id()), - std::move(ev.get_event_id()), - offset, - data, - std::move(deps.event_ids) ); - - // return future connected to event - return ev.get_future(); +hpx::opencl::buffer::enqueue_read(std::size_t offset, + hpx::serialization::serialize_buffer data, + Deps &&...dependencies) { + ensure_device_id(); + + typedef hpx::serialization::serialize_buffer buffer_type; + + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = + resolver(device_gid.get_gid(), std::forward(dependencies)...); + HPX_ASSERT(deps.are_from_device(device_gid)); + + // create local event + using hpx::opencl::lcos::event; + event ev(device_gid); + + // send command to server class + if (!is_local) { + // is remote call + + typedef hpx::opencl::server::buffer :: + enqueue_read_to_userbuffer_remote_action + func_remote; + hpx::apply(std::move(get_id()), std::move(ev.get_event_id()), + offset, data.size() * sizeof(T), + reinterpret_cast(data.data()), + std::move(deps.event_ids)); + + auto f = ev.get_future(); + + hpx::traits::detail::get_shared_state(f)->set_on_completed( + [data]() { /* just keep data alive */ }); + + return f; + } + + // is local call, send direct reference to buffer + typedef hpx::opencl::server::buffer ::enqueue_read_to_userbuffer_local_action< + T> + func_local; + hpx::apply(std::move(get_id()), std::move(ev.get_event_id()), + offset, data, std::move(deps.event_ids)); + + // return future connected to event + return ev.get_future(); } -template +template hpx::future > hpx::opencl::buffer::enqueue_read_rect( - rect_props rect_properties, - hpx::serialization::serialize_buffer data, - Deps &&... dependencies ) -{ - ensure_device_id(); - - typedef hpx::serialization::serialize_buffer buffer_type; - - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_gid.get_gid(),std::forward(dependencies)...); - HPX_ASSERT(deps.are_from_device(device_gid)); - - // create local event - using hpx::opencl::lcos::event; - event ev( device_gid ); - - // send command to server class - if(!is_local) { - // is remote call - - typedef hpx::opencl::server::buffer - ::enqueue_read_to_userbuffer_rect_remote_action func_remote; - hpx::apply( std::move(get_id()), - std::move(ev.get_event_id()), - std::move(rect_properties), - reinterpret_cast(data.data()), - std::move(deps.event_ids) ); + rect_props rect_properties, hpx::serialization::serialize_buffer data, + Deps &&...dependencies) { + ensure_device_id(); + + typedef hpx::serialization::serialize_buffer buffer_type; + + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = + resolver(device_gid.get_gid(), std::forward(dependencies)...); + HPX_ASSERT(deps.are_from_device(device_gid)); + + // create local event + using hpx::opencl::lcos::event; + event ev(device_gid); + + // send command to server class + if (!is_local) { + // is remote call + + typedef hpx::opencl::server::buffer :: + enqueue_read_to_userbuffer_rect_remote_action + func_remote; + hpx::apply(std::move(get_id()), std::move(ev.get_event_id()), + std::move(rect_properties), + reinterpret_cast(data.data()), + std::move(deps.event_ids)); - auto f = ev.get_future(); + auto f = ev.get_future(); - hpx::traits::detail::get_shared_state(f)->set_on_completed( - [data]() { /* just keep data alive */ }); + hpx::traits::detail::get_shared_state(f)->set_on_completed( + [data]() { /* just keep data alive */ }); - return f; - } + return f; + } - // is local call, send direct reference to buffer + // is local call, send direct reference to buffer - typedef hpx::opencl::server::buffer - ::enqueue_read_to_userbuffer_rect_local_action func_local; - hpx::apply( std::move(get_id()), - std::move(ev.get_event_id()), - std::move(rect_properties), - data, - std::move(deps.event_ids) ); + typedef hpx::opencl::server::buffer :: + enqueue_read_to_userbuffer_rect_local_action + func_local; + hpx::apply(std::move(get_id()), std::move(ev.get_event_id()), + std::move(rect_properties), data, + std::move(deps.event_ids)); - // return future connected to event - return ev.get_future(); + // return future connected to event + return ev.get_future(); } -template -hpx::future -hpx::opencl::buffer::enqueue_write( std::size_t offset, - const hpx::serialization::serialize_buffer data, - Deps &&... dependencies ) -{ - ensure_device_id(); - - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_gid.get_gid(),std::forward(dependencies)...); - HPX_ASSERT(deps.are_from_device(device_gid)); - - // create local event - using hpx::opencl::lcos::event; - event ev( device_gid ); - - // send command to server class - typedef hpx::opencl::server::buffer::enqueue_write_action func; - hpx::apply( this->get_id(), - ev.get_event_id(), - offset, - data, - std::move(deps.event_ids) ); - - - // return future connected to event - return ev.get_future(); +template +hpx::future hpx::opencl::buffer::enqueue_write( + std::size_t offset, const hpx::serialization::serialize_buffer data, + Deps &&...dependencies) { + ensure_device_id(); + + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = + resolver(device_gid.get_gid(), std::forward(dependencies)...); + HPX_ASSERT(deps.are_from_device(device_gid)); + + // create local event + using hpx::opencl::lcos::event; + event ev(device_gid); + + // send command to server class + typedef hpx::opencl::server::buffer::enqueue_write_action func; + hpx::apply(this->get_id(), ev.get_event_id(), offset, data, + std::move(deps.event_ids)); + + // return future connected to event + return ev.get_future(); } -template -hpx::future -hpx::opencl::buffer::enqueue_write_rect( rect_props rect_properties, - const hpx::serialization::serialize_buffer data, - Deps &&... dependencies ) -{ - ensure_device_id(); - - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_gid.get_gid(),std::forward(dependencies)...); - HPX_ASSERT(deps.are_from_device(device_gid)); - - // create local event - using hpx::opencl::lcos::event; - event ev( device_gid ); - - // send command to server class - typedef hpx::opencl::server::buffer::enqueue_write_rect_action func; - hpx::apply( this->get_id(), - ev.get_event_id(), - std::move(rect_properties), - data, - std::move(deps.event_ids) ); - - // return future connected to event - return ev.get_future(); +template +hpx::future hpx::opencl::buffer::enqueue_write_rect( + rect_props rect_properties, + const hpx::serialization::serialize_buffer data, + Deps &&...dependencies) { + ensure_device_id(); + + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = + resolver(device_gid.get_gid(), std::forward(dependencies)...); + HPX_ASSERT(deps.are_from_device(device_gid)); + + // create local event + using hpx::opencl::lcos::event; + event ev(device_gid); + + // send command to server class + typedef hpx::opencl::server::buffer::enqueue_write_rect_action func; + hpx::apply(this->get_id(), ev.get_event_id(), + std::move(rect_properties), data, std::move(deps.event_ids)); + + // return future connected to event + return ev.get_future(); } -template +template hpx::future > -hpx::opencl::buffer::enqueue_read( std::size_t offset, - std::size_t size, - Deps &&... dependencies ) -{ - ensure_device_id(); - - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_gid.get_gid(),std::forward(dependencies)...); - HPX_ASSERT(deps.are_from_device(device_gid)); - - return enqueue_read_impl( std::move(offset), - std::move(size), - std::move(deps) ); +hpx::opencl::buffer::enqueue_read(std::size_t offset, std::size_t size, + Deps &&...dependencies) { + ensure_device_id(); + + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = + resolver(device_gid.get_gid(), std::forward(dependencies)...); + HPX_ASSERT(deps.are_from_device(device_gid)); + + return enqueue_read_impl(std::move(offset), std::move(size), std::move(deps)); } -template -hpx::opencl::buffer::send_result -hpx::opencl::buffer::enqueue_send( const hpx::opencl::buffer& dst, - std::size_t src_offset, - std::size_t dst_offset, - std::size_t size, - Deps &&... dependencies ) -{ - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_gid.get_gid(),std::forward(dependencies)...); - - return enqueue_send_impl( dst, - std::move(src_offset), - std::move(dst_offset), - std::move(size), - std::move(deps) ); +template +hpx::opencl::buffer::send_result hpx::opencl::buffer::enqueue_send( + const hpx::opencl::buffer &dst, std::size_t src_offset, + std::size_t dst_offset, std::size_t size, Deps &&...dependencies) { + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = + resolver(device_gid.get_gid(), std::forward(dependencies)...); + + return enqueue_send_impl(dst, std::move(src_offset), std::move(dst_offset), + std::move(size), std::move(deps)); } -template -hpx::opencl::buffer::send_result -hpx::opencl::buffer::enqueue_send_rect( const hpx::opencl::buffer& dst, - rect_props rect_properties, - Deps &&... dependencies ) -{ - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_gid.get_gid(),std::forward(dependencies)...); - - return enqueue_send_rect_impl( dst, - std::move(rect_properties), - std::move(deps) ); +template +hpx::opencl::buffer::send_result hpx::opencl::buffer::enqueue_send_rect( + const hpx::opencl::buffer &dst, rect_props rect_properties, + Deps &&...dependencies) { + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = + resolver(device_gid.get_gid(), std::forward(dependencies)...); + + return enqueue_send_rect_impl(dst, std::move(rect_properties), + std::move(deps)); } #endif diff --git a/opencl/cl_headers.hpp b/opencl/cl_headers.hpp index 708a505f..598b998e 100644 --- a/opencl/cl_headers.hpp +++ b/opencl/cl_headers.hpp @@ -7,7 +7,6 @@ #ifndef HPX_OPENCL_CL_HEADERS_HPP_ #define HPX_OPENCL_CL_HEADERS_HPP_ - #if defined(__APPLE__) || defined(__MACOSX) //#include @@ -18,9 +17,6 @@ //#include #include -#endif // !__APPLE__ - - -#endif// HPX_OPENCL_CL_HEADERS_HPP_ - +#endif // !__APPLE__ +#endif // HPX_OPENCL_CL_HEADERS_HPP_ diff --git a/opencl/component_definitions.cpp b/opencl/component_definitions.cpp index f383caef..de5d65fe 100644 --- a/opencl/component_definitions.cpp +++ b/opencl/component_definitions.cpp @@ -14,7 +14,6 @@ HPX_REGISTER_COMPONENT_MODULE(); - // DEVICE typedef hpx::opencl::server::device device_type; typedef hpx::components::managed_component device_component_type; @@ -28,11 +27,11 @@ HPX_REGISTER_ACTION(device_type::create_program_with_binary_action); HPX_REGISTER_ACTION(device_type::release_event_action); HPX_REGISTER_ACTION(device_type::activate_deferred_event_action); - // BUFFER typedef hpx::opencl::server::buffer buffer_type; typedef hpx::components::managed_component buffer_component_type; -HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(buffer_component_type, hpx_opencl_buffer); +HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(buffer_component_type, + hpx_opencl_buffer); HPX_REGISTER_ACTION(buffer_type::size_action); HPX_REGISTER_ACTION(buffer_type::enqueue_read_action); @@ -40,31 +39,27 @@ HPX_REGISTER_ACTION(buffer_type::enqueue_send_action); HPX_REGISTER_ACTION(buffer_type::enqueue_send_rect_action); HPX_REGISTER_ACTION(buffer_type::get_parent_device_id_action); - // PROGRAM typedef hpx::opencl::server::program program_type; typedef hpx::components::managed_component program_component_type; -HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(program_component_type, hpx_opencl_program); +HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(program_component_type, + hpx_opencl_program); HPX_REGISTER_ACTION(program_type::get_parent_device_id_action); HPX_REGISTER_ACTION(program_type::build_action); HPX_REGISTER_ACTION(program_type::get_binary_action); HPX_REGISTER_ACTION(program_type::create_kernel_action); - // KERNEL typedef hpx::opencl::server::kernel kernel_type; typedef hpx::components::managed_component kernel_component_type; -HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(kernel_component_type, hpx_opencl_kernel); +HPX_REGISTER_MINIMAL_COMPONENT_FACTORY(kernel_component_type, + hpx_opencl_kernel); HPX_REGISTER_ACTION(kernel_type::get_parent_device_id_action); HPX_REGISTER_ACTION(kernel_type::set_arg_action); HPX_REGISTER_ACTION(kernel_type::enqueue_action); - // GLOBAL ACTIONS HPX_REGISTER_ACTION(hpx::opencl::server::create_devices_action, hpx_opencl_server_create_devices_action); - - - diff --git a/opencl/create_devices.cpp b/opencl/create_devices.cpp index f6053082..e3499a40 100644 --- a/opencl/create_devices.cpp +++ b/opencl/create_devices.cpp @@ -13,137 +13,99 @@ // HPX dependencies #include -static -hpx::lcos::future> -create_devices_on_nodes( std::vector && localities, - cl_device_type device_type, - std::string required_cl_version ) -{ - - // query all devices - std::vector>> - locality_device_futures; - for(auto &locality : localities) - { - - // get all devices on locality - hpx::lcos::future> - locality_device_future = hpx::opencl::create_devices(locality, - device_type, - required_cl_version); - - // add locality device future to list of futures - locality_device_futures.push_back(std::move(locality_device_future)); - - } - - // combine futures - hpx::lcos::future< std::vector< - hpx::lcos::future< std::vector< hpx::opencl::device > > - > > combined_locality_device_future = - hpx::when_all(locality_device_futures); - - // create result future - hpx::lcos::future< std::vector< hpx::opencl::device >> result_future = - combined_locality_device_future.then( hpx::util::bind( - - // define combining function inline - [] ( - hpx::lcos::future< std::vector< - hpx::lcos::future< std::vector< hpx::opencl::device > > - > > parent_future - ) -> std::vector< hpx::opencl::device > - { - - // initialize the result list - std::vector< hpx::opencl::device > devices; - - // get vector from parent future - std::vector< hpx::lcos::future< - std::vector< hpx::opencl::device > - > > locality_device_futures = parent_future.get(); - - // for each future, take devices out and join in one list - for(auto &locality_device_future : locality_device_futures) - { - - // wait for device query to finish - std::vector locality_devices = - locality_device_future.get(); - - // add all devices to device list - devices.insert(devices.end(), locality_devices.begin(), - locality_devices.end()); - - } - - return devices; - - }, - - hpx::util::placeholders::_1 - - )); - - // return the future to the device list - return result_future; - +static hpx::lcos::future> +create_devices_on_nodes(std::vector &&localities, + cl_device_type device_type, + std::string required_cl_version) { + // query all devices + std::vector>> + locality_device_futures; + for (auto &locality : localities) { + // get all devices on locality + hpx::lcos::future> locality_device_future = + hpx::opencl::create_devices(locality, device_type, required_cl_version); + + // add locality device future to list of futures + locality_device_futures.push_back(std::move(locality_device_future)); + } + + // combine futures + hpx::lcos::future< + std::vector>>> + combined_locality_device_future = hpx::when_all(locality_device_futures); + + // create result future + hpx::lcos::future> result_future = + combined_locality_device_future.then(hpx::util::bind( + + // define combining function inline + [](hpx::lcos::future< + std::vector>>> + parent_future) -> std::vector { + // initialize the result list + std::vector devices; + + // get vector from parent future + std::vector>> + locality_device_futures = parent_future.get(); + + // for each future, take devices out and join in one list + for (auto &locality_device_future : locality_device_futures) { + // wait for device query to finish + std::vector locality_devices = + locality_device_future.get(); + + // add all devices to device list + devices.insert(devices.end(), locality_devices.begin(), + locality_devices.end()); + } + + return devices; + }, + + hpx::util::placeholders::_1 + + )); + + // return the future to the device list + return result_future; } -hpx::lcos::future> -hpx::opencl::create_devices( hpx::naming::id_type node_id, - cl_device_type device_type, - std::string required_cl_version) -{ - - typedef hpx::opencl::server::create_devices_action action; - return async(node_id, device_type, required_cl_version); - +hpx::lcos::future> hpx::opencl::create_devices( + hpx::naming::id_type node_id, cl_device_type device_type, + std::string required_cl_version) { + typedef hpx::opencl::server::create_devices_action action; + return async(node_id, device_type, required_cl_version); } hpx::lcos::future> -hpx::opencl::create_local_devices( cl_device_type device_type, - std::string required_cl_version) -{ - - // get local locality id - hpx::naming::id_type locality = hpx::find_here(); - - // find devices on localities - return create_devices( locality, device_type, required_cl_version ); +hpx::opencl::create_local_devices(cl_device_type device_type, + std::string required_cl_version) { + // get local locality id + hpx::naming::id_type locality = hpx::find_here(); + // find devices on localities + return create_devices(locality, device_type, required_cl_version); } hpx::lcos::future> -hpx::opencl::create_remote_devices( cl_device_type device_type, - std::string required_cl_version) -{ - - // get remote HPX localities - std::vector localities = - hpx::find_remote_localities(); - - // find devices on localities - return create_devices_on_nodes( std::move(localities), - device_type, - required_cl_version ); - +hpx::opencl::create_remote_devices(cl_device_type device_type, + std::string required_cl_version) { + // get remote HPX localities + std::vector localities = hpx::find_remote_localities(); + + // find devices on localities + return create_devices_on_nodes(std::move(localities), device_type, + required_cl_version); } hpx::lcos::future> -hpx::opencl::create_all_devices( cl_device_type device_type, - std::string required_cl_version) -{ - - // get all HPX localities - std::vector localities = - hpx::find_all_localities(); - - // find devices on localities - return create_devices_on_nodes( std::move(localities), - device_type, - required_cl_version ); - +hpx::opencl::create_all_devices(cl_device_type device_type, + std::string required_cl_version) { + // get all HPX localities + std::vector localities = hpx::find_all_localities(); + + // find devices on localities + return create_devices_on_nodes(std::move(localities), device_type, + required_cl_version); } - - diff --git a/opencl/create_devices.hpp b/opencl/create_devices.hpp index 61fa6253..91cc5b6f 100644 --- a/opencl/create_devices.hpp +++ b/opencl/create_devices.hpp @@ -17,108 +17,111 @@ #include "fwd_declarations.hpp" //////////////////////////////////////////////////////////////// -namespace hpx { namespace opencl{ - - /** - * @brief Fetches a list of accelerator devices present on target node. - * - * It is recommended to only use OpenCL Version >= 1.1. - * Earlier devices seem to be blocking on every enqueue-call, which - * is counter-productive to the general idea of the hpx framework. - * - * @param node_id The ID of the target node - * @param device_type The device type, according to OpenCL standard. - *
- * For further information, look at the official - * OpenCL Reference. - * @param required_cl_version All devices that don't support this OpenCL - * version will be ignored.
- * Version number must have the following format: - * "OpenCL ."
- * Recommended value is "OpenCL 1.1". - * @return A list of suitable OpenCL devices on target node - */ - HPX_OPENCL_EXPORT - hpx::lcos::future> - create_devices( hpx::naming::id_type node_id, cl_device_type device_type, - std::string required_cl_version ); - - /** - * @brief Fetches a list of all accelerator devices present in the current - * hpx environment. - * - * It is recommended to only use OpenCL Version >= 1.1. - * Earlier devices seem to be blocking on every enqueue-call, which - * is counter-productive to the general idea of the hpx framework. - * - * @param device_type The device type, according to OpenCL standard. - *
- * For further information, look at the official - * OpenCL Reference. - * @param required_cl_version All devices that don't support this OpenCL - * version will be ignored.
- * Version number must have the following format: - * "OpenCL ."
- * Recommended value is "OpenCL 1.1". - * @return A list of suitable OpenCL devices - */ - HPX_OPENCL_EXPORT - hpx::lcos::future> - create_all_devices( cl_device_type device_type, - std::string required_cl_version ); - - /** - * @brief Fetches a list of local accelerator devices present in the current - * hpx environment. - * - * It is recommended to only use OpenCL Version >= 1.1. - * Earlier devices seem to be blocking on every enqueue-call, which - * is counter-productive to the general idea of the hpx framework. - * - * @param device_type The device type, according to OpenCL standard. - *
- * For further information, look at the official - * OpenCL Reference. - * @param required_cl_version All devices that don't support this OpenCL - * version will be ignored.
- * Version number must have the following format: - * "OpenCL ."
- * Recommended value is "OpenCL 1.1". - * @return A list of suitable OpenCL devices - */ - HPX_OPENCL_EXPORT - hpx::lcos::future> - create_local_devices( cl_device_type device_type, - std::string required_cl_version ); - - /** - * @brief Fetches a list of remote accelerator devices present in the current - * hpx environment. - * - * It is recommended to only use OpenCL Version >= 1.1. - * Earlier devices seem to be blocking on every enqueue-call, which - * is counter-productive to the general idea of the hpx framework. - * - * @param device_type The device type, according to OpenCL standard. - *
- * For further information, look at the official - * OpenCL Reference. - * @param required_cl_version All devices that don't support this OpenCL - * version will be ignored.
- * Version number must have the following format: - * "OpenCL ."
- * Recommended value is "OpenCL 1.1". - * @return A list of suitable OpenCL devices - */ - HPX_OPENCL_EXPORT - hpx::lcos::future> - create_remote_devices( cl_device_type device_type, - std::string required_cl_version ); - - -}} - - +namespace hpx { +namespace opencl { + +/** + * @brief Fetches a list of accelerator devices present on target node. + * + * It is recommended to only use OpenCL Version >= 1.1. + * Earlier devices seem to be blocking on every enqueue-call, which + * is counter-productive to the general idea of the hpx framework. + * + * @param node_id The ID of the target node + * @param device_type The device type, according to OpenCL standard. + *
+ * For further information, look at the official + * OpenCL + * Reference. + * @param required_cl_version All devices that don't support this OpenCL + * version will be ignored.
+ * Version number must have the following format: + * "OpenCL ."
+ * Recommended value is "OpenCL 1.1". + * @return A list of suitable OpenCL devices on target node + */ +HPX_OPENCL_EXPORT +hpx::lcos::future> create_devices( + hpx::naming::id_type node_id, cl_device_type device_type, + std::string required_cl_version); + +/** + * @brief Fetches a list of all accelerator devices present in the current + * hpx environment. + * + * It is recommended to only use OpenCL Version >= 1.1. + * Earlier devices seem to be blocking on every enqueue-call, which + * is counter-productive to the general idea of the hpx framework. + * + * @param device_type The device type, according to OpenCL standard. + *
+ * For further information, look at the official + * OpenCL + * Reference. + * @param required_cl_version All devices that don't support this OpenCL + * version will be ignored.
+ * Version number must have the following format: + * "OpenCL ."
+ * Recommended value is "OpenCL 1.1". + * @return A list of suitable OpenCL devices + */ +HPX_OPENCL_EXPORT +hpx::lcos::future> create_all_devices( + cl_device_type device_type, std::string required_cl_version); + +/** + * @brief Fetches a list of local accelerator devices present in the current + * hpx environment. + * + * It is recommended to only use OpenCL Version >= 1.1. + * Earlier devices seem to be blocking on every enqueue-call, which + * is counter-productive to the general idea of the hpx framework. + * + * @param device_type The device type, according to OpenCL standard. + *
+ * For further information, look at the official + * OpenCL + * Reference. + * @param required_cl_version All devices that don't support this OpenCL + * version will be ignored.
+ * Version number must have the following format: + * "OpenCL ."
+ * Recommended value is "OpenCL 1.1". + * @return A list of suitable OpenCL devices + */ +HPX_OPENCL_EXPORT +hpx::lcos::future> create_local_devices( + cl_device_type device_type, std::string required_cl_version); + +/** + * @brief Fetches a list of remote accelerator devices present in the current + * hpx environment. + * + * It is recommended to only use OpenCL Version >= 1.1. + * Earlier devices seem to be blocking on every enqueue-call, which + * is counter-productive to the general idea of the hpx framework. + * + * @param device_type The device type, according to OpenCL standard. + *
+ * For further information, look at the official + * OpenCL + * Reference. + * @param required_cl_version All devices that don't support this OpenCL + * version will be ignored.
+ * Version number must have the following format: + * "OpenCL ."
+ * Recommended value is "OpenCL 1.1". + * @return A list of suitable OpenCL devices + */ +HPX_OPENCL_EXPORT +hpx::lcos::future> create_remote_devices( + cl_device_type device_type, std::string required_cl_version); + +} // namespace opencl +} // namespace hpx #endif - diff --git a/opencl/detail/info_type.hpp b/opencl/detail/info_type.hpp index 9eb01b19..4083c0a3 100644 --- a/opencl/detail/info_type.hpp +++ b/opencl/detail/info_type.hpp @@ -9,120 +9,139 @@ #include "../cl_headers.hpp" - #include -namespace hpx { namespace opencl { namespace detail { - - template - struct device_info - { - }; +namespace hpx { +namespace opencl { +namespace detail { + +template +struct device_info {}; - template - struct platform_info - { - }; +template +struct platform_info {}; - #define HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(name, ret_type) \ - template<> struct device_info \ - { \ - public: \ - typedef ret_type type; \ - }; +#define HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(name, ret_type) \ + template <> \ + struct device_info { \ + public: \ + typedef ret_type type; \ + }; - #define HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM(name, ret_type) \ - template<> struct platform_info \ - { \ - public: \ - typedef ret_type type; \ - }; +#define HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM(name, ret_type) \ + template <> \ + struct platform_info { \ + public: \ + typedef ret_type type; \ + }; + +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_ADDRESS_BITS, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_AVAILABLE, cl_bool) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_COMPILER_AVAILABLE, cl_bool) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_DOUBLE_FP_CONFIG, + cl_device_fp_config) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_ENDIAN_LITTLE, cl_bool) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_EXECUTION_CAPABILITIES, + cl_device_exec_capabilities) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_EXTENSIONS, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, + cl_device_mem_cache_type) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_HALF_FP_CONFIG, + cl_device_fp_config) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_IMAGE_SUPPORT, cl_bool) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_IMAGE2D_MAX_HEIGHT, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_IMAGE2D_MAX_WIDTH, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_IMAGE3D_MAX_DEPTH, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_IMAGE3D_MAX_HEIGHT, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_IMAGE3D_MAX_WIDTH, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_LOCAL_MEM_TYPE, + cl_device_local_mem_type) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_PARAMETER_SIZE, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_SAMPLERS, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_WORK_GROUP_SIZE, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_WORK_ITEM_SIZES, + std::vector) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_NAME, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, + cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_OPENCL_C_VERSION, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PLATFORM, cl_platform_id) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, + cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, + cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, + cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, + cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, + cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, + cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, + cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PROFILE, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PROFILING_TIMER_RESOLUTION, + std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_QUEUE_PROPERTIES, + cl_command_queue_properties) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_SINGLE_FP_CONFIG, + cl_device_fp_config) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_TYPE, cl_device_type) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_VENDOR, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_VENDOR_ID, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_VERSION, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DRIVER_VERSION, std::string) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_ADDRESS_BITS, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_AVAILABLE, cl_bool ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_COMPILER_AVAILABLE, cl_bool ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_DOUBLE_FP_CONFIG, cl_device_fp_config ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_ENDIAN_LITTLE, cl_bool ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_ERROR_CORRECTION_SUPPORT, cl_bool ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_EXECUTION_CAPABILITIES, cl_device_exec_capabilities ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_EXTENSIONS, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, cl_ulong ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, cl_device_mem_cache_type ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_GLOBAL_MEM_SIZE, cl_ulong ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_HALF_FP_CONFIG, cl_device_fp_config ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_HOST_UNIFIED_MEMORY, cl_bool ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_IMAGE_SUPPORT, cl_bool ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_IMAGE2D_MAX_HEIGHT, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_IMAGE2D_MAX_WIDTH, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_IMAGE3D_MAX_DEPTH, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_IMAGE3D_MAX_HEIGHT, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_IMAGE3D_MAX_WIDTH, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_LOCAL_MEM_SIZE, cl_ulong ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_LOCAL_MEM_TYPE, cl_device_local_mem_type ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_CLOCK_FREQUENCY, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_COMPUTE_UNITS, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_CONSTANT_ARGS, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, cl_ulong ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_MEM_ALLOC_SIZE, cl_ulong ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_PARAMETER_SIZE, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_READ_IMAGE_ARGS, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_SAMPLERS, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_WORK_GROUP_SIZE, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_WORK_ITEM_SIZES, std::vector ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MAX_WRITE_IMAGE_ARGS, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MEM_BASE_ADDR_ALIGN, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_MIN_DATA_TYPE_ALIGN_SIZE, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_NAME, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_NATIVE_VECTOR_WIDTH_CHAR, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_NATIVE_VECTOR_WIDTH_SHORT, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_NATIVE_VECTOR_WIDTH_INT, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_NATIVE_VECTOR_WIDTH_LONG, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_NATIVE_VECTOR_WIDTH_FLOAT, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_NATIVE_VECTOR_WIDTH_DOUBLE, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_NATIVE_VECTOR_WIDTH_HALF, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_OPENCL_C_VERSION, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PLATFORM, cl_platform_id ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PREFERRED_VECTOR_WIDTH_SHORT, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PREFERRED_VECTOR_WIDTH_LONG, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PREFERRED_VECTOR_WIDTH_HALF, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PROFILE, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PROFILING_TIMER_RESOLUTION, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_QUEUE_PROPERTIES, cl_command_queue_properties ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_SINGLE_FP_CONFIG, cl_device_fp_config ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_TYPE, cl_device_type ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_VENDOR, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_VENDOR_ID, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_VERSION, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DRIVER_VERSION, std::string ) - #ifdef CL_VERSION_1_2 - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_BUILT_IN_KERNELS, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_LINKER_AVAILABLE, cl_bool ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PARENT_DEVICE, cl_device_id ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PARTITION_PROPERTIES, std::vector ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PARTITION_AFFINITY_DOMAIN, cl_device_affinity_domain ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PARTITION_TYPE, std::vector ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PRINTF_BUFFER_SIZE, std::size_t ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, cl_bool ) - HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE( CL_DEVICE_REFERENCE_COUNT, cl_uint ) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_BUILT_IN_KERNELS, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_IMAGE_MAX_BUFFER_SIZE, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_IMAGE_MAX_ARRAY_SIZE, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_LINKER_AVAILABLE, cl_bool) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PARENT_DEVICE, cl_device_id) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PARTITION_MAX_SUB_DEVICES, cl_uint) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PARTITION_PROPERTIES, + std::vector) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PARTITION_AFFINITY_DOMAIN, + cl_device_affinity_domain) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PARTITION_TYPE, + std::vector) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PRINTF_BUFFER_SIZE, std::size_t) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_PREFERRED_INTEROP_USER_SYNC, + cl_bool) +HPX_OPENCL_DETAIL_INFO_TYPE_DEVICE(CL_DEVICE_REFERENCE_COUNT, cl_uint) #endif - HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM( CL_PLATFORM_PROFILE, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM( CL_PLATFORM_VERSION, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM( CL_PLATFORM_NAME, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM( CL_PLATFORM_VENDOR, std::string ) - HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM( CL_PLATFORM_EXTENSIONS, std::string ) - -}}} +HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM(CL_PLATFORM_PROFILE, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM(CL_PLATFORM_VERSION, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM(CL_PLATFORM_NAME, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM(CL_PLATFORM_VENDOR, std::string) +HPX_OPENCL_DETAIL_INFO_TYPE_PLATFORM(CL_PLATFORM_EXTENSIONS, std::string) -#endif //HPX_OPENCL_DETAIL_INFO_TYPE_HPP_ +} // namespace detail +} // namespace opencl +} // namespace hpx +#endif // HPX_OPENCL_DETAIL_INFO_TYPE_HPP_ diff --git a/opencl/device.cpp b/opencl/device.cpp index 4c105a9d..96920ded 100644 --- a/opencl/device.cpp +++ b/opencl/device.cpp @@ -16,78 +16,58 @@ using hpx::opencl::device; -hpx::opencl::util::generic_buffer -device::get_device_info_raw(cl_device_info info_type) const -{ +hpx::opencl::util::generic_buffer device::get_device_info_raw( + cl_device_info info_type) const { + HPX_ASSERT(this->get_id()); - HPX_ASSERT(this->get_id()); - - typedef hpx::opencl::server::device::get_device_info_action func; - - return hpx::opencl::util::generic_buffer( - hpx::async(this->get_id(), info_type)); + typedef hpx::opencl::server::device::get_device_info_action func; + return hpx::opencl::util::generic_buffer( + hpx::async(this->get_id(), info_type)); } +hpx::opencl::util::generic_buffer device::get_platform_info_raw( + cl_platform_info info_type) const { + HPX_ASSERT(this->get_id()); -hpx::opencl::util::generic_buffer -device::get_platform_info_raw(cl_platform_info info_type) const -{ - - HPX_ASSERT(this->get_id()); - - typedef hpx::opencl::server::device::get_platform_info_action func; - - return hpx::opencl::util::generic_buffer( - hpx::async(this->get_id(), info_type)); + typedef hpx::opencl::server::device::get_platform_info_action func; + return hpx::opencl::util::generic_buffer( + hpx::async(this->get_id(), info_type)); } +hpx::opencl::buffer device::create_buffer(cl_mem_flags flags, + std::size_t size) const { + HPX_ASSERT(this->get_id()); -hpx::opencl::buffer -device::create_buffer(cl_mem_flags flags, std::size_t size) const -{ + typedef hpx::opencl::server::device::create_buffer_action func; - HPX_ASSERT(this->get_id()); - - typedef hpx::opencl::server::device::create_buffer_action func; - - hpx::future buffer_server = - hpx::async(this->get_id(), flags, size); - - return buffer(std::move(buffer_server), this->get_id()); + hpx::future buffer_server = + hpx::async(this->get_id(), flags, size); + return buffer(std::move(buffer_server), this->get_id()); } -hpx::opencl::program -device::create_program_with_source( - hpx::serialization::serialize_buffer src ) const -{ - - HPX_ASSERT(this->get_id()); +hpx::opencl::program device::create_program_with_source( + hpx::serialization::serialize_buffer src) const { + HPX_ASSERT(this->get_id()); - typedef hpx::opencl::server::device::create_program_with_source_action func; - - hpx::future program_server = - hpx::async(this->get_id(), src); + typedef hpx::opencl::server::device::create_program_with_source_action func; - return program(std::move(program_server), this->get_id()); + hpx::future program_server = + hpx::async(this->get_id(), src); + return program(std::move(program_server), this->get_id()); } -hpx::opencl::program -device::create_program_with_binary( - hpx::serialization::serialize_buffer binary ) const -{ +hpx::opencl::program device::create_program_with_binary( + hpx::serialization::serialize_buffer binary) const { + HPX_ASSERT(this->get_id()); - HPX_ASSERT(this->get_id()); + typedef hpx::opencl::server::device::create_program_with_binary_action func; - typedef hpx::opencl::server::device::create_program_with_binary_action func; - - hpx::future program_server = - hpx::async(this->get_id(), binary); - - return program(std::move(program_server), this->get_id()); + hpx::future program_server = + hpx::async(this->get_id(), binary); + return program(std::move(program_server), this->get_id()); } - diff --git a/opencl/device.hpp b/opencl/device.hpp index 8e04667c..3abcd9f5 100644 --- a/opencl/device.hpp +++ b/opencl/device.hpp @@ -27,156 +27,147 @@ namespace hpx { namespace opencl { - ///////////////////////////////////////// - /// @brief An OpenCL accelerator device. - /// - class HPX_OPENCL_EXPORT device - : public hpx::components::client_base - { - - typedef hpx::components::client_base base_type; - - public: - device(){} - - device(hpx::shared_future const& gid) - : base_type(gid) - {} - - device(hpx::future && gid) - : base_type(std::move(gid)) - {} - - ////////////////////////////////////////// - // Exposed Component functionality - // - - /** - * @brief Creates an OpenCL buffer. - * - * @param flags Sets properties of the buffer.
- * Possible values are - * - CL_MEM_READ_WRITE - * - CL_MEM_WRITE_ONLY - * - CL_MEM_READ_ONLY - * - CL_MEM_HOST_WRITE_ONLY - * - CL_MEM_HOST_READ_ONLY - * - CL_MEM_HOST_NO_ACCESS - * . - * and combinations of them.
- * For further information, read the official - * OpenCL Reference. - * @param size The size of the buffer, in bytes. - * @return A new \ref buffer object. - * @see buffer - */ - hpx::opencl::buffer - create_buffer(cl_mem_flags flags, std::size_t size) const; - - /** - * @brief Creates an OpenCL program object - * - * After creating a program object, one usually compiles the - * program an creates kernels from it. - * - * One program can contain code for multiple kernels. - * - * @param source The source code string for the program. - * @return A program object associated with the calling - * device. - */ - hpx::opencl::program - create_program_with_source( - const hpx::serialization::serialize_buffer source) const; - - /** - * @brief Creates an OpenCL program object from a prebuilt binary - * - * One can create a prebuilt binary from a compiled - * \ref hpx::opencl::program with \ref program::get_binary() - * - * @param binary The binary execution code for the program. - * @return A program object associated with the calling - * device. - */ - hpx::opencl::program - create_program_with_binary( - const hpx::serialization::serialize_buffer binary) const; - - /** - * @brief Queries device infos. - * - * The template argument defines the type of information. - * A complete list can be found on the official - * OpenCL Reference. - * @return The requested information. - */ - template - hpx::future::type> - get_device_info() const { - - hpx::opencl::util::generic_buffer data = - get_device_info_raw(Name); - - return data.get::type>(); - - } - - /** - * @brief Queries platform infos. - * - * The template argument defines the type of information. - * A complete list can be found on the official - * OpenCL Reference. - * @return The requested information. - */ - template - hpx::future::type> - get_platform_info() const { - - hpx::opencl::util::generic_buffer data = - get_platform_info_raw(Name); - - return data.get::type>(); - - } - - private: - - ////////////////////////////////////////// - // Internal Component functionality - // - - /** - * @brief Queries device infos. - * - * @param info_type The type of information.
- * A complete list can be found on the official - * OpenCL Reference. - * @return The info data as \ref hpx::opencl::info.
- * It can be cast to several datatypes. - */ - hpx::opencl::util::generic_buffer - get_device_info_raw(cl_device_info info_type) const; - - /** - * @brief Queries platform infos. - * - * @param info_type The type of information.
- * A complete list can be found on the official - * OpenCL Reference. - * @return The info data as \ref hpx::opencl::info.
- * It can be cast to several datatypes. - */ - hpx::opencl::util::generic_buffer - get_platform_info_raw(cl_platform_info info_type) const; - - - }; - -}} - - -#endif// HPX_OPENCL_DEVICE_HPP_ - - +///////////////////////////////////////// +/// @brief An OpenCL accelerator device. +/// +class HPX_OPENCL_EXPORT device + : public hpx::components::client_base { + typedef hpx::components::client_base base_type; + + public: + device() {} + + device(hpx::shared_future const& gid) + : base_type(gid) {} + + device(hpx::future&& gid) : base_type(std::move(gid)) {} + + ////////////////////////////////////////// + // Exposed Component functionality + // + + /** + * @brief Creates an OpenCL buffer. + * + * @param flags Sets properties of the buffer.
+ * Possible values are + * - CL_MEM_READ_WRITE + * - CL_MEM_WRITE_ONLY + * - CL_MEM_READ_ONLY + * - CL_MEM_HOST_WRITE_ONLY + * - CL_MEM_HOST_READ_ONLY + * - CL_MEM_HOST_NO_ACCESS + * . + * and combinations of them.
+ * For further information, read the official + * OpenCL + * Reference. + * @param size The size of the buffer, in bytes. + * @return A new \ref buffer object. + * @see buffer + */ + hpx::opencl::buffer create_buffer(cl_mem_flags flags, std::size_t size) const; + + /** + * @brief Creates an OpenCL program object + * + * After creating a program object, one usually compiles the + * program an creates kernels from it. + * + * One program can contain code for multiple kernels. + * + * @param source The source code string for the program. + * @return A program object associated with the calling + * device. + */ + hpx::opencl::program create_program_with_source( + const hpx::serialization::serialize_buffer source) const; + + /** + * @brief Creates an OpenCL program object from a prebuilt binary + * + * One can create a prebuilt binary from a compiled + * \ref hpx::opencl::program with \ref program::get_binary() + * + * @param binary The binary execution code for the program. + * @return A program object associated with the calling + * device. + */ + hpx::opencl::program create_program_with_binary( + const hpx::serialization::serialize_buffer binary) const; + + /** + * @brief Queries device infos. + * + * The template argument defines the type of information. + * A complete list can be found on the official + * OpenCL + * Reference. + * @return The requested information. + */ + template + hpx::future::type> get_device_info() + const { + hpx::opencl::util::generic_buffer data = get_device_info_raw(Name); + + return data.get::type>(); + } + + /** + * @brief Queries platform infos. + * + * The template argument defines the type of information. + * A complete list can be found on the official + * OpenCL + * Reference. + * @return The requested information. + */ + template + hpx::future::type> get_platform_info() + const { + hpx::opencl::util::generic_buffer data = get_platform_info_raw(Name); + + return data.get::type>(); + } + + private: + ////////////////////////////////////////// + // Internal Component functionality + // + + /** + * @brief Queries device infos. + * + * @param info_type The type of information.
+ * A complete list can be found on the official + * OpenCL + * Reference. + * @return The info data as \ref hpx::opencl::info.
+ * It can be cast to several datatypes. + */ + hpx::opencl::util::generic_buffer get_device_info_raw( + cl_device_info info_type) const; + + /** + * @brief Queries platform infos. + * + * @param info_type The type of information.
+ * A complete list can be found on the official + * OpenCL + * Reference. + * @return The info data as \ref hpx::opencl::info.
+ * It can be cast to several datatypes. + */ + hpx::opencl::util::generic_buffer get_platform_info_raw( + cl_platform_info info_type) const; +}; + +} // namespace opencl +} // namespace hpx + +#endif // HPX_OPENCL_DEVICE_HPP_ diff --git a/opencl/export_definitions.hpp b/opencl/export_definitions.hpp index 938999fc..88fda43f 100644 --- a/opencl/export_definitions.hpp +++ b/opencl/export_definitions.hpp @@ -10,12 +10,10 @@ #include #include - #if defined(HPX_OPENCL_MODULE_EXPORTS) -# define HPX_OPENCL_EXPORT HPX_SYMBOL_EXPORT +#define HPX_OPENCL_EXPORT HPX_SYMBOL_EXPORT #else -# define HPX_OPENCL_EXPORT HPX_SYMBOL_IMPORT +#define HPX_OPENCL_EXPORT HPX_SYMBOL_IMPORT #endif - -#endif //HPX_OPENCL_EXPORT_DEFINITIONS_HPP_ +#endif // HPX_OPENCL_EXPORT_DEFINITIONS_HPP_ diff --git a/opencl/fwd_declarations.hpp b/opencl/fwd_declarations.hpp index 93c10e63..9059c96f 100644 --- a/opencl/fwd_declarations.hpp +++ b/opencl/fwd_declarations.hpp @@ -7,7 +7,6 @@ #ifndef HPX_OPENCL_FWD_DECLARATIONS_HPP_ #define HPX_OPENCL_FWD_DECLARATIONS_HPP_ - // This file forward-declares all hpxcl classes. // This is important to remove circular dependencies and improve compile speed. @@ -18,23 +17,22 @@ namespace hpx { /// The OpenCL client namespace namespace opencl { - class device; - class buffer; - class program; - class kernel; +class device; +class buffer; +class program; +class kernel; - // The OpenCL server namespace - namespace server { +// The OpenCL server namespace +namespace server { - class HPX_OPENCL_EXPORT device; - class HPX_OPENCL_EXPORT buffer; - class HPX_OPENCL_EXPORT program; - class HPX_OPENCL_EXPORT kernel; +class HPX_OPENCL_EXPORT device; +class HPX_OPENCL_EXPORT buffer; +class HPX_OPENCL_EXPORT program; +class HPX_OPENCL_EXPORT kernel; - } +} // namespace server -}} +} // namespace opencl +} // namespace hpx #endif - - diff --git a/opencl/kernel.cpp b/opencl/kernel.cpp index 7cea6a07..8d00b898 100644 --- a/opencl/kernel.cpp +++ b/opencl/kernel.cpp @@ -12,53 +12,40 @@ using hpx::opencl::kernel; -void kernel::ensure_device_id() const -{ - if (!device_gid) - { - typedef - hpx::opencl::server::kernel::get_parent_device_id_action - action_type; - HPX_ASSERT(this->get_id()); - device_gid = async(this->get_id()).get(); - } +void kernel::ensure_device_id() const { + if (!device_gid) { + typedef hpx::opencl::server::kernel::get_parent_device_id_action + action_type; + HPX_ASSERT(this->get_id()); + device_gid = async(this->get_id()).get(); + } } -void -kernel::set_arg(cl_uint arg_index, const hpx::opencl::buffer &arg) const -{ - set_arg_async(arg_index, arg).get(); +void kernel::set_arg(cl_uint arg_index, const hpx::opencl::buffer &arg) const { + set_arg_async(arg_index, arg).get(); } -hpx::lcos::future -kernel::set_arg_async(cl_uint arg_index, const hpx::opencl::buffer &arg) const -{ - - HPX_ASSERT(this->get_id()); - - typedef hpx::opencl::server::kernel::set_arg_action func; +hpx::lcos::future kernel::set_arg_async( + cl_uint arg_index, const hpx::opencl::buffer &arg) const { + HPX_ASSERT(this->get_id()); - return hpx::async(this->get_id(), arg_index, arg.get_id()); + typedef hpx::opencl::server::kernel::set_arg_action func; + return hpx::async(this->get_id(), arg_index, arg.get_id()); } -hpx::future -kernel::enqueue_impl( std::vector && size_vec, - hpx::opencl::util::resolved_events && deps ) const -{ - - // create local event - using hpx::opencl::lcos::event; - event ev( device_gid ); - - // send command to server class - typedef hpx::opencl::server::kernel::enqueue_action func; - hpx::apply( this->get_id(), - std::move(ev.get_event_id()), - size_vec, - std::move(deps.event_ids) ); +hpx::future kernel::enqueue_impl( + std::vector &&size_vec, + hpx::opencl::util::resolved_events &&deps) const { + // create local event + using hpx::opencl::lcos::event; + event ev(device_gid); - // return future connected to event - return ev.get_future(); + // send command to server class + typedef hpx::opencl::server::kernel::enqueue_action func; + hpx::apply(this->get_id(), std::move(ev.get_event_id()), size_vec, + std::move(deps.event_ids)); + // return future connected to event + return ev.get_future(); } diff --git a/opencl/kernel.hpp b/opencl/kernel.hpp index 44dbc15f..14f290b7 100644 --- a/opencl/kernel.hpp +++ b/opencl/kernel.hpp @@ -23,180 +23,167 @@ // Crazy function overloading #include "util/enqueue_overloads.hpp" - namespace hpx { namespace opencl { - //////////////////////// - /// @brief Kernel execution dimensions. - /// - /// This structure offers an alternative way to set and reuse kernel - /// execution dimensions. - /// - /// Example: - /// \code{.cpp} - /// // Create work_size object - /// hpx::opencl::work_size<1> dim; - /// - /// // Set dimensions. - /// dim[0].offset = 0; - /// dim[0].size = 2048; - /// - /// // Set local work size. - /// // This can be left out. - /// // OpenCL will then automatically determine the best local work size. - /// dim[0].local_size = 64; - /// - /// // Enqueue a kernel using the work_size object - /// event kernel_event = kernel.enqueue(dim).get(); - /// - /// \endcode - /// - template - struct work_size - { - private: - struct dimension - { - std::size_t offset; - std::size_t size; - std::size_t local_size; - dimension(){ - offset = 0; - size = 0; - local_size = 0; - } - }; - private: - // local_size be treated as NULL if all dimensions have local_size == 0 - dimension dims[DIM]; - public: - dimension& operator[](std::size_t idx){ return dims[idx]; } - }; - - ////////////////////////////////////// - /// @brief An OpenCL kernel. - /// - /// Every kernel belongs to one \ref device. - /// - class HPX_OPENCL_EXPORT kernel - : public hpx::components::client_base - { - - typedef hpx::components::client_base base_type; - - public: - // Empty constructor, necessary for hpx purposes - kernel(){} - - // Constructor - kernel(hpx::shared_future const& gid, - hpx::naming::id_type device_gid_) - : base_type(gid), device_gid(std::move(device_gid_)) - {} - - kernel(hpx::future && gid) - : base_type(std::move(gid)), device_gid() - {} - - // initialization - - - // /////////////////////////////////////////////// - // Exposed Component functionality - // - - /** - * @brief Sets a kernel argument - * - * This is the non-blocking version of set_arg - * - * @param arg_index The argument index to which the buffer will - * be connected. - * @param arg The \ref buffer that will be connected. - * @return A future that will trigger upon completion. - */ - hpx::lcos::future - set_arg_async(cl_uint arg_index, const hpx::opencl::buffer &arg) const; - - /** - * @brief Sets a kernel argument - * - * @param arg_index The argument index to which the buffer will - * be connected. - * @param arg The \ref buffer that will be connected. - * @return A future that will trigger upon completion. - */ - void - set_arg(cl_uint arg_index, const hpx::opencl::buffer &arg) const; - - /** - * @name Starts execution of a kernel, using work_size as work - * dimensions. - * - * @param size The work dimensions on which the kernel should - * get executed on. - * @return An \ref event that triggers upon completion. - */ - template - hpx::lcos::future - enqueue( hpx::opencl::work_size size, - Deps &&... dependencies ) const; - - hpx::lcos::future - enqueue_impl( std::vector && size_vec, - hpx::opencl::util::resolved_events && deps ) const; - - - protected: - void ensure_device_id() const; - - private: - mutable hpx::naming::id_type device_gid; - - private: - // serialization support - friend class hpx::serialization::access; - - template - void serialize(Archive & ar, unsigned) - { - HPX_ASSERT(device_gid); - ar & hpx::serialization::base_object(*this); - ar & device_gid; - } - - }; - -}} - +//////////////////////// +/// @brief Kernel execution dimensions. +/// +/// This structure offers an alternative way to set and reuse kernel +/// execution dimensions. +/// +/// Example: +/// \code{.cpp} +/// // Create work_size object +/// hpx::opencl::work_size<1> dim; +/// +/// // Set dimensions. +/// dim[0].offset = 0; +/// dim[0].size = 2048; +/// +/// // Set local work size. +/// // This can be left out. +/// // OpenCL will then automatically determine the best local work size. +/// dim[0].local_size = 64; +/// +/// // Enqueue a kernel using the work_size object +/// event kernel_event = kernel.enqueue(dim).get(); +/// +/// \endcode +/// +template +struct work_size { + private: + struct dimension { + std::size_t offset; + std::size_t size; + std::size_t local_size; + dimension() { + offset = 0; + size = 0; + local_size = 0; + } + }; + + private: + // local_size be treated as NULL if all dimensions have local_size == 0 + dimension dims[DIM]; + + public: + dimension &operator[](std::size_t idx) { return dims[idx]; } +}; + +////////////////////////////////////// +/// @brief An OpenCL kernel. +/// +/// Every kernel belongs to one \ref device. +/// +class HPX_OPENCL_EXPORT kernel + : public hpx::components::client_base { + typedef hpx::components::client_base base_type; + + public: + // Empty constructor, necessary for hpx purposes + kernel() {} + + // Constructor + kernel(hpx::shared_future const &gid, + hpx::naming::id_type device_gid_) + : base_type(gid), device_gid(std::move(device_gid_)) {} + + kernel(hpx::future &&gid) + : base_type(std::move(gid)), device_gid() {} + + // initialization + + // /////////////////////////////////////////////// + // Exposed Component functionality + // + + /** + * @brief Sets a kernel argument + * + * This is the non-blocking version of set_arg + * + * @param arg_index The argument index to which the buffer will + * be connected. + * @param arg The \ref buffer that will be connected. + * @return A future that will trigger upon completion. + */ + hpx::lcos::future set_arg_async(cl_uint arg_index, + const hpx::opencl::buffer &arg) const; + + /** + * @brief Sets a kernel argument + * + * @param arg_index The argument index to which the buffer will + * be connected. + * @param arg The \ref buffer that will be connected. + * @return A future that will trigger upon completion. + */ + void set_arg(cl_uint arg_index, const hpx::opencl::buffer &arg) const; + + /** + * @name Starts execution of a kernel, using work_size as work + * dimensions. + * + * @param size The work dimensions on which the kernel should + * get executed on. + * @return An \ref event that triggers upon completion. + */ + template + hpx::lcos::future enqueue(hpx::opencl::work_size size, + Deps &&...dependencies) const; + + hpx::lcos::future enqueue_impl( + std::vector &&size_vec, + hpx::opencl::util::resolved_events &&deps) const; + + protected: + void ensure_device_id() const; + + private: + mutable hpx::naming::id_type device_gid; + + private: + // serialization support + friend class hpx::serialization::access; + + template + void serialize(Archive &ar, unsigned) { + HPX_ASSERT(device_gid); + ar &hpx::serialization::base_object(*this); + ar &device_gid; + } +}; + +} // namespace opencl +} // namespace hpx //////////////////////////////////////////////////////////////////////////////// // IMPLEMENTATIONS // -template -hpx::future -hpx::opencl::kernel::enqueue( hpx::opencl::work_size size, - Deps &&... dependencies ) const -{ - ensure_device_id(); - - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_gid.get_gid(),std::forward(dependencies)...); - HPX_ASSERT(deps.are_from_device(device_gid)); - - // extract information from work_size struct - std::vector size_vec(3*DIM); - for(std::size_t i = 0; i < DIM; i++){ - size_vec[i + 0*DIM] = size[i].offset; - size_vec[i + 1*DIM] = size[i].size; - size_vec[i + 2*DIM] = size[i].local_size; - } - - // forward to enqueue_impl - return enqueue_impl( std::move(size_vec), std::move(deps) ); - +template +hpx::future hpx::opencl::kernel::enqueue(hpx::opencl::work_size size, + Deps &&...dependencies) const { + ensure_device_id(); + + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = + resolver(device_gid.get_gid(), std::forward(dependencies)...); + HPX_ASSERT(deps.are_from_device(device_gid)); + + // extract information from work_size struct + std::vector size_vec(3 * DIM); + for (std::size_t i = 0; i < DIM; i++) { + size_vec[i + 0 * DIM] = size[i].offset; + size_vec[i + 1 * DIM] = size[i].size; + size_vec[i + 2 * DIM] = size[i].local_size; + } + + // forward to enqueue_impl + return enqueue_impl(std::move(size_vec), std::move(deps)); } #endif diff --git a/opencl/lcos/event.cpp b/opencl/lcos/event.cpp index c631e5fc..b1cfa45a 100644 --- a/opencl/lcos/event.cpp +++ b/opencl/lcos/event.cpp @@ -7,25 +7,21 @@ #include "../server/device.hpp" -void -hpx::opencl::lcos::detail::unregister_event( hpx::naming::id_type device_id, - hpx::naming::gid_type event_gid ) -{ - HPX_ASSERT(device_id && event_gid); +void hpx::opencl::lcos::detail::unregister_event( + hpx::naming::id_type device_id, hpx::naming::gid_type event_gid) { + HPX_ASSERT(device_id && event_gid); - typedef hpx::opencl::server::device::release_event_action func; - hpx::apply( device_id, event_gid ); + typedef hpx::opencl::server::device::release_event_action func; + hpx::apply(device_id, event_gid); } -//template<> -void hpx::opencl::lcos::detail::event_data::arm() -{ - HPX_ASSERT(device_id && event_id); +// template<> +void hpx::opencl::lcos::detail::event_data::arm() { + HPX_ASSERT(device_id && event_id); - // Tell the device server that we'd like to be informed when the cl_event - // is completed - typedef hpx::opencl::server::device::activate_deferred_event_action func; - hpx::apply(device_id, event_id); + // Tell the device server that we'd like to be informed when the cl_event + // is completed + typedef hpx::opencl::server::device::activate_deferred_event_action func; + hpx::apply(device_id, event_id); } - - diff --git a/opencl/lcos/event.hpp b/opencl/lcos/event.hpp index c9cce8b7..bdc8d66c 100644 --- a/opencl/lcos/event.hpp +++ b/opencl/lcos/event.hpp @@ -18,38 +18,44 @@ #include -namespace hpx { namespace opencl { namespace lcos -{ - template ::type> - class event; -}}} +namespace hpx { +namespace opencl { +namespace lcos { +template ::type> +class event; +} +} // namespace opencl +} // namespace hpx /////////////////////////////////////////////////////////////////////////////// -namespace hpx { namespace opencl { namespace lcos { namespace detail -{ - /////////////////////////////////////////////////////////////////////////// - HPX_OPENCL_EXPORT void unregister_event( hpx::naming::id_type device_id, - hpx::naming::gid_type event_gid ); - - /////////////////////////////////////////////////////////////////////////// - // Zero-copy-Data Function - // - - // This function is here for zero-copy of read_to_userbuffer_remote - // Receive a zerocopy_buffer as result of the event. - // De-serialization of the zerocopy_buffer automatically writes - // the data to result_buffer (zerocopy_buffer knows the address of the - // result_buffer's data() member) - // Then set result_buffer as data of this event. +namespace hpx { +namespace opencl { +namespace lcos { +namespace detail { +/////////////////////////////////////////////////////////////////////////// +HPX_OPENCL_EXPORT void unregister_event(hpx::naming::id_type device_id, + hpx::naming::gid_type event_gid); + +/////////////////////////////////////////////////////////////////////////// +// Zero-copy-Data Function +// + +// This function is here for zero-copy of read_to_userbuffer_remote +// Receive a zerocopy_buffer as result of the event. +// De-serialization of the zerocopy_buffer automatically writes +// the data to result_buffer (zerocopy_buffer knows the address of the +// result_buffer's data() member) +// Then set result_buffer as data of this event. // template // void set_zerocopy_data( hpx::naming::id_type event_id, // hpx::opencl::lcos::zerocopy_buffer buf ) // { // typedef hpx::serialization::serialize_buffer buffer_type; // typedef hpx::lcos::base_lco_with_value lco_type; -// typedef typename hpx::opencl::lcos::event::shared_state_type +// typedef typename +// hpx::opencl::lcos::event::shared_state_type // event_ptr; // // // Resolve address of lco @@ -73,278 +79,237 @@ namespace hpx { namespace opencl { namespace lcos { namespace detail // template // struct set_zerocopy_data_action // : hpx::actions::make_direct_action< -// void (*)( hpx::naming::id_type, hpx::opencl::lcos::zerocopy_buffer), -// &set_zerocopy_data, +// void (*)( hpx::naming::id_type, +// hpx::opencl::lcos::zerocopy_buffer), &set_zerocopy_data, // set_zerocopy_data_action // > // {}; - /////////////////////////////////////////////////////////////////////////// - template - class event_data - : public hpx::lcos::detail::future_data - { - private: - typedef hpx::lcos::detail::future_data parent_type; - typedef typename parent_type::result_type result_type; - - public: - typedef typename parent_type::init_no_addref init_no_addref; - - event_data() {} - - event_data(init_no_addref no_addref) - : parent_type(no_addref) - { - } - - ~event_data() - { - HPX_ASSERT(device_id && event_id); - unregister_event( device_id, event_id.get_gid() ); - } - - void init(hpx::naming::id_type && device_id_) - { - device_id = std::move(device_id_); - } - - void set_id(hpx::id_type const& id) - { - event_id = id; - } - - public: - hpx::naming::gid_type get_device_gid() const - { - HPX_ASSERT(device_id); - return device_id.get_gid(); - } - - hpx::naming::id_type get_event_id() const - { - HPX_ASSERT(event_id); - return event_id; - } - - private: - hpx::naming::id_type device_id; - hpx::naming::id_type event_id; - }; - - /////////////////////////////////////////////////////////////////////////// - template <> - class event_data - : public hpx::lcos::detail::future_data - { - private: - typedef hpx::lcos::detail::future_data parent_type; - typedef parent_type::result_type result_type; - - public: - typedef typename parent_type::init_no_addref init_no_addref; - - event_data() - : is_armed(false) - { - } - - event_data(init_no_addref no_addref) - : is_armed(false) - , parent_type(no_addref) - { - } - - ~event_data() - { - HPX_ASSERT(device_id && event_id); - unregister_event( device_id, event_id.get_gid() ); - } - - void init(hpx::naming::id_type && device_id_) - { - device_id = std::move(device_id_); - } - - void set_id(hpx::id_type const& id) - { - event_id = id; - } - - private: - boost::atomic is_armed; - - HPX_OPENCL_EXPORT void arm(); - - public: - // Gets called by when_all, wait_all, etc - void execute_deferred(error_code& ec = throws) - { - if(!is_armed.exchange(true)) - arm(); - } - - // retrieving the value - result_type* get_result(error_code& ec = throws) - { - this->execute_deferred(); - return this->parent_type::get_result(ec); - } - - // wait for the value - void wait(error_code& ec = throws) - { - this->execute_deferred(); - this->parent_type::wait(ec); - } - - hpx::lcos::future_status - wait_until(hpx::util::steady_clock::time_point const& abs_time, - error_code& ec = throws) - { - this->execute_deferred(); - return this->parent_type::wait_until(abs_time, ec); - } - - public: - hpx::naming::gid_type get_device_gid() const - { - HPX_ASSERT(device_id); - return device_id.get_gid(); - } - - hpx::naming::id_type get_event_id() const - { - HPX_ASSERT(event_id); - return event_id; - } - - private: - hpx::naming::id_type device_id; - hpx::naming::id_type event_id; - }; -}}}} +/////////////////////////////////////////////////////////////////////////// +template +class event_data : public hpx::lcos::detail::future_data { + private: + typedef hpx::lcos::detail::future_data parent_type; + typedef typename parent_type::result_type result_type; + + public: + typedef typename parent_type::init_no_addref init_no_addref; + + event_data() {} + + event_data(init_no_addref no_addref) : parent_type(no_addref) {} + + ~event_data() { + HPX_ASSERT(device_id && event_id); + unregister_event(device_id, event_id.get_gid()); + } + + void init(hpx::naming::id_type&& device_id_) { + device_id = std::move(device_id_); + } + + void set_id(hpx::id_type const& id) { event_id = id; } + + public: + hpx::naming::gid_type get_device_gid() const { + HPX_ASSERT(device_id); + return device_id.get_gid(); + } + + hpx::naming::id_type get_event_id() const { + HPX_ASSERT(event_id); + return event_id; + } + + private: + hpx::naming::id_type device_id; + hpx::naming::id_type event_id; +}; + +/////////////////////////////////////////////////////////////////////////// +template <> +class event_data + : public hpx::lcos::detail::future_data { + private: + typedef hpx::lcos::detail::future_data parent_type; + typedef parent_type::result_type result_type; + + public: + typedef typename parent_type::init_no_addref init_no_addref; + + event_data() : is_armed(false) {} + + event_data(init_no_addref no_addref) + : is_armed(false), parent_type(no_addref) {} + + ~event_data() { + HPX_ASSERT(device_id && event_id); + unregister_event(device_id, event_id.get_gid()); + } + + void init(hpx::naming::id_type&& device_id_) { + device_id = std::move(device_id_); + } + + void set_id(hpx::id_type const& id) { event_id = id; } + + private: + boost::atomic is_armed; + + HPX_OPENCL_EXPORT void arm(); + + public: + // Gets called by when_all, wait_all, etc + void execute_deferred(error_code& ec = throws) { + if (!is_armed.exchange(true)) arm(); + } + + // retrieving the value + result_type* get_result(error_code& ec = throws) { + this->execute_deferred(); + return this->parent_type::get_result(ec); + } + + // wait for the value + void wait(error_code& ec = throws) { + this->execute_deferred(); + this->parent_type::wait(ec); + } + + hpx::lcos::future_status wait_until( + hpx::util::steady_clock::time_point const& abs_time, + error_code& ec = throws) { + this->execute_deferred(); + return this->parent_type::wait_until(abs_time, ec); + } + + public: + hpx::naming::gid_type get_device_gid() const { + HPX_ASSERT(device_id); + return device_id.get_gid(); + } + + hpx::naming::id_type get_event_id() const { + HPX_ASSERT(event_id); + return event_id; + } + + private: + hpx::naming::id_type device_id; + hpx::naming::id_type event_id; +}; +} // namespace detail +} // namespace lcos +} // namespace opencl +} // namespace hpx /////////////////////////////////////////////////////////////////////////////// -namespace hpx { namespace opencl { namespace lcos -{ - /////////////////////////////////////////////////////////////////////////// - template - class event - : hpx::lcos::detail::promise_base< - Result, RemoteResult, detail::event_data > - { - typedef hpx::lcos::detail::promise_base< - Result, RemoteResult, detail::event_data - > base_type; - - public: - typedef typename base_type::shared_state_type shared_state_type; - typedef Result result_type; - - /// Construct a new \a event instance. The supplied - /// \a thread will be notified as soon as the result of the - /// operation associated with this future instance has been - /// returned. - /// - /// \note The result of the requested operation is expected to - /// be returned as the first parameter using a - /// \a base_lco#set_value action. Any error has to be - /// reported using a \a base_lco::set_exception action. The - /// target for either of these actions has to be this - /// future instance (as it has to be sent along - /// with the action as the continuation parameter). - event(hpx::naming::id_type device_id) - : base_type() - { - this->shared_state_->init(std::move(device_id)); - } - - public: - /// Reset the event to allow to restart an asynchronous - /// operation. Allows any subsequent set_data operation to succeed. - void reset() - { - this->shared_state_->reset(); - this->future_retrieved_ = false; - } - - /// \brief Return the global id of this \a future instance - naming::id_type get_event_id() const - { - return this->shared_state_->get_event_id(); - } - - /// Return whether or not the data is available for this - /// \a event. - bool is_ready() const - { - return this->shared_state_->is_ready(); - } - - /// Return whether this instance has been properly initialized - using base_type::valid; - - using base_type::get_future; - }; - - /////////////////////////////////////////////////////////////////////////// - template <> - class event - : hpx::lcos::detail::promise_base< - void, hpx::util::unused_type, - detail::event_data > - { - typedef hpx::lcos::detail::promise_base< - void, hpx::util::unused_type, - detail::event_data - > base_type; - - public: - typedef base_type::shared_state_type shared_state_type; - typedef hpx::util::unused_type result_type; - - /// Construct a new \a event instance. The supplied - /// \a thread will be notified as soon as the result of the - /// operation associated with this future instance has been - /// returned. - /// - /// \note The result of the requested operation is expected to - /// be returned as the first parameter using a - /// \a base_lco#set_value action. Any error has to be - /// reported using a \a base_lco::set_exception action. The - /// target for either of these actions has to be this - /// future instance (as it has to be sent along - /// with the action as the continuation parameter). - event(hpx::naming::id_type device_id) - : base_type() - { - this->shared_state_->init(std::move(device_id)); - } - - public: - /// Reset the event to allow to restart an asynchronous - /// operation. Allows any subsequent set_data operation to succeed. - void reset() - { - this->shared_state_->reset(); - this->future_retrieved_ = false; - } - - /// \brief Return the global id of this \a future instance - naming::id_type get_event_id() const - { - return this->shared_state_->get_event_id(); - } - - /// Return whether or not the data is available for this \a event. - bool is_ready() const - { - return this->shared_state_->is_ready(); - } - - using base_type::get_future; - }; -}}} +namespace hpx { +namespace opencl { +namespace lcos { +/////////////////////////////////////////////////////////////////////////// +template +class event + : hpx::lcos::detail::promise_base< + Result, RemoteResult, detail::event_data > { + typedef hpx::lcos::detail::promise_base< + Result, RemoteResult, detail::event_data > + base_type; + + public: + typedef typename base_type::shared_state_type shared_state_type; + typedef Result result_type; + + /// Construct a new \a event instance. The supplied + /// \a thread will be notified as soon as the result of the + /// operation associated with this future instance has been + /// returned. + /// + /// \note The result of the requested operation is expected to + /// be returned as the first parameter using a + /// \a base_lco#set_value action. Any error has to be + /// reported using a \a base_lco::set_exception action. The + /// target for either of these actions has to be this + /// future instance (as it has to be sent along + /// with the action as the continuation parameter). + event(hpx::naming::id_type device_id) : base_type() { + this->shared_state_->init(std::move(device_id)); + } + + public: + /// Reset the event to allow to restart an asynchronous + /// operation. Allows any subsequent set_data operation to succeed. + void reset() { + this->shared_state_->reset(); + this->future_retrieved_ = false; + } + + /// \brief Return the global id of this \a future instance + naming::id_type get_event_id() const { + return this->shared_state_->get_event_id(); + } + + /// Return whether or not the data is available for this + /// \a event. + bool is_ready() const { return this->shared_state_->is_ready(); } + + /// Return whether this instance has been properly initialized + using base_type::valid; + + using base_type::get_future; +}; + +/////////////////////////////////////////////////////////////////////////// +template <> +class event + : hpx::lcos::detail::promise_base< + void, hpx::util::unused_type, + detail::event_data > { + typedef hpx::lcos::detail::promise_base< + void, hpx::util::unused_type, + detail::event_data > + base_type; + + public: + typedef base_type::shared_state_type shared_state_type; + typedef hpx::util::unused_type result_type; + + /// Construct a new \a event instance. The supplied + /// \a thread will be notified as soon as the result of the + /// operation associated with this future instance has been + /// returned. + /// + /// \note The result of the requested operation is expected to + /// be returned as the first parameter using a + /// \a base_lco#set_value action. Any error has to be + /// reported using a \a base_lco::set_exception action. The + /// target for either of these actions has to be this + /// future instance (as it has to be sent along + /// with the action as the continuation parameter). + event(hpx::naming::id_type device_id) : base_type() { + this->shared_state_->init(std::move(device_id)); + } + + public: + /// Reset the event to allow to restart an asynchronous + /// operation. Allows any subsequent set_data operation to succeed. + void reset() { + this->shared_state_->reset(); + this->future_retrieved_ = false; + } + + /// \brief Return the global id of this \a future instance + naming::id_type get_event_id() const { + return this->shared_state_->get_event_id(); + } + + /// Return whether or not the data is available for this \a event. + bool is_ready() const { return this->shared_state_->is_ready(); } + + using base_type::get_future; +}; +} // namespace lcos +} // namespace opencl +} // namespace hpx #endif diff --git a/opencl/lcos/zerocopy_buffer.hpp b/opencl/lcos/zerocopy_buffer.hpp index b6354622..9af6aba2 100644 --- a/opencl/lcos/zerocopy_buffer.hpp +++ b/opencl/lcos/zerocopy_buffer.hpp @@ -12,99 +12,97 @@ #include "../util/rect_props.hpp" -namespace hpx { namespace opencl { namespace lcos -{ - //---------------------------------------------------------------------------- - // A custom allocator which takes a pointer in its constructor and then returns - // this pointer in response to any allocate request. It is here to try to fool - // the hpx serialization into copying directly into a user provided buffer - // without copying from a result into another buffer. - // - class zerocopy_buffer - : public hpx::serialization::serialize_buffer - { - typedef hpx::serialization::serialize_buffer base_type; - - public: - zerocopy_buffer() BOOST_NOEXCEPT - : pointer_(0), size_x(0), size_y(0), size_z(0) - { - } - - zerocopy_buffer(std::uintptr_t p, std::size_t size, - hpx::serialization::serialize_buffer buffer) - : base_type(buffer), pointer_(p), size_x(size), - size_y(1), size_z(1), stride_y(0), stride_z(0) - { - HPX_ASSERT(this->base_type::size() == size_x * size_y * size_z); - } - - zerocopy_buffer( std::uintptr_t p, - const hpx::opencl::rect_props & rect, - std::size_t elem_size, - hpx::serialization::serialize_buffer buffer) - : base_type(buffer), - pointer_(p), - size_x(rect.size_x * elem_size), - size_y(rect.size_y), - size_z(rect.size_z), - stride_y(rect.dst_stride_y * elem_size), - stride_z(rect.dst_stride_z * elem_size) - { - // add origin position to pointer_. reduces network traffic - // as dst_x, dst_y and dst_z don't need to be transmitted. - pointer_ += rect.dst_x + stride_y*rect.dst_y + stride_z*rect.dst_z; - - HPX_ASSERT(this->base_type::size() == size_x * size_y * size_z); - } - - private: - // serialization support - friend class hpx::serialization::access; - - template - void load(Archive& ar, unsigned int const version) - { - // deliberately don't serialize base class - - // read size and address - ar >> size_x >> size_y >> size_z >> stride_y >> stride_z >> pointer_; - - // write data to address - char* dest_addr = reinterpret_cast(pointer_); - for(std::size_t z = 0; z < size_z; z++) - { - for(std::size_t y = 0; y < size_y; y++) - { - ar >> hpx::serialization::make_array( - dest_addr + y * stride_y + z * stride_z, - size_x); - } - } - } - - template - void save(Archive& ar, unsigned int const version) const - { - // deliberately don't serialize base class - - // send size, address and data - ar << size_x << size_y << size_z << stride_y << stride_z << pointer_; - ar << hpx::serialization::make_array( - this->base_type::data(), this->base_type::size() ); - } - - HPX_SERIALIZATION_SPLIT_MEMBER() - - private: - std::uintptr_t pointer_; - std::size_t size_x; - std::size_t size_y; - std::size_t size_z; - std::size_t stride_y; - std::size_t stride_z; - hpx::serialization::serialize_buffer buffer_; - }; -}}} +namespace hpx { +namespace opencl { +namespace lcos { +//---------------------------------------------------------------------------- +// A custom allocator which takes a pointer in its constructor and then returns +// this pointer in response to any allocate request. It is here to try to fool +// the hpx serialization into copying directly into a user provided buffer +// without copying from a result into another buffer. +// +class zerocopy_buffer : public hpx::serialization::serialize_buffer { + typedef hpx::serialization::serialize_buffer base_type; + + public: + zerocopy_buffer() BOOST_NOEXCEPT : pointer_(0), + size_x(0), + size_y(0), + size_z(0) {} + + zerocopy_buffer(std::uintptr_t p, std::size_t size, + hpx::serialization::serialize_buffer buffer) + : base_type(buffer), + pointer_(p), + size_x(size), + size_y(1), + size_z(1), + stride_y(0), + stride_z(0) { + HPX_ASSERT(this->base_type::size() == size_x * size_y * size_z); + } + + zerocopy_buffer(std::uintptr_t p, const hpx::opencl::rect_props& rect, + std::size_t elem_size, + hpx::serialization::serialize_buffer buffer) + : base_type(buffer), + pointer_(p), + size_x(rect.size_x * elem_size), + size_y(rect.size_y), + size_z(rect.size_z), + stride_y(rect.dst_stride_y * elem_size), + stride_z(rect.dst_stride_z * elem_size) { + // add origin position to pointer_. reduces network traffic + // as dst_x, dst_y and dst_z don't need to be transmitted. + pointer_ += rect.dst_x + stride_y * rect.dst_y + stride_z * rect.dst_z; + + HPX_ASSERT(this->base_type::size() == size_x * size_y * size_z); + } + + private: + // serialization support + friend class hpx::serialization::access; + + template + void load(Archive& ar, unsigned int const version) { + // deliberately don't serialize base class + + // read size and address + ar >> size_x >> size_y >> size_z >> stride_y >> stride_z >> pointer_; + + // write data to address + char* dest_addr = reinterpret_cast(pointer_); + for (std::size_t z = 0; z < size_z; z++) { + for (std::size_t y = 0; y < size_y; y++) { + ar >> hpx::serialization::make_array( + dest_addr + y * stride_y + z * stride_z, size_x); + } + } + } + + template + void save(Archive& ar, unsigned int const version) const { + // deliberately don't serialize base class + + // send size, address and data + ar << size_x << size_y << size_z << stride_y << stride_z << pointer_; + ar << hpx::serialization::make_array(this->base_type::data(), + this->base_type::size()); + } + + HPX_SERIALIZATION_SPLIT_MEMBER() + + private: + std::uintptr_t pointer_; + std::size_t size_x; + std::size_t size_y; + std::size_t size_z; + std::size_t stride_y; + std::size_t stride_z; + hpx::serialization::serialize_buffer buffer_; +}; +} // namespace lcos +} // namespace opencl +} // namespace hpx #endif diff --git a/opencl/program.cpp b/opencl/program.cpp index 132f1aec..c644c764 100644 --- a/opencl/program.cpp +++ b/opencl/program.cpp @@ -13,70 +13,48 @@ using hpx::opencl::program; -void program::ensure_device_id() const -{ - if (!device_gid) - { - typedef - hpx::opencl::server::program::get_parent_device_id_action - action_type; - HPX_ASSERT(this->get_id()); - device_gid = async(this->get_id()).get(); - } +void program::ensure_device_id() const { + if (!device_gid) { + typedef hpx::opencl::server::program::get_parent_device_id_action + action_type; + HPX_ASSERT(this->get_id()); + device_gid = async(this->get_id()).get(); + } } -void -program::build() const -{ - build_async("").get(); -} +void program::build() const { build_async("").get(); } -void -program::build(std::string build_options) const -{ - build_async(std::move(build_options)).get(); +void program::build(std::string build_options) const { + build_async(std::move(build_options)).get(); } -hpx::lcos::future -program::build_async() const -{ - return build_async(""); -} +hpx::lcos::future program::build_async() const { return build_async(""); } -hpx::lcos::future -program::build_async(std::string build_options) const -{ - HPX_ASSERT(this->get_id()); +hpx::lcos::future program::build_async(std::string build_options) const { + HPX_ASSERT(this->get_id()); - typedef hpx::opencl::server::program::build_action func; + typedef hpx::opencl::server::program::build_action func; - return async(this->get_id(), build_options); + return async(this->get_id(), build_options); } hpx::lcos::future > -program::get_binary() const -{ - HPX_ASSERT(this->get_id()); +program::get_binary() const { + HPX_ASSERT(this->get_id()); - typedef hpx::opencl::server::program::get_binary_action func; + typedef hpx::opencl::server::program::get_binary_action func; - return async(this->get_id()); + return async(this->get_id()); } -hpx::opencl::kernel -program::create_kernel(std::string kernel_name) const -{ +hpx::opencl::kernel program::create_kernel(std::string kernel_name) const { + HPX_ASSERT(this->get_id()); - HPX_ASSERT(this->get_id()); - - typedef hpx::opencl::server::program::create_kernel_action func; - - hpx::future kernel_server = - hpx::async(this->get_id(), kernel_name); + typedef hpx::opencl::server::program::create_kernel_action func; - ensure_device_id(); - return kernel(std::move(kernel_server), device_gid); + hpx::future kernel_server = + hpx::async(this->get_id(), kernel_name); + ensure_device_id(); + return kernel(std::move(kernel_server), device_gid); } - - diff --git a/opencl/program.hpp b/opencl/program.hpp index 611cf1fb..d4aa7d5e 100644 --- a/opencl/program.hpp +++ b/opencl/program.hpp @@ -20,117 +20,111 @@ namespace hpx { namespace opencl { - - ////////////////////////////////////// - /// @brief An OpenCL program consisting of one or multiple kernels. - /// - /// Every program belongs to one \ref device. - /// - class HPX_OPENCL_EXPORT program - : public hpx::components::client_base - { - - typedef hpx::components::client_base base_type; - - public: - // Empty constructor, necessary for hpx purposes - program(){} - - // Constructor - program(hpx::shared_future const& gid, - hpx::naming::id_type device_gid_) - : base_type(gid), device_gid(std::move(device_gid_)) - {} - - program(hpx::future && gid) - : base_type(std::move(gid)), device_gid() - {} - - // initialization - - - // /////////////////////////////////////////////// - // Exposed Component functionality - // - - /** - * @brief Builds the program, non-blocking. - * - * @return A future that will trigger upon build completion. - */ - hpx::lcos::future build_async() const; - - /** - * @brief Builds the program, non-blocking. - * - * @param build_options A string with specific build options.
- * Look at the official - * OpenCL Reference - * for further information. - * @return A future that will trigger upon build completion. - */ - hpx::lcos::future build_async(std::string build_options) const; - - /** - * @brief Builds the program, blocking. - * - * @return A future that will trigger upon build completion. - */ - void build() const; - - /** - * @brief Builds the program, blocking. - * - * @param build_options A string with specific build options.
- * Look at the official - * OpenCL Reference - * for further information. - * @return A future that will trigger upon build completion. - */ - void build(std::string build_options) const; - - /** - * @brief Retrieves the binary of a built program. - * It can be used to create programs with - * device::create_program_with_binary(). - * - * @return A future to the binary code - */ - hpx::lcos::future > - get_binary() const; - - /** - * @brief Creates a kernel. - * - * The kernel with the name kernel_name has to be defined and - * implemented in the program source code. - * - * @param kernel_name The name of the kernel to be created - * @return A kernel object. - */ - hpx::opencl::kernel - create_kernel(std::string kernel_name) const; - - protected: - void ensure_device_id() const; - - private: - mutable hpx::naming::id_type device_gid; - - private: - // serialization support - friend class hpx::serialization::access; - - template - void serialize(Archive & ar, unsigned) - { - HPX_ASSERT(device_gid); - ar & hpx::serialization::base_object(*this); - ar & device_gid; - } - - }; - -}} +////////////////////////////////////// +/// @brief An OpenCL program consisting of one or multiple kernels. +/// +/// Every program belongs to one \ref device. +/// +class HPX_OPENCL_EXPORT program + : public hpx::components::client_base { + typedef hpx::components::client_base base_type; + + public: + // Empty constructor, necessary for hpx purposes + program() {} + + // Constructor + program(hpx::shared_future const& gid, + hpx::naming::id_type device_gid_) + : base_type(gid), device_gid(std::move(device_gid_)) {} + + program(hpx::future&& gid) + : base_type(std::move(gid)), device_gid() {} + + // initialization + + // /////////////////////////////////////////////// + // Exposed Component functionality + // + + /** + * @brief Builds the program, non-blocking. + * + * @return A future that will trigger upon build completion. + */ + hpx::lcos::future build_async() const; + + /** + * @brief Builds the program, non-blocking. + * + * @param build_options A string with specific build options.
+ * Look at the official + * OpenCL + * Reference for further information. + * @return A future that will trigger upon build completion. + */ + hpx::lcos::future build_async(std::string build_options) const; + + /** + * @brief Builds the program, blocking. + * + * @return A future that will trigger upon build completion. + */ + void build() const; + + /** + * @brief Builds the program, blocking. + * + * @param build_options A string with specific build options.
+ * Look at the official + * OpenCL + * Reference for further information. + * @return A future that will trigger upon build completion. + */ + void build(std::string build_options) const; + + /** + * @brief Retrieves the binary of a built program. + * It can be used to create programs with + * device::create_program_with_binary(). + * + * @return A future to the binary code + */ + hpx::lcos::future > get_binary() + const; + + /** + * @brief Creates a kernel. + * + * The kernel with the name kernel_name has to be defined and + * implemented in the program source code. + * + * @param kernel_name The name of the kernel to be created + * @return A kernel object. + */ + hpx::opencl::kernel create_kernel(std::string kernel_name) const; + + protected: + void ensure_device_id() const; + + private: + mutable hpx::naming::id_type device_gid; + + private: + // serialization support + friend class hpx::serialization::access; + + template + void serialize(Archive& ar, unsigned) { + HPX_ASSERT(device_gid); + ar& hpx::serialization::base_object(*this); + ar& device_gid; + } +}; + +} // namespace opencl +} // namespace hpx #endif diff --git a/opencl/server/buffer.hpp b/opencl/server/buffer.hpp index 938494d6..b0ba7ee2 100644 --- a/opencl/server/buffer.hpp +++ b/opencl/server/buffer.hpp @@ -7,7 +7,6 @@ #ifndef HPX_OPENCL_SERVER_BUFFER_HPP #define HPX_OPENCL_SERVER_BUFFER_HPP - #include #include @@ -22,232 +21,199 @@ #include "util/server_definitions.hpp" #include "../util/rect_props.hpp" -namespace hpx { namespace opencl { namespace server { - - // ///////////////////////////////////////////////////// - // This class represents an opencl buffer. - - class HPX_OPENCL_EXPORT buffer - : public hpx::components::managed_component_base - { - public: - - // Constructor - buffer(); - // Destructor - ~buffer(); - - /////////////////////////////////////////////////// - /// Local functions - /// - void init(hpx::naming::id_type device_id, cl_mem_flags flags, - std::size_t size); - - cl_mem get_cl_mem(); - - ////////////////////////////////////////////////// - /// Exposed functionality of this component - /// - // Returns the size of the buffer - std::size_t size(); - - // Returns the parent device - hpx::naming::id_type get_parent_device_id(); - - // Writes to the buffer - template - void enqueue_write( hpx::naming::id_type && event_gid, - std::size_t offset, - hpx::serialization::serialize_buffer data, - std::vector && dependencies ); - - // Writes to the buffer - template - void enqueue_write_rect( - hpx::naming::id_type && event_gid, - hpx::opencl::rect_props && rect_properties, - hpx::serialization::serialize_buffer data, - std::vector && dependencies ); - - // Reads from the buffer - void enqueue_read( hpx::naming::id_type && event_gid, - std::size_t offset, - std::size_t size, - std::vector && dependencies ); - - // Reads from the buffer. Needed for direct copy to user-supplied buffer - template - void enqueue_read_to_userbuffer_remote( - hpx::naming::id_type && event_gid, - std::size_t offset, - std::size_t size, - std::uintptr_t remote_data_addr, - std::vector && dependencies ); - - // Reads from the buffer. Needed for direct copy to user-supplied buffer - template - void enqueue_read_to_userbuffer_local( - hpx::naming::id_type && event_gid, - std::size_t offset, - hpx::serialization::serialize_buffer data, - std::vector && dependencies ); - - // Reads from the buffer. Needed for direct copy to user-supplied buffer - template - void enqueue_read_to_userbuffer_rect_remote( - hpx::naming::id_type && event_gid, - hpx::opencl::rect_props && rect_properties, - std::uintptr_t remote_data_addr, - std::vector && dependencies ); - - // Reads from the buffer. Needed for direct copy to user-supplied buffer - template - void enqueue_read_to_userbuffer_rect_local( - hpx::naming::id_type && event_gid, - hpx::opencl::rect_props && rect_properties, - hpx::serialization::serialize_buffer data, - std::vector && dependencies ); - - // Copies data from this buffer to a remote buffer - void enqueue_send( hpx::naming::id_type dst, - hpx::naming::id_type && src_event, - hpx::naming::id_type && dst_event, - std::size_t src_offset, - std::size_t dst_offset, - std::size_t size, - std::vector && dependencies, - std::vector && - dependency_devices ); - - // Copies data from this buffer to a remote buffer - void enqueue_send_rect( - hpx::naming::id_type dst, - hpx::naming::id_type && src_event, - hpx::naming::id_type && dst_event, - rect_props rect_properties, - std::vector && dependencies, - std::vector && - dependency_devices ); - - // Different versions of enqueue_send, optimized for different - // runtime scenarios - void send_bruteforce( - hpx::naming::id_type && dst, - hpx::naming::id_type && src_event, - hpx::naming::id_type && dst_event, - std::size_t src_offset, - std::size_t dst_offset, - std::size_t size, - std::vector && src_dependencies, - std::vector && dst_dependencies ); - void send_direct( - hpx::naming::id_type && dst, - std::shared_ptr && dst_buffer, - hpx::naming::id_type && src_event, - hpx::naming::id_type && dst_event, - std::size_t src_offset, - std::size_t dst_offset, +namespace hpx { +namespace opencl { +namespace server { + +// ///////////////////////////////////////////////////// +// This class represents an opencl buffer. + +class HPX_OPENCL_EXPORT buffer + : public hpx::components::managed_component_base { + public: + // Constructor + buffer(); + // Destructor + ~buffer(); + + /////////////////////////////////////////////////// + /// Local functions + /// + void init(hpx::naming::id_type device_id, cl_mem_flags flags, + std::size_t size); + + cl_mem get_cl_mem(); + + ////////////////////////////////////////////////// + /// Exposed functionality of this component + /// + // Returns the size of the buffer + std::size_t size(); + + // Returns the parent device + hpx::naming::id_type get_parent_device_id(); + + // Writes to the buffer + template + void enqueue_write(hpx::naming::id_type &&event_gid, std::size_t offset, + hpx::serialization::serialize_buffer data, + std::vector &&dependencies); + + // Writes to the buffer + template + void enqueue_write_rect(hpx::naming::id_type &&event_gid, + hpx::opencl::rect_props &&rect_properties, + hpx::serialization::serialize_buffer data, + std::vector &&dependencies); + + // Reads from the buffer + void enqueue_read(hpx::naming::id_type &&event_gid, std::size_t offset, std::size_t size, - std::vector && src_dependencies, - std::vector && dst_dependencies ); - void send_rect_bruteforce( - hpx::naming::id_type && dst, - hpx::naming::id_type && src_event, - hpx::naming::id_type && dst_event, - rect_props && rect_properties, - std::vector && src_dependencies, - std::vector && dst_dependencies ); - void send_rect_direct( - hpx::naming::id_type && dst, - std::shared_ptr && dst_buffer, - hpx::naming::id_type && src_event, - hpx::naming::id_type && dst_event, - rect_props && rect_properties, - std::vector && src_dependencies, - std::vector && dst_dependencies ); - - - HPX_DEFINE_COMPONENT_ACTION(buffer, size); - HPX_DEFINE_COMPONENT_ACTION(buffer, get_parent_device_id); - HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_read); - HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_send); - HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_send_rect); - - // Actions with template arguments (see enqueue_write<>() above) require - // special type definitions. The simplest way to define such an action type - // is by deriving from the HPX facility make_action: - template - struct enqueue_write_action - : hpx::actions::make_action, - std::vector &&), - &buffer::template enqueue_write, enqueue_write_action > - {}; - template - struct enqueue_write_rect_action + std::vector &&dependencies); + + // Reads from the buffer. Needed for direct copy to user-supplied buffer + template + void enqueue_read_to_userbuffer_remote( + hpx::naming::id_type &&event_gid, std::size_t offset, std::size_t size, + std::uintptr_t remote_data_addr, + std::vector &&dependencies); + + // Reads from the buffer. Needed for direct copy to user-supplied buffer + template + void enqueue_read_to_userbuffer_local( + hpx::naming::id_type &&event_gid, std::size_t offset, + hpx::serialization::serialize_buffer data, + std::vector &&dependencies); + + // Reads from the buffer. Needed for direct copy to user-supplied buffer + template + void enqueue_read_to_userbuffer_rect_remote( + hpx::naming::id_type &&event_gid, + hpx::opencl::rect_props &&rect_properties, + std::uintptr_t remote_data_addr, + std::vector &&dependencies); + + // Reads from the buffer. Needed for direct copy to user-supplied buffer + template + void enqueue_read_to_userbuffer_rect_local( + hpx::naming::id_type &&event_gid, + hpx::opencl::rect_props &&rect_properties, + hpx::serialization::serialize_buffer data, + std::vector &&dependencies); + + // Copies data from this buffer to a remote buffer + void enqueue_send(hpx::naming::id_type dst, hpx::naming::id_type &&src_event, + hpx::naming::id_type &&dst_event, std::size_t src_offset, + std::size_t dst_offset, std::size_t size, + std::vector &&dependencies, + std::vector &&dependency_devices); + + // Copies data from this buffer to a remote buffer + void enqueue_send_rect( + hpx::naming::id_type dst, hpx::naming::id_type &&src_event, + hpx::naming::id_type &&dst_event, rect_props rect_properties, + std::vector &&dependencies, + std::vector &&dependency_devices); + + // Different versions of enqueue_send, optimized for different + // runtime scenarios + void send_bruteforce(hpx::naming::id_type &&dst, + hpx::naming::id_type &&src_event, + hpx::naming::id_type &&dst_event, std::size_t src_offset, + std::size_t dst_offset, std::size_t size, + std::vector &&src_dependencies, + std::vector &&dst_dependencies); + void send_direct(hpx::naming::id_type &&dst, + std::shared_ptr &&dst_buffer, + hpx::naming::id_type &&src_event, + hpx::naming::id_type &&dst_event, std::size_t src_offset, + std::size_t dst_offset, std::size_t size, + std::vector &&src_dependencies, + std::vector &&dst_dependencies); + void send_rect_bruteforce( + hpx::naming::id_type &&dst, hpx::naming::id_type &&src_event, + hpx::naming::id_type &&dst_event, rect_props &&rect_properties, + std::vector &&src_dependencies, + std::vector &&dst_dependencies); + void send_rect_direct( + hpx::naming::id_type &&dst, + std::shared_ptr &&dst_buffer, + hpx::naming::id_type &&src_event, hpx::naming::id_type &&dst_event, + rect_props &&rect_properties, + std::vector &&src_dependencies, + std::vector &&dst_dependencies); + + HPX_DEFINE_COMPONENT_ACTION(buffer, size); + HPX_DEFINE_COMPONENT_ACTION(buffer, get_parent_device_id); + HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_read); + HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_send); + HPX_DEFINE_COMPONENT_ACTION(buffer, enqueue_send_rect); + + // Actions with template arguments (see enqueue_write<>() above) require + // special type definitions. The simplest way to define such an action type + // is by deriving from the HPX facility make_action: + template + struct enqueue_write_action + : hpx::actions::make_action< + void (buffer::*)(hpx::naming::id_type &&, std::size_t, + hpx::serialization::serialize_buffer, + std::vector &&), + &buffer::template enqueue_write, enqueue_write_action > {}; + template + struct enqueue_write_rect_action : hpx::actions::make_action, - std::vector &&), - &buffer::template enqueue_write_rect, enqueue_write_rect_action > - {}; - template - struct enqueue_read_to_userbuffer_remote_action - : hpx::actions::make_action &&), + hpx::naming::id_type &&, + hpx::opencl::rect_props &&, + hpx::serialization::serialize_buffer, + std::vector &&), + &buffer::template enqueue_write_rect, + enqueue_write_rect_action > {}; + template + struct enqueue_read_to_userbuffer_remote_action + : hpx::actions::make_action< + void (buffer::*)(hpx::naming::id_type &&, std::size_t, std::size_t, + std::uintptr_t, + std::vector &&), &buffer::template enqueue_read_to_userbuffer_remote, - enqueue_read_to_userbuffer_remote_action > - {}; - template - struct enqueue_read_to_userbuffer_local_action - : hpx::actions::make_action, - std::vector &&), + enqueue_read_to_userbuffer_remote_action > {}; + template + struct enqueue_read_to_userbuffer_local_action + : hpx::actions::make_action< + void (buffer::*)(hpx::naming::id_type &&, std::size_t, + hpx::serialization::serialize_buffer, + std::vector &&), &buffer::template enqueue_read_to_userbuffer_local, - enqueue_read_to_userbuffer_local_action > - {}; - template - struct enqueue_read_to_userbuffer_rect_remote_action - : hpx::actions::make_action &&), + enqueue_read_to_userbuffer_local_action > {}; + template + struct enqueue_read_to_userbuffer_rect_remote_action + : hpx::actions::make_action< + void (buffer::*)(hpx::naming::id_type &&, + hpx::opencl::rect_props &&, std::uintptr_t, + std::vector &&), &buffer::template enqueue_read_to_userbuffer_rect_remote, - enqueue_read_to_userbuffer_rect_remote_action > - {}; - template - struct enqueue_read_to_userbuffer_rect_local_action - : hpx::actions::make_action, - std::vector &&), + enqueue_read_to_userbuffer_rect_remote_action > {}; + template + struct enqueue_read_to_userbuffer_rect_local_action + : hpx::actions::make_action< + void (buffer::*)(hpx::naming::id_type &&, + hpx::opencl::rect_props &&, + hpx::serialization::serialize_buffer, + std::vector &&), &buffer::template enqueue_read_to_userbuffer_rect_local, - enqueue_read_to_userbuffer_rect_local_action > - {}; - - - ////////////////////////////////////////////////// - // Private Member Variables - // - private: - std::shared_ptr parent_device; - cl_mem device_mem; - hpx::naming::id_type parent_device_id; + enqueue_read_to_userbuffer_rect_local_action > {}; - }; + ////////////////////////////////////////////////// + // Private Member Variables + // + private: + std::shared_ptr parent_device; + cl_mem device_mem; + hpx::naming::id_type parent_device_id; +}; -}}} +} // namespace server +} // namespace opencl +} // namespace hpx //[opencl_management_registration_declarations HPX_REGISTER_ACTION_DECLARATION( @@ -260,16 +226,15 @@ HPX_OPENCL_REGISTER_ACTION_DECLARATION(buffer, enqueue_send_rect); HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(buffer, enqueue_write); HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(buffer, enqueue_write_rect); HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(buffer, - enqueue_read_to_userbuffer_local); + enqueue_read_to_userbuffer_local); HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(buffer, - enqueue_read_to_userbuffer_remote); -HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(buffer, - enqueue_read_to_userbuffer_rect_local); -HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(buffer, - enqueue_read_to_userbuffer_rect_remote); + enqueue_read_to_userbuffer_remote); +HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK( + buffer, enqueue_read_to_userbuffer_rect_local); +HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK( + buffer, enqueue_read_to_userbuffer_rect_remote); //] - //////////////////////////////////////////////////////////////////////////////// // IMPLEMENTATIONS // @@ -281,338 +246,296 @@ HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(buffer, #include "util/event_dependencies.hpp" #include "device.hpp" - template -void -hpx::opencl::server::buffer::enqueue_write( - hpx::naming::id_type && event_gid, - std::size_t offset, - hpx::serialization::serialize_buffer data, - std::vector && dependencies ){ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - cl_int err; - cl_event return_event; +void hpx::opencl::server::buffer::enqueue_write( + hpx::naming::id_type &&event_gid, std::size_t offset, + hpx::serialization::serialize_buffer data, + std::vector &&dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - // retrieve the dependency cl_events - util::event_dependencies events( dependencies, parent_device.get() ); + cl_int err; + cl_event return_event; - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_write_command_queue(); + // retrieve the dependency cl_events + util::event_dependencies events(dependencies, parent_device.get()); - // run the OpenCL-call - err = clEnqueueWriteBuffer( command_queue, device_mem, CL_FALSE, offset, - data.size()*sizeof(T), data.data(), - static_cast(events.size()), - events.get_cl_events(), &return_event ); - cl_ensure(err, "clEnqueueWriteBuffer()"); + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_write_command_queue(); - // register the data to prevent deallocation - parent_device->put_event_data(return_event, data); + // run the OpenCL-call + err = clEnqueueWriteBuffer(command_queue, device_mem, CL_FALSE, offset, + data.size() * sizeof(T), data.data(), + static_cast(events.size()), + events.get_cl_events(), &return_event); + cl_ensure(err, "clEnqueueWriteBuffer()"); - // register the cl_event to the client event - parent_device->register_event(event_gid, return_event); + // register the data to prevent deallocation + parent_device->put_event_data(return_event, data); + // register the cl_event to the client event + parent_device->register_event(event_gid, return_event); } - template -void -hpx::opencl::server::buffer::enqueue_write_rect( - hpx::naming::id_type && event_gid, - hpx::opencl::rect_props && rect_properties, - hpx::serialization::serialize_buffer data, - std::vector && dependencies ){ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - cl_int err; - cl_event return_event; - - // retrieve the dependency cl_events - util::event_dependencies events( dependencies, parent_device.get() ); - - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_write_command_queue(); - - // prepare arguments for OpenCL call - std::size_t host_origin[] = { rect_properties.src_x * sizeof(T), - rect_properties.src_y, - rect_properties.src_z }; - std::size_t buffer_origin[] = { rect_properties.dst_x * sizeof(T), - rect_properties.dst_y, - rect_properties.dst_z }; - std::size_t region[] = { rect_properties.size_x * sizeof(T), - rect_properties.size_y, - rect_properties.size_z }; - - HPX_ASSERT(data.size() > - (rect_properties.size_x + rect_properties.src_x - 1) - + (rect_properties.size_y + rect_properties.src_y - 1) - * rect_properties.src_stride_y - + (rect_properties.size_z + rect_properties.src_z - 1) - * rect_properties.src_stride_z ); - - // run the OpenCL-call - err = clEnqueueWriteBufferRect( command_queue, device_mem, CL_FALSE, - buffer_origin, host_origin, region, - rect_properties.dst_stride_y * sizeof(T), - rect_properties.dst_stride_z * sizeof(T), - rect_properties.src_stride_y * sizeof(T), - rect_properties.src_stride_z * sizeof(T), - data.data(), - static_cast(events.size()), - events.get_cl_events(), &return_event ); - cl_ensure(err, "clEnqueueWriteBufferRect()"); - - // register the data to prevent deallocation - parent_device->put_event_data(return_event, data); - - // register the cl_event to the client event - parent_device->register_event(event_gid, return_event); - +void hpx::opencl::server::buffer::enqueue_write_rect( + hpx::naming::id_type &&event_gid, hpx::opencl::rect_props &&rect_properties, + hpx::serialization::serialize_buffer data, + std::vector &&dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + cl_int err; + cl_event return_event; + + // retrieve the dependency cl_events + util::event_dependencies events(dependencies, parent_device.get()); + + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_write_command_queue(); + + // prepare arguments for OpenCL call + std::size_t host_origin[] = {rect_properties.src_x * sizeof(T), + rect_properties.src_y, rect_properties.src_z}; + std::size_t buffer_origin[] = {rect_properties.dst_x * sizeof(T), + rect_properties.dst_y, rect_properties.dst_z}; + std::size_t region[] = {rect_properties.size_x * sizeof(T), + rect_properties.size_y, rect_properties.size_z}; + + HPX_ASSERT(data.size() > + (rect_properties.size_x + rect_properties.src_x - 1) + + (rect_properties.size_y + rect_properties.src_y - 1) * + rect_properties.src_stride_y + + (rect_properties.size_z + rect_properties.src_z - 1) * + rect_properties.src_stride_z); + + // run the OpenCL-call + err = clEnqueueWriteBufferRect( + command_queue, device_mem, CL_FALSE, buffer_origin, host_origin, region, + rect_properties.dst_stride_y * sizeof(T), + rect_properties.dst_stride_z * sizeof(T), + rect_properties.src_stride_y * sizeof(T), + rect_properties.src_stride_z * sizeof(T), data.data(), + static_cast(events.size()), events.get_cl_events(), + &return_event); + cl_ensure(err, "clEnqueueWriteBufferRect()"); + + // register the data to prevent deallocation + parent_device->put_event_data(return_event, data); + + // register the cl_event to the client event + parent_device->register_event(event_gid, return_event); } - template -void -hpx::opencl::server::buffer::enqueue_read_to_userbuffer_local( - hpx::naming::id_type && event_gid, - std::size_t offset, - hpx::serialization::serialize_buffer data, - std::vector && dependencies ){ +void hpx::opencl::server::buffer::enqueue_read_to_userbuffer_local( + hpx::naming::id_type &&event_gid, std::size_t offset, + hpx::serialization::serialize_buffer data, + std::vector &&dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + cl_int err; + cl_event return_event; - cl_int err; - cl_event return_event; + // retrieve the dependency cl_events + util::event_dependencies events(dependencies, parent_device.get()); - // retrieve the dependency cl_events - util::event_dependencies events( dependencies, parent_device.get() ); + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_read_command_queue(); - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_read_command_queue(); + // run the OpenCL-call + err = clEnqueueReadBuffer(command_queue, device_mem, CL_FALSE, offset, + data.size() * sizeof(T), data.data(), + static_cast(events.size()), + events.get_cl_events(), &return_event); + cl_ensure(err, "clEnqueueReadBuffer()"); - // run the OpenCL-call - err = clEnqueueReadBuffer( command_queue, device_mem, CL_FALSE, offset, - data.size()*sizeof(T), data.data(), - static_cast(events.size()), - events.get_cl_events(), &return_event ); - cl_ensure(err, "clEnqueueReadBuffer()"); + // register the data to prevent deallocation + parent_device->put_event_data(return_event, data); - // register the data to prevent deallocation - parent_device->put_event_data(return_event, data); - - // register the cl_event to the client event - parent_device->register_event(event_gid, return_event); - - // arm the future. ! this blocks. - parent_device->activate_deferred_event_with_data(event_gid); + // register the cl_event to the client event + parent_device->register_event(event_gid, return_event); + // arm the future. ! this blocks. + parent_device->activate_deferred_event_with_data(event_gid); } template -void -hpx::opencl::server::buffer::enqueue_read_to_userbuffer_remote( - hpx::naming::id_type && event_gid, - std::size_t offset, - std::size_t size, +void hpx::opencl::server::buffer::enqueue_read_to_userbuffer_remote( + hpx::naming::id_type &&event_gid, std::size_t offset, std::size_t size, std::uintptr_t remote_data_addr, - std::vector && dependencies ){ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + std::vector &&dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - typedef hpx::serialization::serialize_buffer buffer_type; + typedef hpx::serialization::serialize_buffer buffer_type; - cl_int err; - cl_event return_event; + cl_int err; + cl_event return_event; - // retrieve the dependency cl_events - util::event_dependencies events( dependencies, parent_device.get() ); + // retrieve the dependency cl_events + util::event_dependencies events(dependencies, parent_device.get()); - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_read_command_queue(); + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_read_command_queue(); - // create new target buffer - buffer_type data( size ); + // create new target buffer + buffer_type data(size); - // run the OpenCL-call - err = clEnqueueReadBuffer( command_queue, device_mem, CL_FALSE, offset, - data.size(), data.data(), - static_cast(events.size()), - events.get_cl_events(), &return_event ); - cl_ensure(err, "clEnqueueReadBuffer()"); + // run the OpenCL-call + err = clEnqueueReadBuffer(command_queue, device_mem, CL_FALSE, offset, + data.size(), data.data(), + static_cast(events.size()), + events.get_cl_events(), &return_event); + cl_ensure(err, "clEnqueueReadBuffer()"); - // put_event_data not necessary as we locally keep the buffer alive until - // the event triggered + // put_event_data not necessary as we locally keep the buffer alive until + // the event triggered - // also important: the cl_event does not get destroyed inside of - // the event map of parent_device, because we keep the lcos::event - // alive as we have an event_id + // also important: the cl_event does not get destroyed inside of + // the event map of parent_device, because we keep the lcos::event + // alive as we have an event_id - // register the cl_event to the client event - parent_device->register_event(event_gid, return_event); + // register the cl_event to the client event + parent_device->register_event(event_gid, return_event); - // prepare a zero-copy buffer - hpx::opencl::lcos::zerocopy_buffer zerocopy_buffer( remote_data_addr, - size, - data ); + // prepare a zero-copy buffer + hpx::opencl::lcos::zerocopy_buffer zerocopy_buffer(remote_data_addr, size, + data); - // wait for the event to finish - parent_device->wait_for_cl_event(return_event); + // wait for the event to finish + parent_device->wait_for_cl_event(return_event); - // send the zerocopy_buffer to the lcos::event -// typedef hpx::opencl::lcos::detail::set_zerocopy_data_action -// set_data_func; -// hpx::apply_colocated(event_gid, event_gid, zerocopy_buffer); + // send the zerocopy_buffer to the lcos::event + // typedef hpx::opencl::lcos::detail::set_zerocopy_data_action + // set_data_func; + // hpx::apply_colocated(event_gid, event_gid, + // zerocopy_buffer); - hpx::set_lco_value(event_gid, std::move(zerocopy_buffer)); + hpx::set_lco_value(event_gid, std::move(zerocopy_buffer)); } template -void -hpx::opencl::server::buffer::enqueue_read_to_userbuffer_rect_local( - hpx::naming::id_type && event_gid, - hpx::opencl::rect_props && rect_properties, - hpx::serialization::serialize_buffer data, - std::vector && dependencies ){ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - cl_int err; - cl_event return_event; - - // retrieve the dependency cl_events - util::event_dependencies events( dependencies, parent_device.get() ); - - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_read_command_queue(); - - // prepare arguments for OpenCL call - std::size_t buffer_origin[] = { rect_properties.src_x * sizeof(T), - rect_properties.src_y, - rect_properties.src_z }; - std::size_t host_origin[] = { rect_properties.dst_x * sizeof(T), - rect_properties.dst_y, - rect_properties.dst_z }; - std::size_t region[] = { rect_properties.size_x * sizeof(T), - rect_properties.size_y, - rect_properties.size_z }; - - HPX_ASSERT(data.size() > - (rect_properties.size_x + rect_properties.dst_x - 1) - + (rect_properties.size_y + rect_properties.dst_y - 1) - * rect_properties.dst_stride_y - + (rect_properties.size_z + rect_properties.dst_z - 1) - * rect_properties.dst_stride_z ); - - // run the OpenCL-call - err = clEnqueueReadBufferRect( - command_queue, device_mem, CL_FALSE, - buffer_origin, host_origin, region, - rect_properties.src_stride_y * sizeof(T), - rect_properties.src_stride_z * sizeof(T), - rect_properties.dst_stride_y * sizeof(T), - rect_properties.dst_stride_z * sizeof(T), - data.data(), - static_cast(events.size()), - events.get_cl_events(), &return_event ); - cl_ensure(err, "clEnqueueReadBufferRect()"); - - // register the data to prevent deallocation - parent_device->put_event_data(return_event, data); - - // register the cl_event to the client event - parent_device->register_event(event_gid, return_event); - - // arm the future. ! this blocks. - parent_device->activate_deferred_event_with_data(event_gid); - +void hpx::opencl::server::buffer::enqueue_read_to_userbuffer_rect_local( + hpx::naming::id_type &&event_gid, hpx::opencl::rect_props &&rect_properties, + hpx::serialization::serialize_buffer data, + std::vector &&dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + cl_int err; + cl_event return_event; + + // retrieve the dependency cl_events + util::event_dependencies events(dependencies, parent_device.get()); + + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_read_command_queue(); + + // prepare arguments for OpenCL call + std::size_t buffer_origin[] = {rect_properties.src_x * sizeof(T), + rect_properties.src_y, rect_properties.src_z}; + std::size_t host_origin[] = {rect_properties.dst_x * sizeof(T), + rect_properties.dst_y, rect_properties.dst_z}; + std::size_t region[] = {rect_properties.size_x * sizeof(T), + rect_properties.size_y, rect_properties.size_z}; + + HPX_ASSERT(data.size() > + (rect_properties.size_x + rect_properties.dst_x - 1) + + (rect_properties.size_y + rect_properties.dst_y - 1) * + rect_properties.dst_stride_y + + (rect_properties.size_z + rect_properties.dst_z - 1) * + rect_properties.dst_stride_z); + + // run the OpenCL-call + err = clEnqueueReadBufferRect( + command_queue, device_mem, CL_FALSE, buffer_origin, host_origin, region, + rect_properties.src_stride_y * sizeof(T), + rect_properties.src_stride_z * sizeof(T), + rect_properties.dst_stride_y * sizeof(T), + rect_properties.dst_stride_z * sizeof(T), data.data(), + static_cast(events.size()), events.get_cl_events(), + &return_event); + cl_ensure(err, "clEnqueueReadBufferRect()"); + + // register the data to prevent deallocation + parent_device->put_event_data(return_event, data); + + // register the cl_event to the client event + parent_device->register_event(event_gid, return_event); + + // arm the future. ! this blocks. + parent_device->activate_deferred_event_with_data(event_gid); } template -void -hpx::opencl::server::buffer::enqueue_read_to_userbuffer_rect_remote( - hpx::naming::id_type && event_gid, - hpx::opencl::rect_props && rect_properties, +void hpx::opencl::server::buffer::enqueue_read_to_userbuffer_rect_remote( + hpx::naming::id_type &&event_gid, hpx::opencl::rect_props &&rect_properties, std::uintptr_t remote_data_addr, - std::vector && dependencies ){ - - // the general algorithm of the remote rect read is: - // - allocate a buffer that exactly fits the read data - // - read from gpu - // - send the data and extract it to the correct position in the - // remote destination buffer via zero-copy send - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - typedef hpx::serialization::serialize_buffer buffer_type; - - cl_int err; - cl_event return_event; - - // retrieve the dependency cl_events - util::event_dependencies events( dependencies, parent_device.get() ); - - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_read_command_queue(); - - // create new target buffer - std::size_t dst_size = rect_properties.size_x * rect_properties.size_y - * rect_properties.size_z * sizeof(T); - buffer_type data( dst_size ); - - // prepare arguments for OpenCL call - std::size_t buffer_origin[] = { rect_properties.src_x * sizeof(T), - rect_properties.src_y, - rect_properties.src_z }; - std::size_t host_origin[] = { 0, 0, 0 }; // don't waste space on the host buf - std::size_t region[] = { rect_properties.size_x * sizeof(T), - rect_properties.size_y, - rect_properties.size_z }; - - // run the OpenCL-call - err = clEnqueueReadBufferRect( - command_queue, device_mem, CL_FALSE, - buffer_origin, host_origin, region, - rect_properties.src_stride_y * sizeof(T), - rect_properties.src_stride_z * sizeof(T), - rect_properties.size_x * sizeof(T), - rect_properties.size_x * sizeof(T) * rect_properties.size_y, - data.data(), - static_cast(events.size()), - events.get_cl_events(), &return_event ); - cl_ensure(err, "clEnqueueReadBufferRect()"); - - // put_event_data not necessary as we locally keep the buffer alive until - // the event triggered - - // also important: the cl_event does not get destroyed inside of - // the event map of parent_device, because we keep the lcos::event - // alive as we have an event_id - - // register the cl_event to the client event - parent_device->register_event(event_gid, return_event); - - // prepare a zero-copy buffer - // TODO replace dst_size with rect_properties - hpx::opencl::lcos::zerocopy_buffer zerocopy_buffer( remote_data_addr, - rect_properties, - sizeof(T), - data ); - - // wait for the event to finish - parent_device->wait_for_cl_event(return_event); - - // send the zerocopy_buffer to the lcos::event -// typedef hpx::opencl::lcos::detail::set_zerocopy_data_action -// set_data_func; -// hpx::apply_colocated(event_gid, event_gid, zerocopy_buffer); - - hpx::set_lco_value(event_gid, std::move(zerocopy_buffer)); + std::vector &&dependencies) { + // the general algorithm of the remote rect read is: + // - allocate a buffer that exactly fits the read data + // - read from gpu + // - send the data and extract it to the correct position in the + // remote destination buffer via zero-copy send + + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + typedef hpx::serialization::serialize_buffer buffer_type; + + cl_int err; + cl_event return_event; + + // retrieve the dependency cl_events + util::event_dependencies events(dependencies, parent_device.get()); + + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_read_command_queue(); + + // create new target buffer + std::size_t dst_size = rect_properties.size_x * rect_properties.size_y * + rect_properties.size_z * sizeof(T); + buffer_type data(dst_size); + + // prepare arguments for OpenCL call + std::size_t buffer_origin[] = {rect_properties.src_x * sizeof(T), + rect_properties.src_y, rect_properties.src_z}; + std::size_t host_origin[] = {0, 0, 0}; // don't waste space on the host buf + std::size_t region[] = {rect_properties.size_x * sizeof(T), + rect_properties.size_y, rect_properties.size_z}; + + // run the OpenCL-call + err = clEnqueueReadBufferRect( + command_queue, device_mem, CL_FALSE, buffer_origin, host_origin, region, + rect_properties.src_stride_y * sizeof(T), + rect_properties.src_stride_z * sizeof(T), + rect_properties.size_x * sizeof(T), + rect_properties.size_x * sizeof(T) * rect_properties.size_y, data.data(), + static_cast(events.size()), events.get_cl_events(), + &return_event); + cl_ensure(err, "clEnqueueReadBufferRect()"); + + // put_event_data not necessary as we locally keep the buffer alive until + // the event triggered + + // also important: the cl_event does not get destroyed inside of + // the event map of parent_device, because we keep the lcos::event + // alive as we have an event_id + + // register the cl_event to the client event + parent_device->register_event(event_gid, return_event); + + // prepare a zero-copy buffer + // TODO replace dst_size with rect_properties + hpx::opencl::lcos::zerocopy_buffer zerocopy_buffer( + remote_data_addr, rect_properties, sizeof(T), data); + + // wait for the event to finish + parent_device->wait_for_cl_event(return_event); + + // send the zerocopy_buffer to the lcos::event + // typedef hpx::opencl::lcos::detail::set_zerocopy_data_action + // set_data_func; + // hpx::apply_colocated(event_gid, event_gid, + // zerocopy_buffer); + + hpx::set_lco_value(event_gid, std::move(zerocopy_buffer)); } - - #endif diff --git a/opencl/server/buffer_server.cpp b/opencl/server/buffer_server.cpp index 39cc8ffb..367565b2 100644 --- a/opencl/server/buffer_server.cpp +++ b/opencl/server/buffer_server.cpp @@ -17,536 +17,433 @@ #include #include - using namespace hpx::opencl::server; - // Constructor -buffer::buffer() -{} +buffer::buffer() {} // External destructor. // This is needed because OpenCL calls only run properly on large stack size. -static void buffer_cleanup(uintptr_t device_mem_ptr) -{ - cl_int err; +static void buffer_cleanup(uintptr_t device_mem_ptr) { + cl_int err; - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - cl_mem device_mem = reinterpret_cast(device_mem_ptr); + cl_mem device_mem = reinterpret_cast(device_mem_ptr); - // Release the device memory - if(device_mem) - { - err = clReleaseMemObject(device_mem); - cl_ensure_nothrow(err, "clReleaseMemObject()"); - } + // Release the device memory + if (device_mem) { + err = clReleaseMemObject(device_mem); + cl_ensure_nothrow(err, "clReleaseMemObject()"); + } } // Destructor -buffer::~buffer() -{ - - hpx::threads::executors::default_executor exec( - hpx::threads::thread_priority_normal, - hpx::threads::thread_stacksize_medium); - - // run destructor in a thread, as we need it to run on a large stack size - hpx::threads::async_execute(exec,&buffer_cleanup, reinterpret_cast(device_mem)); - - - +buffer::~buffer() { + hpx::threads::executors::default_executor exec( + hpx::threads::thread_priority_normal, + hpx::threads::thread_stacksize_medium); + + // run destructor in a thread, as we need it to run on a large stack size + hpx::threads::async_execute(exec, &buffer_cleanup, + reinterpret_cast(device_mem)); } // Returns the parent device -hpx::naming::id_type buffer::get_parent_device_id() -{ - return parent_device_id; -} - -void -buffer::init( hpx::naming::id_type device_id, cl_mem_flags flags, - std::size_t size) -{ +hpx::naming::id_type buffer::get_parent_device_id() { return parent_device_id; } - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); +void buffer::init(hpx::naming::id_type device_id, cl_mem_flags flags, + std::size_t size) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - this->parent_device_id = std::move(device_id); - this->parent_device = hpx::get_ptr - (parent_device_id).get(); - this->device_mem = NULL; + this->parent_device_id = std::move(device_id); + this->parent_device = + hpx::get_ptr(parent_device_id).get(); + this->device_mem = NULL; - // Retrieve the context from parent class - cl_context context = parent_device->get_context(); + // Retrieve the context from parent class + cl_context context = parent_device->get_context(); - // The opencl error variable - cl_int err; + // The opencl error variable + cl_int err; - // Modify the cl_mem_flags - cl_mem_flags modified_flags = flags &! (CL_MEM_USE_HOST_PTR - | CL_MEM_ALLOC_HOST_PTR - | CL_MEM_COPY_HOST_PTR); - - // Create the Context - device_mem = clCreateBuffer(context, modified_flags, size, NULL, &err); - cl_ensure(err, "clCreateBuffer()"); + // Modify the cl_mem_flags + cl_mem_flags modified_flags = + flags & + !(CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR | CL_MEM_COPY_HOST_PTR); + // Create the Context + device_mem = clCreateBuffer(context, modified_flags, size, NULL, &err); + cl_ensure(err, "clCreateBuffer()"); } // Get Buffer Size -std::size_t -buffer::size() -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); +std::size_t buffer::size() { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - std::size_t size; - cl_int err; + std::size_t size; + cl_int err; - // Query size - err = clGetMemObjectInfo(device_mem, CL_MEM_SIZE, sizeof(std::size_t), &size, - NULL); - cl_ensure(err, "clGetMemObjectInfo()"); - - return size; + // Query size + err = clGetMemObjectInfo(device_mem, CL_MEM_SIZE, sizeof(std::size_t), &size, + NULL); + cl_ensure(err, "clGetMemObjectInfo()"); + return size; } -void -buffer::enqueue_read( hpx::naming::id_type && event_gid, - std::size_t offset, - std::size_t size, - std::vector && dependencies ){ +void buffer::enqueue_read(hpx::naming::id_type&& event_gid, std::size_t offset, + std::size_t size, + std::vector&& dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + typedef hpx::serialization::serialize_buffer buffer_type; - typedef hpx::serialization::serialize_buffer buffer_type; + cl_int err; + cl_event return_event; - cl_int err; - cl_event return_event; + // retrieve the dependency cl_events + util::event_dependencies events(dependencies, parent_device.get()); - // retrieve the dependency cl_events - util::event_dependencies events( dependencies, parent_device.get() ); + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_read_command_queue(); - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_read_command_queue(); + // create new target buffer + buffer_type data(size); - // create new target buffer - buffer_type data( size ); + // run the OpenCL-call + err = clEnqueueReadBuffer(command_queue, device_mem, CL_FALSE, offset, + data.size(), data.data(), + static_cast(events.size()), + events.get_cl_events(), &return_event); + cl_ensure(err, "clEnqueueReadBuffer()"); - // run the OpenCL-call - err = clEnqueueReadBuffer( command_queue, device_mem, CL_FALSE, offset, - data.size(), data.data(), - static_cast(events.size()), - events.get_cl_events(), &return_event ); - cl_ensure(err, "clEnqueueReadBuffer()"); + // register the data to prevent deallocation + parent_device->put_event_data(return_event, data); - // register the data to prevent deallocation - parent_device->put_event_data(return_event, data); + // register the cl_event to the client event + parent_device->register_event(event_gid, return_event); - // register the cl_event to the client event - parent_device->register_event(event_gid, return_event); + // arm the future. ! this blocks. + parent_device->activate_deferred_event_with_data(event_gid); +} - // arm the future. ! this blocks. - parent_device->activate_deferred_event_with_data(event_gid); +void buffer::send_bruteforce( + hpx::naming::id_type&& dst, hpx::naming::id_type&& src_event_gid, + hpx::naming::id_type&& dst_event_gid, std::size_t src_offset, + std::size_t dst_offset, std::size_t size, + std::vector&& src_dependencies, + std::vector&& dst_dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); -} + //////////////////////////////////////////////////////////////////////////// + // Read + // -void -buffer::send_bruteforce( hpx::naming::id_type && dst, - hpx::naming::id_type && src_event_gid, - hpx::naming::id_type && dst_event_gid, - std::size_t src_offset, - std::size_t dst_offset, - std::size_t size, - std::vector && src_dependencies, - std::vector && dst_dependencies) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - //////////////////////////////////////////////////////////////////////////// - // Read - // - - typedef hpx::serialization::serialize_buffer buffer_type; - - cl_int err; - cl_event src_event; - - // retrieve the dependency cl_events - util::event_dependencies events( src_dependencies, parent_device.get() ); - - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_read_command_queue(); - - // create new target buffer - buffer_type data( size ); - - // run the OpenCL-call - err = clEnqueueReadBuffer( command_queue, device_mem, CL_FALSE, src_offset, - data.size(), data.data(), - static_cast(events.size()), - events.get_cl_events(), &src_event ); - cl_ensure(err, "clEnqueueReadBuffer()"); - - // register the cl_event to the client event - parent_device->register_event(src_event_gid, src_event); - - // wait for clEnqueueReadBuffer to finish - parent_device->wait_for_cl_event(src_event); - - //////////////////////////////////////////////////////////////////////////// - // Write - // - typedef hpx::opencl::server::buffer::enqueue_write_action func; - hpx::apply( std::move(dst), - std::move(dst_event_gid), - dst_offset, - data, - std::move(dst_dependencies) ); -} + typedef hpx::serialization::serialize_buffer buffer_type; -void -buffer::send_direct( hpx::naming::id_type && dst, - std::shared_ptr && dst_buffer, - hpx::naming::id_type && src_event_gid, - hpx::naming::id_type && dst_event_gid, - std::size_t src_offset, - std::size_t dst_offset, - std::size_t size, - std::vector && src_dependencies, - std::vector && dst_dependencies) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - cl_int err; - cl_event return_event; - - // gather all dependencies from both devices - std::vector events; - events.reserve(src_dependencies.size() + dst_dependencies.size()); - for(const auto& id : src_dependencies){ - events.push_back( parent_device->retrieve_event(id) ); - } - for(const auto& id : dst_dependencies){ - events.push_back( dst_buffer->parent_device->retrieve_event(id) ); - } + cl_int err; + cl_event src_event; - // Create a pointer that is either a pointer to the data or NULL - cl_event* events_ptr = NULL; - if(!events.empty()){ - events_ptr = events.data(); - } + // retrieve the dependency cl_events + util::event_dependencies events(src_dependencies, parent_device.get()); - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_write_command_queue(); + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_read_command_queue(); - // run the OpenCL-call - err = clEnqueueCopyBuffer( command_queue, device_mem, dst_buffer->device_mem, - src_offset, dst_offset, size, - static_cast(events.size()), - events_ptr, &return_event ); - cl_ensure(err, "clEnqueueCopyBuffer()"); + // create new target buffer + buffer_type data(size); - // retain event to enable double-registration - err = clRetainEvent( return_event ); - cl_ensure(err, "clRetainEvent()"); + // run the OpenCL-call + err = clEnqueueReadBuffer( + command_queue, device_mem, CL_FALSE, src_offset, data.size(), data.data(), + static_cast(events.size()), events.get_cl_events(), &src_event); + cl_ensure(err, "clEnqueueReadBuffer()"); - // register the cl_event to both client events - this->parent_device->register_event(src_event_gid, return_event); - dst_buffer->parent_device->register_event(dst_event_gid, return_event); + // register the cl_event to the client event + parent_device->register_event(src_event_gid, src_event); -} + // wait for clEnqueueReadBuffer to finish + parent_device->wait_for_cl_event(src_event); -void -buffer::send_rect_bruteforce( hpx::naming::id_type && dst, - hpx::naming::id_type && src_event_gid, - hpx::naming::id_type && dst_event_gid, - hpx::opencl::rect_props && rect_properties, - std::vector && src_dependencies, - std::vector && dst_dependencies) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - //////////////////////////////////////////////////////////////////////////// - // Read - // - - typedef hpx::serialization::serialize_buffer buffer_type; - - cl_int err; - cl_event src_event; - - // retrieve the dependency cl_events - util::event_dependencies events( src_dependencies, parent_device.get() ); - - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_read_command_queue(); - - // create new target buffer - std::size_t dst_size = rect_properties.size_x * rect_properties.size_y - * rect_properties.size_z; - buffer_type data( dst_size ); - - // prepare arguments for OpenCL call - std::size_t buffer_origin[] = { rect_properties.src_x, - rect_properties.src_y, - rect_properties.src_z }; - std::size_t host_origin[] = { 0, 0, 0 }; // don't waste space on the host buf - std::size_t region[] = { rect_properties.size_x, - rect_properties.size_y, - rect_properties.size_z }; - - // run the OpenCL-call - err = clEnqueueReadBufferRect( - command_queue, device_mem, CL_FALSE, - buffer_origin, host_origin, region, - rect_properties.src_stride_y, - rect_properties.src_stride_z, - rect_properties.size_x, - rect_properties.size_x * rect_properties.size_y, - data.data(), - static_cast(events.size()), - events.get_cl_events(), &src_event ); - cl_ensure(err, "clEnqueueReadBufferRect()"); - - // register the cl_event to the client event - parent_device->register_event(src_event_gid, src_event); - - // wait for clEnqueueReadBuffer to finish - parent_device->wait_for_cl_event(src_event); - - //////////////////////////////////////////////////////////////////////////// - // Write - // - - hpx::opencl::rect_props dst_rect_properties ( - 0, 0, 0, - rect_properties.dst_x, - rect_properties.dst_y, - rect_properties.dst_z, - rect_properties.size_x, - rect_properties.size_y, - rect_properties.size_z, - rect_properties.size_x, - rect_properties.size_x * rect_properties.size_y, - rect_properties.dst_stride_y, - rect_properties.dst_stride_z ); - - typedef hpx::opencl::server::buffer::enqueue_write_rect_action func; - hpx::apply( std::move(dst), - std::move(dst_event_gid), - std::move(dst_rect_properties), - data, - std::move(dst_dependencies) ); + //////////////////////////////////////////////////////////////////////////// + // Write + // + typedef hpx::opencl::server::buffer::enqueue_write_action func; + hpx::apply(std::move(dst), std::move(dst_event_gid), dst_offset, data, + std::move(dst_dependencies)); } -void -buffer::send_rect_direct( hpx::naming::id_type && dst, - std::shared_ptr && - dst_buffer, - hpx::naming::id_type && src_event_gid, - hpx::naming::id_type && dst_event_gid, - hpx::opencl::rect_props && rect_properties, - std::vector && src_dependencies, - std::vector && dst_dependencies) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - cl_int err; - cl_event return_event; - - // gather all dependencies from both devices - std::vector events; - events.reserve(src_dependencies.size() + dst_dependencies.size()); - for(const auto& id : src_dependencies){ - events.push_back( parent_device->retrieve_event(id) ); - } - for(const auto& id : dst_dependencies){ - events.push_back( dst_buffer->parent_device->retrieve_event(id) ); - } - - // Create a pointer that is either a pointer to the data or NULL - cl_event* events_ptr = NULL; - if(!events.empty()){ - events_ptr = events.data(); - } +void buffer::send_direct( + hpx::naming::id_type&& dst, + std::shared_ptr&& dst_buffer, + hpx::naming::id_type&& src_event_gid, hpx::naming::id_type&& dst_event_gid, + std::size_t src_offset, std::size_t dst_offset, std::size_t size, + std::vector&& src_dependencies, + std::vector&& dst_dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + cl_int err; + cl_event return_event; + + // gather all dependencies from both devices + std::vector events; + events.reserve(src_dependencies.size() + dst_dependencies.size()); + for (const auto& id : src_dependencies) { + events.push_back(parent_device->retrieve_event(id)); + } + for (const auto& id : dst_dependencies) { + events.push_back(dst_buffer->parent_device->retrieve_event(id)); + } + + // Create a pointer that is either a pointer to the data or NULL + cl_event* events_ptr = NULL; + if (!events.empty()) { + events_ptr = events.data(); + } + + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_write_command_queue(); + + // run the OpenCL-call + err = clEnqueueCopyBuffer( + command_queue, device_mem, dst_buffer->device_mem, src_offset, dst_offset, + size, static_cast(events.size()), events_ptr, &return_event); + cl_ensure(err, "clEnqueueCopyBuffer()"); + + // retain event to enable double-registration + err = clRetainEvent(return_event); + cl_ensure(err, "clRetainEvent()"); + + // register the cl_event to both client events + this->parent_device->register_event(src_event_gid, return_event); + dst_buffer->parent_device->register_event(dst_event_gid, return_event); +} - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_write_command_queue(); - - // prepare arguments for OpenCL call - std::size_t src_origin[] = { rect_properties.src_x, - rect_properties.src_y, - rect_properties.src_z }; - std::size_t dst_origin[] = { rect_properties.dst_x, - rect_properties.dst_y, - rect_properties.dst_z }; - std::size_t region[] = { rect_properties.size_x, - rect_properties.size_y, - rect_properties.size_z }; - - // run the OpenCL-call - err = clEnqueueCopyBufferRect( command_queue, device_mem, - dst_buffer->device_mem, - src_origin, - dst_origin, - region, - rect_properties.src_stride_y, - rect_properties.src_stride_z, - rect_properties.dst_stride_y, - rect_properties.dst_stride_z, - static_cast(events.size()), - events_ptr, &return_event ); - cl_ensure(err, "clEnqueueCopyBufferRect()"); - - // retain event to enable double-registration - err = clRetainEvent( return_event ); - cl_ensure(err, "clRetainEvent()"); - - // register the cl_event to both client events - this->parent_device->register_event(src_event_gid, return_event); - dst_buffer->parent_device->register_event(dst_event_gid, return_event); +void buffer::send_rect_bruteforce( + hpx::naming::id_type&& dst, hpx::naming::id_type&& src_event_gid, + hpx::naming::id_type&& dst_event_gid, + hpx::opencl::rect_props&& rect_properties, + std::vector&& src_dependencies, + std::vector&& dst_dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + //////////////////////////////////////////////////////////////////////////// + // Read + // + + typedef hpx::serialization::serialize_buffer buffer_type; + + cl_int err; + cl_event src_event; + + // retrieve the dependency cl_events + util::event_dependencies events(src_dependencies, parent_device.get()); + + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_read_command_queue(); + + // create new target buffer + std::size_t dst_size = + rect_properties.size_x * rect_properties.size_y * rect_properties.size_z; + buffer_type data(dst_size); + + // prepare arguments for OpenCL call + std::size_t buffer_origin[] = {rect_properties.src_x, rect_properties.src_y, + rect_properties.src_z}; + std::size_t host_origin[] = {0, 0, 0}; // don't waste space on the host buf + std::size_t region[] = {rect_properties.size_x, rect_properties.size_y, + rect_properties.size_z}; + + // run the OpenCL-call + err = clEnqueueReadBufferRect( + command_queue, device_mem, CL_FALSE, buffer_origin, host_origin, region, + rect_properties.src_stride_y, rect_properties.src_stride_z, + rect_properties.size_x, rect_properties.size_x * rect_properties.size_y, + data.data(), static_cast(events.size()), events.get_cl_events(), + &src_event); + cl_ensure(err, "clEnqueueReadBufferRect()"); + + // register the cl_event to the client event + parent_device->register_event(src_event_gid, src_event); + + // wait for clEnqueueReadBuffer to finish + parent_device->wait_for_cl_event(src_event); + + //////////////////////////////////////////////////////////////////////////// + // Write + // + + hpx::opencl::rect_props dst_rect_properties( + 0, 0, 0, rect_properties.dst_x, rect_properties.dst_y, + rect_properties.dst_z, rect_properties.size_x, rect_properties.size_y, + rect_properties.size_z, rect_properties.size_x, + rect_properties.size_x * rect_properties.size_y, + rect_properties.dst_stride_y, rect_properties.dst_stride_z); + + typedef hpx::opencl::server::buffer::enqueue_write_rect_action func; + hpx::apply(std::move(dst), std::move(dst_event_gid), + std::move(dst_rect_properties), data, + std::move(dst_dependencies)); +} +void buffer::send_rect_direct( + hpx::naming::id_type&& dst, + std::shared_ptr&& dst_buffer, + hpx::naming::id_type&& src_event_gid, hpx::naming::id_type&& dst_event_gid, + hpx::opencl::rect_props&& rect_properties, + std::vector&& src_dependencies, + std::vector&& dst_dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + cl_int err; + cl_event return_event; + + // gather all dependencies from both devices + std::vector events; + events.reserve(src_dependencies.size() + dst_dependencies.size()); + for (const auto& id : src_dependencies) { + events.push_back(parent_device->retrieve_event(id)); + } + for (const auto& id : dst_dependencies) { + events.push_back(dst_buffer->parent_device->retrieve_event(id)); + } + + // Create a pointer that is either a pointer to the data or NULL + cl_event* events_ptr = NULL; + if (!events.empty()) { + events_ptr = events.data(); + } + + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_write_command_queue(); + + // prepare arguments for OpenCL call + std::size_t src_origin[] = {rect_properties.src_x, rect_properties.src_y, + rect_properties.src_z}; + std::size_t dst_origin[] = {rect_properties.dst_x, rect_properties.dst_y, + rect_properties.dst_z}; + std::size_t region[] = {rect_properties.size_x, rect_properties.size_y, + rect_properties.size_z}; + + // run the OpenCL-call + err = clEnqueueCopyBufferRect( + command_queue, device_mem, dst_buffer->device_mem, src_origin, dst_origin, + region, rect_properties.src_stride_y, rect_properties.src_stride_z, + rect_properties.dst_stride_y, rect_properties.dst_stride_z, + static_cast(events.size()), events_ptr, &return_event); + cl_ensure(err, "clEnqueueCopyBufferRect()"); + + // retain event to enable double-registration + err = clRetainEvent(return_event); + cl_ensure(err, "clRetainEvent()"); + + // register the cl_event to both client events + this->parent_device->register_event(src_event_gid, return_event); + dst_buffer->parent_device->register_event(dst_event_gid, return_event); } -void -buffer::enqueue_send_rect( hpx::naming::id_type dst, - hpx::naming::id_type && src_event, - hpx::naming::id_type && dst_event, - hpx::opencl::rect_props rect_properties, - std::vector && dependencies, - std::vector && - dependency_devices ) -{ - - HPX_ASSERT(dependencies.size() == dependency_devices.size()); - - // query the location of the destination - auto dst_location_future = hpx::get_colocation_id(dst); - - // split between src_dependencies and dst_dependencies - std::vector src_dependencies; - std::vector dst_dependencies; - hpx::naming::gid_type src_device = parent_device_id.get_gid(); - std::vector::iterator it = dependencies.begin(); - for(const auto& device : dependency_devices){ - if(device == src_device){ - std::move(it, it+1, std::back_inserter(src_dependencies)); - } else { - std::move(it, it+1, std::back_inserter(dst_dependencies)); - } - it++; +void buffer::enqueue_send_rect( + hpx::naming::id_type dst, hpx::naming::id_type&& src_event, + hpx::naming::id_type&& dst_event, hpx::opencl::rect_props rect_properties, + std::vector&& dependencies, + std::vector&& dependency_devices) { + HPX_ASSERT(dependencies.size() == dependency_devices.size()); + + // query the location of the destination + auto dst_location_future = hpx::get_colocation_id(dst); + + // split between src_dependencies and dst_dependencies + std::vector src_dependencies; + std::vector dst_dependencies; + hpx::naming::gid_type src_device = parent_device_id.get_gid(); + std::vector::iterator it = dependencies.begin(); + for (const auto& device : dependency_devices) { + if (device == src_device) { + std::move(it, it + 1, std::back_inserter(src_dependencies)); + } else { + std::move(it, it + 1, std::back_inserter(dst_dependencies)); } - - // get the location of the destination - hpx::naming::id_type dst_location = dst_location_future.get(); - hpx::naming::id_type src_location = hpx::find_here(); - - // choose which function to run - // optimization for context internal copies - if(dst_location == src_location){ - auto dst_buffer = hpx::get_ptr(dst).get(); - - cl_context src_context = this->parent_device->get_context(); - cl_context dst_context = dst_buffer->parent_device->get_context(); - - if(src_context == dst_context){ - send_rect_direct( std::move(dst), - std::move(dst_buffer), - std::move(src_event), - std::move(dst_event), - std::move(rect_properties), - std::move(src_dependencies), - std::move(dst_dependencies) ); - return; - } + it++; + } + + // get the location of the destination + hpx::naming::id_type dst_location = dst_location_future.get(); + hpx::naming::id_type src_location = hpx::find_here(); + + // choose which function to run + // optimization for context internal copies + if (dst_location == src_location) { + auto dst_buffer = hpx::get_ptr(dst).get(); + + cl_context src_context = this->parent_device->get_context(); + cl_context dst_context = dst_buffer->parent_device->get_context(); + + if (src_context == dst_context) { + send_rect_direct(std::move(dst), std::move(dst_buffer), + std::move(src_event), std::move(dst_event), + std::move(rect_properties), std::move(src_dependencies), + std::move(dst_dependencies)); + return; } + } - // Always works: the bruteforce method - send_rect_bruteforce( std::move(dst), - std::move(src_event), - std::move(dst_event), - std::move(rect_properties), - std::move(src_dependencies), - std::move(dst_dependencies) ); - + // Always works: the bruteforce method + send_rect_bruteforce(std::move(dst), std::move(src_event), + std::move(dst_event), std::move(rect_properties), + std::move(src_dependencies), + std::move(dst_dependencies)); } -void -buffer::enqueue_send( hpx::naming::id_type dst, - hpx::naming::id_type && src_event, - hpx::naming::id_type && dst_event, - std::size_t src_offset, - std::size_t dst_offset, - std::size_t size, - std::vector && dependencies, - std::vector && dependency_devices ) -{ - - HPX_ASSERT(dependencies.size() == dependency_devices.size()); - - // query the location of the destination - auto dst_location_future = hpx::get_colocation_id(dst); - - // split between src_dependencies and dst_dependencies - std::vector src_dependencies; - std::vector dst_dependencies; - hpx::naming::gid_type src_device = parent_device_id.get_gid(); - std::vector::iterator it = dependencies.begin(); - for(const auto& device : dependency_devices){ - if(device == src_device){ - std::move(it, it+1, std::back_inserter(src_dependencies)); - } else { - std::move(it, it+1, std::back_inserter(dst_dependencies)); - } - it++; +void buffer::enqueue_send( + hpx::naming::id_type dst, hpx::naming::id_type&& src_event, + hpx::naming::id_type&& dst_event, std::size_t src_offset, + std::size_t dst_offset, std::size_t size, + std::vector&& dependencies, + std::vector&& dependency_devices) { + HPX_ASSERT(dependencies.size() == dependency_devices.size()); + + // query the location of the destination + auto dst_location_future = hpx::get_colocation_id(dst); + + // split between src_dependencies and dst_dependencies + std::vector src_dependencies; + std::vector dst_dependencies; + hpx::naming::gid_type src_device = parent_device_id.get_gid(); + std::vector::iterator it = dependencies.begin(); + for (const auto& device : dependency_devices) { + if (device == src_device) { + std::move(it, it + 1, std::back_inserter(src_dependencies)); + } else { + std::move(it, it + 1, std::back_inserter(dst_dependencies)); } - - // get the location of the destination - hpx::naming::id_type dst_location = dst_location_future.get(); - hpx::naming::id_type src_location = hpx::find_here(); - - // choose which function to run - // optimization for context internal copies - if(dst_location == src_location){ - auto dst_buffer = hpx::get_ptr(dst).get(); - - cl_context src_context = this->parent_device->get_context(); - cl_context dst_context = dst_buffer->parent_device->get_context(); - - if(src_context == dst_context){ - send_direct( std::move(dst), - std::move(dst_buffer), - std::move(src_event), - std::move(dst_event), - src_offset, - dst_offset, - size, - std::move(src_dependencies), - std::move(dst_dependencies) ); - return; - } + it++; + } + + // get the location of the destination + hpx::naming::id_type dst_location = dst_location_future.get(); + hpx::naming::id_type src_location = hpx::find_here(); + + // choose which function to run + // optimization for context internal copies + if (dst_location == src_location) { + auto dst_buffer = hpx::get_ptr(dst).get(); + + cl_context src_context = this->parent_device->get_context(); + cl_context dst_context = dst_buffer->parent_device->get_context(); + + if (src_context == dst_context) { + send_direct(std::move(dst), std::move(dst_buffer), std::move(src_event), + std::move(dst_event), src_offset, dst_offset, size, + std::move(src_dependencies), std::move(dst_dependencies)); + return; } + } - // Always works: the bruteforce method - send_bruteforce( std::move(dst), - std::move(src_event), - std::move(dst_event), - src_offset, - dst_offset, - size, - std::move(src_dependencies), - std::move(dst_dependencies) ); - + // Always works: the bruteforce method + send_bruteforce(std::move(dst), std::move(src_event), std::move(dst_event), + src_offset, dst_offset, size, std::move(src_dependencies), + std::move(dst_dependencies)); } -cl_mem -buffer::get_cl_mem() -{ - return device_mem; -} +cl_mem buffer::get_cl_mem() { return device_mem; } diff --git a/opencl/server/create_devices.hpp b/opencl/server/create_devices.hpp index c4e57a0f..7d78b7d2 100644 --- a/opencl/server/create_devices.hpp +++ b/opencl/server/create_devices.hpp @@ -16,26 +16,28 @@ #include "../device.hpp" //////////////////////////////////////////////////////////////// -namespace hpx { namespace opencl { namespace server { +namespace hpx { +namespace opencl { +namespace server { +// ///////////////////////////////////////////////////// +// Global opencl functions +// - // ///////////////////////////////////////////////////// - // Global opencl functions - // - - // Returns the IDs of all devices on current host - std::vector - create_devices(cl_device_type, std::string cl_version); +// Returns the IDs of all devices on current host +std::vector create_devices(cl_device_type, + std::string cl_version); - //[opencl_management_action_types - HPX_DEFINE_PLAIN_ACTION(create_devices, create_devices_action); - //] +//[opencl_management_action_types +HPX_DEFINE_PLAIN_ACTION(create_devices, create_devices_action); +//] -}}} +} // namespace server +} // namespace opencl +} // namespace hpx HPX_ACTION_USES_MEDIUM_STACK(hpx::opencl::server::create_devices_action); HPX_REGISTER_ACTION_DECLARATION(hpx::opencl::server::create_devices_action, hpx_opencl_server_create_devices_action) #endif - diff --git a/opencl/server/create_devices_server.cpp b/opencl/server/create_devices_server.cpp index 29446a3a..d47485ed 100644 --- a/opencl/server/create_devices_server.cpp +++ b/opencl/server/create_devices_server.cpp @@ -17,165 +17,144 @@ #include #include - /////////////////////////////////////////////////// /// Local functions /// -static std::vector -parse_version_string(std::string version_str) -{ - - try { - - // Make sure the version string starts with "OpenCL " - HPX_ASSERT(version_str.compare(0, 7, "OpenCL ") == 0); - - // Cut away the "OpenCL " in front of the version string - version_str = version_str.substr(7); - - // Cut away everything behind the version number - version_str = version_str.substr(0, version_str.find(" ")); - - // Get major version string - std::string version_str_major = - version_str.substr(0, version_str.find(".")); - - // Get minor version string - std::string version_str_minor = - version_str.substr(version_str_major.size() + 1); - - // create output vector - std::vector version_numbers(2); - - // Parse version number - version_numbers[0] = ::atoi(version_str_major.c_str()); - version_numbers[1] = ::atoi(version_str_minor.c_str()); - - // Return the parsed version number - return version_numbers; - - } catch (const std::exception &) { - hpx::cerr << "Error while parsing OpenCL Version!" << hpx::endl; - std::vector version_numbers(2); - version_numbers[0] = -1; - version_numbers[1] = 0; - return version_numbers; - } - +static std::vector parse_version_string(std::string version_str) { + try { + // Make sure the version string starts with "OpenCL " + HPX_ASSERT(version_str.compare(0, 7, "OpenCL ") == 0); + + // Cut away the "OpenCL " in front of the version string + version_str = version_str.substr(7); + + // Cut away everything behind the version number + version_str = version_str.substr(0, version_str.find(" ")); + + // Get major version string + std::string version_str_major = + version_str.substr(0, version_str.find(".")); + + // Get minor version string + std::string version_str_minor = + version_str.substr(version_str_major.size() + 1); + + // create output vector + std::vector version_numbers(2); + + // Parse version number + version_numbers[0] = ::atoi(version_str_major.c_str()); + version_numbers[1] = ::atoi(version_str_minor.c_str()); + + // Return the parsed version number + return version_numbers; + + } catch (const std::exception &) { + hpx::cerr << "Error while parsing OpenCL Version!" << hpx::endl; + std::vector version_numbers(2); + version_numbers[0] = -1; + version_numbers[1] = 0; + return version_numbers; + } } - /////////////////////////////////////////////////// /// Implementations /// // This method initializes the devices-list if it's not done yet. -std::vector -hpx::opencl::server::create_devices(cl_device_type device_type, - std::string min_cl_version) -{ - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - // Parse required OpenCL version - std::vector required_version = parse_version_string(min_cl_version); - - // Initialize device list - std::vector devices; - - // Declaire the cl error code variable - cl_int err; - - // Query for number of available platforms - cl_uint num_platforms; - err = clGetPlatformIDs(0, NULL, &num_platforms); - cl_ensure(err, "clGetPlatformIDs()"); - - // Retrieve platforms - std::vector platforms(num_platforms); - err = clGetPlatformIDs(num_platforms, platforms.data(), NULL); - cl_ensure(err, "clGetPlatformIDs()"); - - // Search on every platform - for(const auto &platform : platforms) - { - // Query for number of available devices - cl_uint num_devices_on_platform; - err = clGetDeviceIDs( platform, device_type, 0, NULL, - &num_devices_on_platform ); - if(err == CL_DEVICE_NOT_FOUND) continue; - cl_ensure(err, "clGetDeviceIDs()"); - - // Retrieve devices - std::vector devices_on_platform(num_devices_on_platform); - err = clGetDeviceIDs( platform, device_type, - num_devices_on_platform, - devices_on_platform.data(), NULL ); - cl_ensure(err, "clGetDeviceIDs()"); - - - // Add devices_on_platform to devices - for(const auto & device : devices_on_platform) - { - - // Get OpenCL Version string length - std::size_t version_string_length; - err = clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, - &version_string_length); - cl_ensure(err, "clGetDeviceInfo()"); - - // Get OpenCL Version string - std::vector version_string_arr(version_string_length); - err = clGetDeviceInfo(device, CL_DEVICE_VERSION, - version_string_length, - version_string_arr.data(), - NULL); - cl_ensure(err, "clGetDeviceInfo()"); - - // Convert to std::string - std::size_t length = 0; - while(length < version_string_arr.size()) - { - if(version_string_arr[length] == '\0') break; - length++; - } - std::string version_string(version_string_arr.begin(), - version_string_arr.begin() + length); - - // Parse - std::vector version = parse_version_string(version_string); - - #ifndef HPXCL_ALLOW_OPENCL_1_0_DEVICES - - // only allow machines with version 1.1 or higher - if(version[0] < 1) continue; - if(version[0] == 1 && version[1] < 1) continue; - - #endif //HPXCL_ALLOW_OPENCL_1_0_DEVICES - - // Check if device supports required version - if(version[0] < required_version[0]) continue; - if(version[0] == required_version[0]) - { - if(version[1] < required_version[1]) continue; - } - - // Create a new device client - hpx::opencl::device device_client( - hpx::components::new_( - hpx::find_here())); - - // Initialize device server locally - std::shared_ptr device_server = - hpx::get_ptr - (device_client.get_id()).get(); - device_server->init(device); - - // Add device to list of valid devices - devices.push_back(device_client); - } +std::vector hpx::opencl::server::create_devices( + cl_device_type device_type, std::string min_cl_version) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + // Parse required OpenCL version + std::vector required_version = parse_version_string(min_cl_version); + + // Initialize device list + std::vector devices; + + // Declaire the cl error code variable + cl_int err; + + // Query for number of available platforms + cl_uint num_platforms; + err = clGetPlatformIDs(0, NULL, &num_platforms); + cl_ensure(err, "clGetPlatformIDs()"); + + // Retrieve platforms + std::vector platforms(num_platforms); + err = clGetPlatformIDs(num_platforms, platforms.data(), NULL); + cl_ensure(err, "clGetPlatformIDs()"); + + // Search on every platform + for (const auto &platform : platforms) { + // Query for number of available devices + cl_uint num_devices_on_platform; + err = clGetDeviceIDs(platform, device_type, 0, NULL, + &num_devices_on_platform); + if (err == CL_DEVICE_NOT_FOUND) continue; + cl_ensure(err, "clGetDeviceIDs()"); + + // Retrieve devices + std::vector devices_on_platform(num_devices_on_platform); + err = clGetDeviceIDs(platform, device_type, num_devices_on_platform, + devices_on_platform.data(), NULL); + cl_ensure(err, "clGetDeviceIDs()"); + + // Add devices_on_platform to devices + for (const auto &device : devices_on_platform) { + // Get OpenCL Version string length + std::size_t version_string_length; + err = clGetDeviceInfo(device, CL_DEVICE_VERSION, 0, NULL, + &version_string_length); + cl_ensure(err, "clGetDeviceInfo()"); + + // Get OpenCL Version string + std::vector version_string_arr(version_string_length); + err = clGetDeviceInfo(device, CL_DEVICE_VERSION, version_string_length, + version_string_arr.data(), NULL); + cl_ensure(err, "clGetDeviceInfo()"); + + // Convert to std::string + std::size_t length = 0; + while (length < version_string_arr.size()) { + if (version_string_arr[length] == '\0') break; + length++; + } + std::string version_string(version_string_arr.begin(), + version_string_arr.begin() + length); + + // Parse + std::vector version = parse_version_string(version_string); + +#ifndef HPXCL_ALLOW_OPENCL_1_0_DEVICES + + // only allow machines with version 1.1 or higher + if (version[0] < 1) continue; + if (version[0] == 1 && version[1] < 1) continue; + +#endif // HPXCL_ALLOW_OPENCL_1_0_DEVICES + + // Check if device supports required version + if (version[0] < required_version[0]) continue; + if (version[0] == required_version[0]) { + if (version[1] < required_version[1]) continue; + } + + // Create a new device client + hpx::opencl::device device_client( + hpx::components::new_(hpx::find_here())); + + // Initialize device server locally + std::shared_ptr device_server = + hpx::get_ptr(device_client.get_id()) + .get(); + device_server->init(device); + + // Add device to list of valid devices + devices.push_back(device_client); } + } - return devices; + return devices; } - - diff --git a/opencl/server/device.hpp b/opencl/server/device.hpp index d317942c..4aefa690 100644 --- a/opencl/server/device.hpp +++ b/opencl/server/device.hpp @@ -22,138 +22,132 @@ #include "util/server_definitions.hpp" //////////////////////////////////////////////////////////////// -namespace hpx { namespace opencl{ namespace server{ - - - // ///////////////////////////////////////////////////// - // This class represents an OpenCL accelerator device. - // - class HPX_OPENCL_EXPORT device - : public hpx::components::managed_component_base - { - typedef hpx::lcos::local::spinlock lock_type; - typedef hpx::serialization::serialize_buffer buffer_type; - - public: - // Constructor - device(); - ~device(); - - ////////////////////////////////////////////////// - /// Local public functions - /// - void init(cl_device_id device_id, bool enable_profiling=false); - - cl_context get_context(); - cl_device_id get_device_id(); - - ////////////////////////////////////////////////// - /// Exposed functionality of this component - /// - - // returns device specific information - hpx::serialization::serialize_buffer - get_device_info(cl_device_info info_type); - - // returns platform specific information - hpx::serialization::serialize_buffer - get_platform_info(cl_platform_info info_type); - - // creates a new buffer - hpx::id_type - create_buffer(cl_mem_flags flags, std::size_t size); - - // creates a new program from source - hpx::id_type - create_program_with_source(hpx::serialization::serialize_buffer); - - // creates a new program from binary - hpx::id_type - create_program_with_binary(hpx::serialization::serialize_buffer); - - ///////////////////////////////////////////////// - /// Behind-the-scenes functionality of this component - /// - - // releases an event registered to a GID - void - release_event(hpx::naming::gid_type); - - // activates a deferred event - void - activate_deferred_event(hpx::naming::id_type); - - // activates a deferred event > - void - activate_deferred_event_with_data(hpx::naming::id_type); - - HPX_DEFINE_COMPONENT_ACTION(device, get_device_info); - HPX_DEFINE_COMPONENT_ACTION(device, get_platform_info); - HPX_DEFINE_COMPONENT_ACTION(device, create_buffer); - HPX_DEFINE_COMPONENT_ACTION(device, create_program_with_source); - HPX_DEFINE_COMPONENT_ACTION(device, create_program_with_binary); - HPX_DEFINE_COMPONENT_ACTION(device, release_event); - HPX_DEFINE_COMPONENT_ACTION(device, activate_deferred_event); - - public: - ///////////////////////////////////////////////// - // Public Member Functions - - // registers an event-GID pair - void - register_event(const hpx::naming::id_type & gid, cl_event); - - // retrieves an event from a GID - cl_event - retrieve_event(const hpx::naming::id_type & gid); - - // command queue retrievals - cl_command_queue get_read_command_queue(); - cl_command_queue get_write_command_queue(); - cl_command_queue get_kernel_command_queue(); - - // event data handling. needed to keep clEnqueue* data alive - // (like clEnqueueWriteBuffer) - template - void put_event_data( cl_event event, - hpx::serialization::serialize_buffer data ) - { - event_data_map.add(event, data); - } - - // Waits for an opencl event. - // Necessary to offload wait from hpx to os thread. - void wait_for_cl_event(cl_event); - - private: - /////////////////////////////////////////////// - // Private Member Functions - // - - // Error Callback - static void CL_CALLBACK error_callback(const char*, const void*, - std::size_t, void*); - - // cl_event Deletion Callback - static void delete_event(cl_event); - - // Releases the data that was being kept alive - void delete_event_data(cl_event); - - private: - /////////////////////////////////////////////// - // Private Member Variables - // - cl_device_id device_id; - cl_platform_id platform_id; - cl_context context; - cl_command_queue command_queue; - - util::event_map event_map; - util::data_map event_data_map; - - }; -}}} +namespace hpx { +namespace opencl { +namespace server { + +// ///////////////////////////////////////////////////// +// This class represents an OpenCL accelerator device. +// +class HPX_OPENCL_EXPORT device + : public hpx::components::managed_component_base { + typedef hpx::lcos::local::spinlock lock_type; + typedef hpx::serialization::serialize_buffer buffer_type; + + public: + // Constructor + device(); + ~device(); + + ////////////////////////////////////////////////// + /// Local public functions + /// + void init(cl_device_id device_id, bool enable_profiling = false); + + cl_context get_context(); + cl_device_id get_device_id(); + + ////////////////////////////////////////////////// + /// Exposed functionality of this component + /// + + // returns device specific information + hpx::serialization::serialize_buffer get_device_info( + cl_device_info info_type); + + // returns platform specific information + hpx::serialization::serialize_buffer get_platform_info( + cl_platform_info info_type); + + // creates a new buffer + hpx::id_type create_buffer(cl_mem_flags flags, std::size_t size); + + // creates a new program from source + hpx::id_type create_program_with_source( + hpx::serialization::serialize_buffer); + + // creates a new program from binary + hpx::id_type create_program_with_binary( + hpx::serialization::serialize_buffer); + + ///////////////////////////////////////////////// + /// Behind-the-scenes functionality of this component + /// + + // releases an event registered to a GID + void release_event(hpx::naming::gid_type); + + // activates a deferred event + void activate_deferred_event(hpx::naming::id_type); + + // activates a deferred event > + void activate_deferred_event_with_data(hpx::naming::id_type); + + HPX_DEFINE_COMPONENT_ACTION(device, get_device_info); + HPX_DEFINE_COMPONENT_ACTION(device, get_platform_info); + HPX_DEFINE_COMPONENT_ACTION(device, create_buffer); + HPX_DEFINE_COMPONENT_ACTION(device, create_program_with_source); + HPX_DEFINE_COMPONENT_ACTION(device, create_program_with_binary); + HPX_DEFINE_COMPONENT_ACTION(device, release_event); + HPX_DEFINE_COMPONENT_ACTION(device, activate_deferred_event); + + public: + ///////////////////////////////////////////////// + // Public Member Functions + + // registers an event-GID pair + void register_event(const hpx::naming::id_type& gid, cl_event); + + // retrieves an event from a GID + cl_event retrieve_event(const hpx::naming::id_type& gid); + + // command queue retrievals + cl_command_queue get_read_command_queue(); + cl_command_queue get_write_command_queue(); + cl_command_queue get_kernel_command_queue(); + + // event data handling. needed to keep clEnqueue* data alive + // (like clEnqueueWriteBuffer) + template + void put_event_data(cl_event event, + hpx::serialization::serialize_buffer data) { + event_data_map.add(event, data); + } + + // Waits for an opencl event. + // Necessary to offload wait from hpx to os thread. + void wait_for_cl_event(cl_event); + + private: + /////////////////////////////////////////////// + // Private Member Functions + // + + // Error Callback + static void CL_CALLBACK error_callback(const char*, const void*, std::size_t, + void*); + + // cl_event Deletion Callback + static void delete_event(cl_event); + + // Releases the data that was being kept alive + void delete_event_data(cl_event); + + private: + /////////////////////////////////////////////// + // Private Member Variables + // + cl_device_id device_id; + cl_platform_id platform_id; + cl_context context; + cl_command_queue command_queue; + + util::event_map event_map; + util::data_map event_data_map; +}; +} // namespace server +} // namespace opencl +} // namespace hpx //[opencl_management_registration_declarations HPX_OPENCL_REGISTER_ACTION_DECLARATION(device, get_device_info); diff --git a/opencl/server/device_server.cpp b/opencl/server/device_server.cpp index 2cbd74a6..a96b2cc8 100644 --- a/opencl/server/device_server.cpp +++ b/opencl/server/device_server.cpp @@ -19,400 +19,308 @@ using namespace hpx::opencl::server; - // Constructor -device::device() -{ - // Register the event deletion callback function at the event map - event_map.register_deletion_callback(&delete_event); +device::device() { + // Register the event deletion callback function at the event map + event_map.register_deletion_callback(&delete_event); } // External destructor. // This is needed because OpenCL calls only run properly on large stack size. -static void device_cleanup(uintptr_t command_queue_ptr, - uintptr_t context_ptr) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - cl_int err; - - cl_command_queue command_queue = reinterpret_cast(command_queue_ptr); - cl_context context = reinterpret_cast(context_ptr); - - // Release command queue - if(command_queue) - { - err = clFinish(command_queue); - cl_ensure_nothrow(err, "clFinish()"); - err = clReleaseCommandQueue(command_queue); - cl_ensure_nothrow(err, "clReleaseCommandQueue()"); - command_queue = NULL; - } - - // Release context - if(context) - { - err = clReleaseContext(context); - cl_ensure_nothrow(err, "clReleaseContext()"); - context = NULL; - } - +static void device_cleanup(uintptr_t command_queue_ptr, uintptr_t context_ptr) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + cl_int err; + + cl_command_queue command_queue = + reinterpret_cast(command_queue_ptr); + cl_context context = reinterpret_cast(context_ptr); + + // Release command queue + if (command_queue) { + err = clFinish(command_queue); + cl_ensure_nothrow(err, "clFinish()"); + err = clReleaseCommandQueue(command_queue); + cl_ensure_nothrow(err, "clReleaseCommandQueue()"); + command_queue = NULL; + } + + // Release context + if (context) { + err = clReleaseContext(context); + cl_ensure_nothrow(err, "clReleaseContext()"); + context = NULL; + } } // Destructor -device::~device() -{ - - hpx::threads::executors::default_executor exec( - hpx::threads::thread_priority_normal, - hpx::threads::thread_stacksize_medium); - - // run dectructor in a thread, as we need it to run on a large stack size - hpx::threads::async_execute( exec, &device_cleanup, (uintptr_t)command_queue, - (uintptr_t)context).wait(); - +device::~device() { + hpx::threads::executors::default_executor exec( + hpx::threads::thread_priority_normal, + hpx::threads::thread_stacksize_medium); + + // run dectructor in a thread, as we need it to run on a large stack size + hpx::threads::async_execute(exec, &device_cleanup, (uintptr_t)command_queue, + (uintptr_t)context) + .wait(); } // Initialization function. // Needed because cl_device_id can not be serialized. -void -device::init(cl_device_id _device_id, bool enable_profiling) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - this->device_id = _device_id; - - cl_int err; - - // Retrieve platformID - err = clGetDeviceInfo(this->device_id, CL_DEVICE_PLATFORM, - sizeof(platform_id), &platform_id, NULL); - cl_ensure(err, "clGetDeviceInfo()"); - - // Create Context - cl_context_properties context_properties[] = - {CL_CONTEXT_PLATFORM, - (cl_context_properties) platform_id, - 0}; - context = clCreateContext(context_properties, - 1, - &this->device_id, - error_callback, - this, - &err); - cl_ensure(err, "clCreateContext()"); - - // Get supported device queue properties - hpx::serialization::serialize_buffer supported_queue_properties_data = - get_device_info(CL_DEVICE_QUEUE_PROPERTIES); - cl_command_queue_properties supported_queue_properties = - *( reinterpret_cast( - supported_queue_properties_data.data())); - - // Initialize command queue properties - cl_command_queue_properties command_queue_properties = 0; - - // If supported, add OUT_OF_ORDER_EXEC_MODE - if(supported_queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) - command_queue_properties |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; - - // If supported and wanted, add PROFILING - if(enable_profiling && - (supported_queue_properties & CL_QUEUE_PROFILING_ENABLE)) - command_queue_properties |= CL_QUEUE_PROFILING_ENABLE; - - // Create Command Queue - #ifdef CL_VERSION_2_0 - cl_queue_properties queue_properties[] = {CL_QUEUE_PROPERTIES, - (cl_queue_properties) command_queue_properties, - (cl_queue_properties) 0}; - command_queue = clCreateCommandQueueWithProperties(context, device_id, - queue_properties, &err); - cl_ensure(err, "clCreateCommandQueueWithProperties()"); - #else - command_queue = clCreateCommandQueue(context, device_id, - command_queue_properties, &err); - cl_ensure(err, "clCreateCommandQueue()"); - #endif +void device::init(cl_device_id _device_id, bool enable_profiling) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + this->device_id = _device_id; + + cl_int err; + + // Retrieve platformID + err = clGetDeviceInfo(this->device_id, CL_DEVICE_PLATFORM, + sizeof(platform_id), &platform_id, NULL); + cl_ensure(err, "clGetDeviceInfo()"); + + // Create Context + cl_context_properties context_properties[] = { + CL_CONTEXT_PLATFORM, (cl_context_properties)platform_id, 0}; + context = clCreateContext(context_properties, 1, &this->device_id, + error_callback, this, &err); + cl_ensure(err, "clCreateContext()"); + + // Get supported device queue properties + hpx::serialization::serialize_buffer supported_queue_properties_data = + get_device_info(CL_DEVICE_QUEUE_PROPERTIES); + cl_command_queue_properties supported_queue_properties = + *(reinterpret_cast( + supported_queue_properties_data.data())); + + // Initialize command queue properties + cl_command_queue_properties command_queue_properties = 0; + + // If supported, add OUT_OF_ORDER_EXEC_MODE + if (supported_queue_properties & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) + command_queue_properties |= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; + + // If supported and wanted, add PROFILING + if (enable_profiling && + (supported_queue_properties & CL_QUEUE_PROFILING_ENABLE)) + command_queue_properties |= CL_QUEUE_PROFILING_ENABLE; + +// Create Command Queue +#ifdef CL_VERSION_2_0 + cl_queue_properties queue_properties[] = { + CL_QUEUE_PROPERTIES, (cl_queue_properties)command_queue_properties, + (cl_queue_properties)0}; + command_queue = clCreateCommandQueueWithProperties(context, device_id, + queue_properties, &err); + cl_ensure(err, "clCreateCommandQueueWithProperties()"); +#else + command_queue = + clCreateCommandQueue(context, device_id, command_queue_properties, &err); + cl_ensure(err, "clCreateCommandQueue()"); +#endif } +hpx::serialization::serialize_buffer device::get_device_info( + cl_device_info info_type) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); -hpx::serialization::serialize_buffer -device::get_device_info(cl_device_info info_type) -{ + // Declairing the cl error code variable + cl_int err; - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + // Query for size + std::size_t param_size; + err = clGetDeviceInfo(device_id, info_type, 0, NULL, ¶m_size); + cl_ensure(err, "clGetDeviceInfo()"); - // Declairing the cl error code variable - cl_int err; - - // Query for size - std::size_t param_size; - err = clGetDeviceInfo(device_id, info_type, 0, NULL, ¶m_size); - cl_ensure(err, "clGetDeviceInfo()"); - - // Retrieve - hpx::serialization::serialize_buffer info( param_size ); - err = clGetDeviceInfo(device_id, info_type, info.size(), info.data(), 0); - cl_ensure(err, "clGetDeviceInfo()"); - - // Return - return info; + // Retrieve + hpx::serialization::serialize_buffer info(param_size); + err = clGetDeviceInfo(device_id, info_type, info.size(), info.data(), 0); + cl_ensure(err, "clGetDeviceInfo()"); + // Return + return info; } +hpx::serialization::serialize_buffer device::get_platform_info( + cl_platform_info info_type) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); -hpx::serialization::serialize_buffer -device::get_platform_info(cl_platform_info info_type) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - // Declairing the cl error code variable - cl_int err; + // Declairing the cl error code variable + cl_int err; - // Query for size - std::size_t param_size; - err = clGetPlatformInfo(platform_id, info_type, 0, NULL, ¶m_size); - cl_ensure(err, "clGetPlatformInfo()"); + // Query for size + std::size_t param_size; + err = clGetPlatformInfo(platform_id, info_type, 0, NULL, ¶m_size); + cl_ensure(err, "clGetPlatformInfo()"); - // Retrieve - hpx::serialization::serialize_buffer info( param_size ); - err = clGetPlatformInfo(platform_id, info_type, info.size(), info.data(), 0); - cl_ensure(err, "clGetPlatformInfo()"); - - // Return - return info; + // Retrieve + hpx::serialization::serialize_buffer info(param_size); + err = clGetPlatformInfo(platform_id, info_type, info.size(), info.data(), 0); + cl_ensure(err, "clGetPlatformInfo()"); + // Return + return info; } - -void CL_CALLBACK -device::error_callback(const char* errinfo, const void* info, std::size_t info_size, - void* _thisp) -{ - device* thisp = (device*) _thisp; - hpx::cerr << "device(" << thisp->device_id << "): CONTEXT_ERROR: " - << errinfo << hpx::endl; +void CL_CALLBACK device::error_callback(const char* errinfo, const void* info, + std::size_t info_size, void* _thisp) { + device* thisp = (device*)_thisp; + hpx::cerr << "device(" << thisp->device_id << "): CONTEXT_ERROR: " << errinfo + << hpx::endl; } -cl_context -device::get_context() -{ - return context; -} - -cl_device_id -device::get_device_id() -{ - return device_id; -} +cl_context device::get_context() { return context; } -hpx::id_type -device::create_buffer( cl_mem_flags flags, std::size_t size ) -{ +cl_device_id device::get_device_id() { return device_id; } - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); +hpx::id_type device::create_buffer(cl_mem_flags flags, std::size_t size) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - // Create new buffer - hpx::id_type buf = hpx::components::new_ - ( hpx::find_here() ).get(); + // Create new buffer + hpx::id_type buf = + hpx::components::new_(hpx::find_here()) + .get(); - // Initialize buffer locally - std::shared_ptr buffer_server = - hpx::get_ptr( buf ).get(); + // Initialize buffer locally + std::shared_ptr buffer_server = + hpx::get_ptr(buf).get(); - buffer_server->init(get_id(), flags, size); + buffer_server->init(get_id(), flags, size); - return buf; + return buf; } -hpx::id_type -device::create_program_with_source( - hpx::serialization::serialize_buffer src ) -{ +hpx::id_type device::create_program_with_source( + hpx::serialization::serialize_buffer src) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + // Create new program + hpx::id_type prog = + hpx::components::new_(hpx::find_here()) + .get(); - // Create new program - hpx::id_type prog = hpx::components::new_ - ( hpx::find_here() ).get(); + // Initialize buffer locally + std::shared_ptr program_server = + hpx::get_ptr(prog).get(); - // Initialize buffer locally - std::shared_ptr program_server = - hpx::get_ptr( prog ).get(); + program_server->init_with_source(get_id(), src); - program_server->init_with_source( get_id(), src ); - - return prog; + return prog; } -hpx::id_type -device::create_program_with_binary( - hpx::serialization::serialize_buffer binary ) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); +hpx::id_type device::create_program_with_binary( + hpx::serialization::serialize_buffer binary) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - // Create new program - hpx::id_type prog = hpx::components::new_ - ( hpx::find_here() ).get(); + // Create new program + hpx::id_type prog = + hpx::components::new_(hpx::find_here()) + .get(); - // Initialize buffer locally - std::shared_ptr program_server = - hpx::get_ptr( prog ).get(); + // Initialize buffer locally + std::shared_ptr program_server = + hpx::get_ptr(prog).get(); - program_server->init_with_binary( get_id(), binary ); + program_server->init_with_binary(get_id(), binary); - return prog; + return prog; } -void -device::release_event(hpx::naming::gid_type gid) -{ - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - // release data registered on event - delete_event_data(event_map.get(gid)); +void device::release_event(hpx::naming::gid_type gid) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - // delete event from map - event_map.remove(gid); + // release data registered on event + delete_event_data(event_map.get(gid)); + // delete event from map + event_map.remove(gid); } +void device::delete_event(cl_event event) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); -void -device::delete_event( cl_event event ) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - // delete the actual cl_event object - cl_int err; - err = clReleaseEvent(event); - cl_ensure(err, "clReleaseEvent()"); - -} - -void -device::register_event( const hpx::naming::id_type & gid, cl_event event ) -{ - - // Add pair to event_map - event_map.add(gid, event); - + // delete the actual cl_event object + cl_int err; + err = clReleaseEvent(event); + cl_ensure(err, "clReleaseEvent()"); } -cl_event -device::retrieve_event( const hpx::naming::id_type & gid ) -{ - - // Get event from event_map - return event_map.get(gid); - +void device::register_event(const hpx::naming::id_type& gid, cl_event event) { + // Add pair to event_map + event_map.add(gid, event); } - -cl_command_queue -device::get_read_command_queue() -{ - return command_queue; +cl_event device::retrieve_event(const hpx::naming::id_type& gid) { + // Get event from event_map + return event_map.get(gid); } -cl_command_queue -device::get_write_command_queue() -{ - return command_queue; -} +cl_command_queue device::get_read_command_queue() { return command_queue; } -cl_command_queue -device::get_kernel_command_queue() -{ - return command_queue; -} +cl_command_queue device::get_write_command_queue() { return command_queue; } -void -device::wait_for_cl_event(cl_event event) -{ - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); +cl_command_queue device::get_kernel_command_queue() { return command_queue; } - cl_int err; - cl_int execution_state = CL_RUNNING; +void device::wait_for_cl_event(cl_event event) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - // Loop until the event state turns to true. - // Previous attempts used clSetEventCallback, but it turned out to - // be really slow. - for(std::size_t i = 0; execution_state != CL_COMPLETE; i++){ + cl_int err; + cl_int execution_state = CL_RUNNING; - // Do exponential backup - //hpx::util::yield_while(i); + // Loop until the event state turns to true. + // Previous attempts used clSetEventCallback, but it turned out to + // be really slow. + for (std::size_t i = 0; execution_state != CL_COMPLETE; i++) { + // Do exponential backup + // hpx::util::yield_while(i); - hpx::util::yield_while([i]() - { return i; }); + hpx::util::yield_while([i]() { return i; }); + // Query OpenCL for the event state + err = clGetEventInfo(event, CL_EVENT_COMMAND_EXECUTION_STATUS, + sizeof(cl_int), &execution_state, NULL); + cl_ensure(err, "clGetEventInfo"); + } - - // Query OpenCL for the event state - err = clGetEventInfo( event, CL_EVENT_COMMAND_EXECUTION_STATUS, - sizeof(cl_int), &execution_state, NULL ); - cl_ensure(err, "clGetEventInfo"); - - } - - // Check for internal errors - cl_ensure(execution_state, "OpenCL Internal Error!"); - + // Check for internal errors + cl_ensure(execution_state, "OpenCL Internal Error!"); } -void -device::delete_event_data(cl_event event) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); +void device::delete_event_data(cl_event event) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - // return if no data is registered - if(!event_data_map.has_data(event)) - return; + // return if no data is registered + if (!event_data_map.has_data(event)) return; - // wait for event to trigger (clEnqueueX-call could still be using - // the memory) - wait_for_cl_event(event); - - // release the data - event_data_map.remove(event); + // wait for event to trigger (clEnqueueX-call could still be using + // the memory) + wait_for_cl_event(event); + // release the data + event_data_map.remove(event); } -void -device::activate_deferred_event(hpx::naming::id_type event_id) -{ - // get the cl_event - cl_event event = event_map.get(event_id); - - // wait for the cl_event to complete - wait_for_cl_event(event); +void device::activate_deferred_event(hpx::naming::id_type event_id) { + // get the cl_event + cl_event event = event_map.get(event_id); - // trigger the client event - hpx::trigger_lco_event(event_id, false); + // wait for the cl_event to complete + wait_for_cl_event(event); + // trigger the client event + hpx::trigger_lco_event(event_id, false); } -void -device::activate_deferred_event_with_data(hpx::naming::id_type event_id) -{ - - // get the cl_event - cl_event event = event_map.get(event_id); - - // find the data associated with the event - auto data = event_data_map.get(event); +void device::activate_deferred_event_with_data(hpx::naming::id_type event_id) { + // get the cl_event + cl_event event = event_map.get(event_id); - // wait for the event to trigger - wait_for_cl_event(event); + // find the data associated with the event + auto data = event_data_map.get(event); - // send the data to the client - data.send_data_to_client(event_id); + // wait for the event to trigger + wait_for_cl_event(event); + // send the data to the client + data.send_data_to_client(event_id); } diff --git a/opencl/server/kernel.hpp b/opencl/server/kernel.hpp index 34c77e08..3c52a0a1 100644 --- a/opencl/server/kernel.hpp +++ b/opencl/server/kernel.hpp @@ -7,7 +7,6 @@ #ifndef HPX_OPENCL_SERVER_KERNEL_HPP #define HPX_OPENCL_SERVER_KERNEL_HPP - #include #include @@ -18,63 +17,61 @@ // REGISTER_ACTION_DECLARATION templates #include "util/server_definitions.hpp" -namespace hpx { namespace opencl{ namespace server{ - - // ///////////////////////////////////////////////////// - // This class represents an opencl kernel. - - class HPX_OPENCL_EXPORT kernel - : public hpx::components::managed_component_base - { - public: - - // Constructor - kernel(); - // Destructor - ~kernel(); - - /////////////////////////////////////////////////// - /// Local functions - /// - void init ( hpx::naming::id_type device_id, cl_program program, - std::string kernel_name ); - - ////////////////////////////////////////////////// - /// Exposed functionality of this component - /// - - // Returns the parent device - hpx::naming::id_type get_parent_device_id(); - - // Sets an argument of the kernel - void set_arg(cl_uint arg_index, hpx::naming::id_type buffer); - - // Runs the kernel - void enqueue( hpx::naming::id_type && event_gid, - std::vector size, - std::vector && dependencies ); - - HPX_DEFINE_COMPONENT_ACTION(kernel, get_parent_device_id); - HPX_DEFINE_COMPONENT_ACTION(kernel, set_arg); - HPX_DEFINE_COMPONENT_ACTION(kernel, enqueue); - - ////////////////////////////////////////////////// - // Private Member Functions - // - private: - - - ////////////////////////////////////////////////// - // Private Member Variables - // - private: - std::shared_ptr parent_device; - cl_kernel kernel_id; - hpx::naming::id_type parent_device_id; - - }; - -}}} +namespace hpx { +namespace opencl { +namespace server { + +// ///////////////////////////////////////////////////// +// This class represents an opencl kernel. + +class HPX_OPENCL_EXPORT kernel + : public hpx::components::managed_component_base { + public: + // Constructor + kernel(); + // Destructor + ~kernel(); + + /////////////////////////////////////////////////// + /// Local functions + /// + void init(hpx::naming::id_type device_id, cl_program program, + std::string kernel_name); + + ////////////////////////////////////////////////// + /// Exposed functionality of this component + /// + + // Returns the parent device + hpx::naming::id_type get_parent_device_id(); + + // Sets an argument of the kernel + void set_arg(cl_uint arg_index, hpx::naming::id_type buffer); + + // Runs the kernel + void enqueue(hpx::naming::id_type&& event_gid, std::vector size, + std::vector&& dependencies); + + HPX_DEFINE_COMPONENT_ACTION(kernel, get_parent_device_id); + HPX_DEFINE_COMPONENT_ACTION(kernel, set_arg); + HPX_DEFINE_COMPONENT_ACTION(kernel, enqueue); + + ////////////////////////////////////////////////// + // Private Member Functions + // + private: + ////////////////////////////////////////////////// + // Private Member Variables + // + private: + std::shared_ptr parent_device; + cl_kernel kernel_id; + hpx::naming::id_type parent_device_id; +}; + +} // namespace server +} // namespace opencl +} // namespace hpx //[opencl_management_registration_declarations HPX_OPENCL_REGISTER_ACTION_DECLARATION(kernel, get_parent_device_id); diff --git a/opencl/server/kernel_server.cpp b/opencl/server/kernel_server.cpp index 746c3749..cc4e14a9 100644 --- a/opencl/server/kernel_server.cpp +++ b/opencl/server/kernel_server.cpp @@ -17,135 +17,106 @@ #include #include - using hpx::opencl::server::kernel; - // Constructor -kernel::kernel() -{} +kernel::kernel() {} // External destructor. // This is needed because OpenCL calls only run properly on large stack size. -static void kernel_cleanup(uintptr_t kernel_id_ptr) -{ +static void kernel_cleanup(uintptr_t kernel_id_ptr) { + cl_int err; - cl_int err; + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + cl_kernel kernel_id = reinterpret_cast(kernel_id_ptr); - cl_kernel kernel_id = reinterpret_cast(kernel_id_ptr); - - // Release the device memory - if(kernel_id) - { - err = clReleaseKernel(kernel_id); - cl_ensure_nothrow(err, "clReleaseKernel()"); - } + // Release the device memory + if (kernel_id) { + err = clReleaseKernel(kernel_id); + cl_ensure_nothrow(err, "clReleaseKernel()"); + } } // Destructor -kernel::~kernel() -{ - - hpx::threads::executors::default_executor exec( - hpx::threads::thread_priority_normal, - hpx::threads::thread_stacksize_medium); - - // run dectructor in a thread, as we need it to run on a large stack size - hpx::threads::async_execute( exec, &kernel_cleanup, reinterpret_cast(kernel_id)) - .wait(); - - +kernel::~kernel() { + hpx::threads::executors::default_executor exec( + hpx::threads::thread_priority_normal, + hpx::threads::thread_stacksize_medium); + + // run dectructor in a thread, as we need it to run on a large stack size + hpx::threads::async_execute(exec, &kernel_cleanup, + reinterpret_cast(kernel_id)) + .wait(); } -hpx::naming::id_type kernel::get_parent_device_id() -{ - return parent_device_id; -} +hpx::naming::id_type kernel::get_parent_device_id() { return parent_device_id; } -void -kernel::init( hpx::naming::id_type device_id, cl_program program, - std::string kernel_name ) -{ +void kernel::init(hpx::naming::id_type device_id, cl_program program, + std::string kernel_name) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + this->parent_device_id = std::move(device_id); + this->parent_device = + hpx::get_ptr(parent_device_id).get(); + this->kernel_id = NULL; - this->parent_device_id = std::move(device_id); - this->parent_device = hpx::get_ptr - (parent_device_id).get(); - this->kernel_id = NULL; - - // The opencl error variable - cl_int err; - - // Create the cl_program - kernel_id = clCreateKernel( program, kernel_name.c_str(), &err ); - cl_ensure(err, "clCreateKernel()"); + // The opencl error variable + cl_int err; + // Create the cl_program + kernel_id = clCreateKernel(program, kernel_name.c_str(), &err); + cl_ensure(err, "clCreateKernel()"); } +void kernel::set_arg(cl_uint arg_index, hpx::naming::id_type buffer_id) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + cl_int err; -void -kernel::set_arg(cl_uint arg_index, hpx::naming::id_type buffer_id) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - cl_int err; - - // Get direct pointer to buffer - auto buffer = hpx::get_ptr(buffer_id).get(); + // Get direct pointer to buffer + auto buffer = hpx::get_ptr(buffer_id).get(); - // Get cl_mem - cl_mem mem_id = buffer->get_cl_mem(); - - // Set the argument - err = clSetKernelArg(kernel_id, arg_index, sizeof(cl_mem), &mem_id); - cl_ensure(err, "clSetKernelArg()"); + // Get cl_mem + cl_mem mem_id = buffer->get_cl_mem(); + // Set the argument + err = clSetKernelArg(kernel_id, arg_index, sizeof(cl_mem), &mem_id); + cl_ensure(err, "clSetKernelArg()"); } -void -kernel::enqueue( hpx::naming::id_type && event_gid, - std::vector size_vec, - std::vector && dependencies ) -{ - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - cl_int err; - cl_event return_event; - - // retrieve the dependency cl_events - util::event_dependencies events( dependencies, parent_device.get() ); - - // retrieve the command queue - cl_command_queue command_queue = parent_device->get_kernel_command_queue(); - - // prepare args for OpenCL call - HPX_ASSERT( size_vec.size() % 3 == 0 ); - std::size_t size = size_vec.size() / 3; - std::size_t* global_work_offset = size_vec.data() + 0 * size; - std::size_t* global_work_size = size_vec.data() + 1 * size; - std::size_t* local_work_size = size_vec.data() + 2 * size; - - // If local_work_size is not specified, let the OpenCL driver decide - if(local_work_size[0] == 0){ - local_work_size = NULL; - } - - // run the OpenCL-call - err = clEnqueueNDRangeKernel( command_queue, kernel_id, - static_cast(size), - global_work_offset, - global_work_size, - local_work_size, - static_cast(events.size()), - events.get_cl_events(), - &return_event ); - cl_ensure(err, "clEnqueueNDRangeKernel()"); - - // register the cl_event to the client event - parent_device->register_event(event_gid, return_event); - +void kernel::enqueue(hpx::naming::id_type&& event_gid, + std::vector size_vec, + std::vector&& dependencies) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + cl_int err; + cl_event return_event; + + // retrieve the dependency cl_events + util::event_dependencies events(dependencies, parent_device.get()); + + // retrieve the command queue + cl_command_queue command_queue = parent_device->get_kernel_command_queue(); + + // prepare args for OpenCL call + HPX_ASSERT(size_vec.size() % 3 == 0); + std::size_t size = size_vec.size() / 3; + std::size_t* global_work_offset = size_vec.data() + 0 * size; + std::size_t* global_work_size = size_vec.data() + 1 * size; + std::size_t* local_work_size = size_vec.data() + 2 * size; + + // If local_work_size is not specified, let the OpenCL driver decide + if (local_work_size[0] == 0) { + local_work_size = NULL; + } + + // run the OpenCL-call + err = clEnqueueNDRangeKernel( + command_queue, kernel_id, static_cast(size), global_work_offset, + global_work_size, local_work_size, static_cast(events.size()), + events.get_cl_events(), &return_event); + cl_ensure(err, "clEnqueueNDRangeKernel()"); + + // register the cl_event to the client event + parent_device->register_event(event_gid, return_event); } - diff --git a/opencl/server/program.hpp b/opencl/server/program.hpp index 4572482a..990c0251 100644 --- a/opencl/server/program.hpp +++ b/opencl/server/program.hpp @@ -7,7 +7,6 @@ #ifndef HPX_OPENCL_SERVER_PROGRAM_HPP #define HPX_OPENCL_SERVER_PROGRAM_HPP - #include #include @@ -18,74 +17,73 @@ // REGISTER_ACTION_DECLARATION templates #include "util/server_definitions.hpp" -namespace hpx { namespace opencl{ namespace server{ - - // ///////////////////////////////////////////////////// - // This class represents an opencl program. - - class HPX_OPENCL_EXPORT program - : public hpx::components::managed_component_base - { - public: - - // Constructor - program(); - // Destructor - ~program(); - - /////////////////////////////////////////////////// - /// Local functions - /// - void init_with_source( hpx::naming::id_type device_id, - hpx::serialization::serialize_buffer src); - void init_with_binary( hpx::naming::id_type device_id, - hpx::serialization::serialize_buffer binary); - - ////////////////////////////////////////////////// - /// Exposed functionality of this component - /// - - // Returns the parent device - hpx::naming::id_type get_parent_device_id(); - - // builds the program. - // mutually exclusive to compile() and link(). - void build(std::string options); - - // Returns the binary representation of the program - hpx::serialization::serialize_buffer get_binary(); - - // creates a kernel from the buffer - hpx::naming::id_type create_kernel(std::string kernel_name); - - HPX_DEFINE_COMPONENT_ACTION(program, get_parent_device_id); - HPX_DEFINE_COMPONENT_ACTION(program, build); - HPX_DEFINE_COMPONENT_ACTION(program, get_binary); - HPX_DEFINE_COMPONENT_ACTION(program, create_kernel); - - ////////////////////////////////////////////////// - // Private Member Functions - // - private: - - // returns the build log - std::string acquire_build_log(); - - // checks for build errors - void throw_on_build_errors(const char* function_name); - - - ////////////////////////////////////////////////// - // Private Member Variables - // - private: - std::shared_ptr parent_device; - cl_program program_id; - hpx::naming::id_type parent_device_id; - - }; - -}}} +namespace hpx { +namespace opencl { +namespace server { + +// ///////////////////////////////////////////////////// +// This class represents an opencl program. + +class HPX_OPENCL_EXPORT program + : public hpx::components::managed_component_base { + public: + // Constructor + program(); + // Destructor + ~program(); + + /////////////////////////////////////////////////// + /// Local functions + /// + void init_with_source(hpx::naming::id_type device_id, + hpx::serialization::serialize_buffer src); + void init_with_binary(hpx::naming::id_type device_id, + hpx::serialization::serialize_buffer binary); + + ////////////////////////////////////////////////// + /// Exposed functionality of this component + /// + + // Returns the parent device + hpx::naming::id_type get_parent_device_id(); + + // builds the program. + // mutually exclusive to compile() and link(). + void build(std::string options); + + // Returns the binary representation of the program + hpx::serialization::serialize_buffer get_binary(); + + // creates a kernel from the buffer + hpx::naming::id_type create_kernel(std::string kernel_name); + + HPX_DEFINE_COMPONENT_ACTION(program, get_parent_device_id); + HPX_DEFINE_COMPONENT_ACTION(program, build); + HPX_DEFINE_COMPONENT_ACTION(program, get_binary); + HPX_DEFINE_COMPONENT_ACTION(program, create_kernel); + + ////////////////////////////////////////////////// + // Private Member Functions + // + private: + // returns the build log + std::string acquire_build_log(); + + // checks for build errors + void throw_on_build_errors(const char* function_name); + + ////////////////////////////////////////////////// + // Private Member Variables + // + private: + std::shared_ptr parent_device; + cl_program program_id; + hpx::naming::id_type parent_device_id; +}; + +} // namespace server +} // namespace opencl +} // namespace hpx //[opencl_management_registration_declarations HPX_OPENCL_REGISTER_ACTION_DECLARATION(program, get_parent_device_id); diff --git a/opencl/server/program_server.cpp b/opencl/server/program_server.cpp index 77b77cc2..6ba68503 100644 --- a/opencl/server/program_server.cpp +++ b/opencl/server/program_server.cpp @@ -20,296 +20,256 @@ using hpx::opencl::server::program; - // Constructor -program::program() -{} +program::program() {} // External destructor. // This is needed because OpenCL calls only run properly on large stack size. -static void program_cleanup(uintptr_t program_id_ptr) -{ - - cl_int err; +static void program_cleanup(uintptr_t program_id_ptr) { + cl_int err; - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - cl_program program_id = reinterpret_cast(program_id_ptr); + cl_program program_id = reinterpret_cast(program_id_ptr); - // Release the device memory - if(program_id) - { - err = clReleaseProgram(program_id); - cl_ensure_nothrow(err, "clReleaseProgram()"); - } + // Release the device memory + if (program_id) { + err = clReleaseProgram(program_id); + cl_ensure_nothrow(err, "clReleaseProgram()"); + } } // Destructor -program::~program() -{ - - hpx::threads::executors::default_executor exec( - hpx::threads::thread_priority_normal, - hpx::threads::thread_stacksize_medium); - - // run dectructor in a thread, as we need it to run on a large stack size - hpx::threads::async_execute( exec, &program_cleanup, reinterpret_cast(program_id)) - .wait(); - - +program::~program() { + hpx::threads::executors::default_executor exec( + hpx::threads::thread_priority_normal, + hpx::threads::thread_stacksize_medium); + + // run dectructor in a thread, as we need it to run on a large stack size + hpx::threads::async_execute(exec, &program_cleanup, + reinterpret_cast(program_id)) + .wait(); } -hpx::naming::id_type program::get_parent_device_id() -{ - return parent_device_id; +hpx::naming::id_type program::get_parent_device_id() { + return parent_device_id; } -void -program::init_with_source( hpx::naming::id_type device_id, - hpx::serialization::serialize_buffer src ) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - this->parent_device_id = std::move(device_id); - this->parent_device = hpx::get_ptr - (parent_device_id).get(); - this->program_id = NULL; - - // Retrieve the context from parent class - cl_context context = parent_device->get_context(); - - // The opencl error variable - cl_int err; - - // Set up data for OpenCL call - HPX_ASSERT(src.size() > 0); - std::size_t src_size = src.size(); - const char* src_data = src.data(); - if(src_data[src_size - 1] == '\0'){ - // Decrease one if zero-terminated, as - // OpenCL specifies 'length of source string excluding null terminator' - src_size --; - } - - // Create the cl_program - program_id = clCreateProgramWithSource( context, 1, &src_data, &src_size, - &err ); - cl_ensure(err, "clCreateProgramWithSource()"); - +void program::init_with_source(hpx::naming::id_type device_id, + hpx::serialization::serialize_buffer src) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + this->parent_device_id = std::move(device_id); + this->parent_device = + hpx::get_ptr(parent_device_id).get(); + this->program_id = NULL; + + // Retrieve the context from parent class + cl_context context = parent_device->get_context(); + + // The opencl error variable + cl_int err; + + // Set up data for OpenCL call + HPX_ASSERT(src.size() > 0); + std::size_t src_size = src.size(); + const char* src_data = src.data(); + if (src_data[src_size - 1] == '\0') { + // Decrease one if zero-terminated, as + // OpenCL specifies 'length of source string excluding null terminator' + src_size--; + } + + // Create the cl_program + program_id = + clCreateProgramWithSource(context, 1, &src_data, &src_size, &err); + cl_ensure(err, "clCreateProgramWithSource()"); } -void -program::init_with_binary( hpx::naming::id_type device_id, - hpx::serialization::serialize_buffer binary ) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - this->parent_device_id = std::move(device_id); - this->parent_device = hpx::get_ptr - (parent_device_id).get(); - this->program_id = NULL; - - // Retrieve the context from parent class - cl_context context = parent_device->get_context(); - - // The opencl error variable - cl_int err; - - // Set up data for OpenCL call - HPX_ASSERT(binary.size() > 0); - cl_device_id device = parent_device->get_device_id(); - const std::size_t size = binary.size(); - const unsigned char* bin = reinterpret_cast(binary.data()); - - // Create the cl_program - cl_int binary_status; - program_id = clCreateProgramWithBinary( context, - 1, &device, &size, &bin, - &binary_status, &err ); - cl_ensure(err, "clCreateProgramWithBinary()"); - cl_ensure(binary_status, "clCreateProgramWithBinary().binary_status"); - +void program::init_with_binary( + hpx::naming::id_type device_id, + hpx::serialization::serialize_buffer binary) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + this->parent_device_id = std::move(device_id); + this->parent_device = + hpx::get_ptr(parent_device_id).get(); + this->program_id = NULL; + + // Retrieve the context from parent class + cl_context context = parent_device->get_context(); + + // The opencl error variable + cl_int err; + + // Set up data for OpenCL call + HPX_ASSERT(binary.size() > 0); + cl_device_id device = parent_device->get_device_id(); + const std::size_t size = binary.size(); + const unsigned char* bin = reinterpret_cast(binary.data()); + + // Create the cl_program + cl_int binary_status; + program_id = clCreateProgramWithBinary(context, 1, &device, &size, &bin, + &binary_status, &err); + cl_ensure(err, "clCreateProgramWithBinary()"); + cl_ensure(binary_status, "clCreateProgramWithBinary().binary_status"); } -std::string -program::acquire_build_log() -{ - cl_int err; - - std::size_t build_log_size; - - // Query size - err = clGetProgramBuildInfo(program_id, parent_device->get_device_id(), - CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size); - - // Create buffer - std::vector buf(build_log_size); - - // Get log - err = clGetProgramBuildInfo(program_id, parent_device->get_device_id(), - CL_PROGRAM_BUILD_LOG, build_log_size, - buf.data(), NULL); - - // make build log look nice in exception - std::stringstream sstream; - sstream << std::endl << std::endl; - sstream << "//////////////////////////////////////" << std::endl; - sstream << "/// OPENCL BUILD LOG" << std::endl; - sstream << "///" << std::endl; - sstream << std::endl << buf.data() << std::endl; - sstream << "///" << std::endl; - sstream << "/// OPENCL BUILD LOG END" << std::endl; - sstream << "//////////////////////////////////////" << std::endl; - sstream << std::endl; - - // return the nice looking error string. - return sstream.str(); - +std::string program::acquire_build_log() { + cl_int err; + + std::size_t build_log_size; + + // Query size + err = clGetProgramBuildInfo(program_id, parent_device->get_device_id(), + CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size); + + // Create buffer + std::vector buf(build_log_size); + + // Get log + err = clGetProgramBuildInfo(program_id, parent_device->get_device_id(), + CL_PROGRAM_BUILD_LOG, build_log_size, buf.data(), + NULL); + + // make build log look nice in exception + std::stringstream sstream; + sstream << std::endl << std::endl; + sstream << "//////////////////////////////////////" << std::endl; + sstream << "/// OPENCL BUILD LOG" << std::endl; + sstream << "///" << std::endl; + sstream << std::endl << buf.data() << std::endl; + sstream << "///" << std::endl; + sstream << "/// OPENCL BUILD LOG END" << std::endl; + sstream << "//////////////////////////////////////" << std::endl; + sstream << std::endl; + + // return the nice looking error string. + return sstream.str(); } -void -program::throw_on_build_errors(const char* function_name){ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); +void program::throw_on_build_errors(const char* function_name) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - cl_int err; - cl_build_status build_status; + cl_int err; + cl_build_status build_status; - // Read build status - err = clGetProgramBuildInfo( program_id, parent_device->get_device_id(), - CL_PROGRAM_BUILD_STATUS, - sizeof(cl_build_status), &build_status, NULL ); - cl_ensure(err, "clGetProgramBuildInfo()"); + // Read build status + err = clGetProgramBuildInfo(program_id, parent_device->get_device_id(), + CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), + &build_status, NULL); + cl_ensure(err, "clGetProgramBuildInfo()"); - // Throw if build did not succeed - if(build_status != CL_BUILD_SUCCESS) - { - HPX_THROW_EXCEPTION(hpx::no_success, function_name, - std::string("A build error occured!") + - acquire_build_log()); - } + // Throw if build did not succeed + if (build_status != CL_BUILD_SUCCESS) { + HPX_THROW_EXCEPTION( + hpx::no_success, function_name, + std::string("A build error occured!") + acquire_build_log()); + } } -struct build_callback_args{ - hpx::runtime* rt; - hpx::lcos::local::promise* promise; +struct build_callback_args { + hpx::runtime* rt; + hpx::lcos::local::promise* promise; }; -static void CL_CALLBACK -build_callback( cl_program program_id, void* user_data ) -{ - // Cast arguments - build_callback_args* args = - static_cast(user_data); +static void CL_CALLBACK build_callback(cl_program program_id, void* user_data) { + // Cast arguments + build_callback_args* args = static_cast(user_data); - // Send exec status to waiting future - using hpx::opencl::server::util::set_promise_from_external; - set_promise_from_external ( args->rt, args->promise ); + // Send exec status to waiting future + using hpx::opencl::server::util::set_promise_from_external; + set_promise_from_external(args->rt, args->promise); } -void -program::build(std::string options) -{ - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); +void program::build(std::string options) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - cl_int err; + cl_int err; - // fetch device id from parent device - cl_device_id device_id = parent_device->get_device_id(); + // fetch device id from parent device + cl_device_id device_id = parent_device->get_device_id(); - // Create a new promise - hpx::lcos::local::promise promise; + // Create a new promise + hpx::lcos::local::promise promise; - // Retrieve the future - hpx::future future = promise.get_future(); + // Retrieve the future + hpx::future future = promise.get_future(); - // Create args for build_callback - build_callback_args args; - args.rt = hpx::get_runtime_ptr(); - args.promise = &promise; + // Create args for build_callback + build_callback_args args; + args.rt = hpx::get_runtime_ptr(); + args.promise = &promise; - // Initialize compilation - err = clBuildProgram( program_id, 1, &device_id, options.c_str(), - &build_callback, &args ); + // Initialize compilation + err = clBuildProgram(program_id, 1, &device_id, options.c_str(), + &build_callback, &args); - // ignore CL_BUILD_PROGRAM_FAILURE. - // we handle this case in throw_on_build_errors() - if(err != CL_BUILD_PROGRAM_FAILURE) - cl_ensure(err, "clBuildProgram()"); + // ignore CL_BUILD_PROGRAM_FAILURE. + // we handle this case in throw_on_build_errors() + if (err != CL_BUILD_PROGRAM_FAILURE) cl_ensure(err, "clBuildProgram()"); - // Wait for compilation to finish - future.wait(); - - // check build status - throw_on_build_errors("clBuildProgram()"); + // Wait for compilation to finish + future.wait(); + // check build status + throw_on_build_errors("clBuildProgram()"); } -hpx::serialization::serialize_buffer -program::get_binary() -{ - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - typedef hpx::serialization::serialize_buffer buffer_type; - cl_int err; - - // get number of devices - cl_uint num_devices; - err = clGetProgramInfo(program_id, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), - &num_devices, NULL); - cl_ensure(err, "clGetProgramInfo()"); - - // ensure that only one device is associated - if(num_devices != 1) - { - HPX_THROW_EXCEPTION(hpx::internal_server_error, "program::get_binary()", - "Internal Error: More than one device linked!"); - } - - // get binary size - std::size_t binary_size; - err = clGetProgramInfo(program_id, CL_PROGRAM_BINARY_SIZES, - sizeof(std::size_t), &binary_size, NULL); - cl_ensure(err, "clGetProgramInfo()"); - - // ensure that there actually is binary code - if(binary_size == 0) - { - HPX_THROW_EXCEPTION(hpx::no_success, "program::get_binary()", - "Unable to fetch binary code!"); - } - - // get binary code - buffer_type binary( binary_size ); - char* binary_ptr = binary.data(); - err = clGetProgramInfo( program_id, CL_PROGRAM_BINARIES, - sizeof(unsigned char*), - &binary_ptr, - NULL ); - cl_ensure(err, "clGetProgramInfo()"); - - // return vector - return binary; - +hpx::serialization::serialize_buffer program::get_binary() { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); + + typedef hpx::serialization::serialize_buffer buffer_type; + cl_int err; + + // get number of devices + cl_uint num_devices; + err = clGetProgramInfo(program_id, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), + &num_devices, NULL); + cl_ensure(err, "clGetProgramInfo()"); + + // ensure that only one device is associated + if (num_devices != 1) { + HPX_THROW_EXCEPTION(hpx::internal_server_error, "program::get_binary()", + "Internal Error: More than one device linked!"); + } + + // get binary size + std::size_t binary_size; + err = clGetProgramInfo(program_id, CL_PROGRAM_BINARY_SIZES, + sizeof(std::size_t), &binary_size, NULL); + cl_ensure(err, "clGetProgramInfo()"); + + // ensure that there actually is binary code + if (binary_size == 0) { + HPX_THROW_EXCEPTION(hpx::no_success, "program::get_binary()", + "Unable to fetch binary code!"); + } + + // get binary code + buffer_type binary(binary_size); + char* binary_ptr = binary.data(); + err = clGetProgramInfo(program_id, CL_PROGRAM_BINARIES, + sizeof(unsigned char*), &binary_ptr, NULL); + cl_ensure(err, "clGetProgramInfo()"); + + // return vector + return binary; } -hpx::naming::id_type -program::create_kernel(std::string kernel_name) -{ - - HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - - // Create new kernel - hpx::id_type kernel = hpx::components::new_ - ( hpx::find_here() ).get(); +hpx::naming::id_type program::create_kernel(std::string kernel_name) { + HPX_ASSERT(hpx::opencl::tools::runs_on_medium_stack()); - // Initialize kernel locally - auto kernel_server = hpx::get_ptr(kernel).get(); + // Create new kernel + hpx::id_type kernel = + hpx::components::new_(hpx::find_here()) + .get(); - kernel_server->init(parent_device_id, program_id, kernel_name); + // Initialize kernel locally + auto kernel_server = hpx::get_ptr(kernel).get(); - return kernel; + kernel_server->init(parent_device_id, program_id, kernel_name); + return kernel; } diff --git a/opencl/server/util/data_map.cpp b/opencl/server/util/data_map.cpp index 0d9fd0d9..d9469907 100644 --- a/opencl/server/util/data_map.cpp +++ b/opencl/server/util/data_map.cpp @@ -6,85 +6,67 @@ // The Header of this class #include "data_map.hpp" - using hpx::opencl::server::util::data_map; using hpx::opencl::server::util::data_map_entry; -data_map::data_map(){ +data_map::data_map() {} +data_map::~data_map() { + // Correct use removes all entries from this map before deletion + HPX_ASSERT(map.empty()); } -data_map::~data_map(){ - // Correct use removes all entries from this map before deletion - HPX_ASSERT(map.empty()); -} +void data_map::remove(cl_event event) { + // Lock + // lock_type::scoped_lock l(lock); + std::lock_guard lock(this->m); + // Remove element + map.erase(event); +} +data_map_entry data_map::get(cl_event event) { + data_map_entry result; -void -data_map::remove(cl_event event) -{ + // get data from the map + { // Lock - //lock_type::scoped_lock l(lock); - std::lock_guard lock(this->m); - - // Remove element - map.erase(event); -} + // lock_type::scoped_lock l(lock); + std::lock_guard lock(this->m); + // Retrieve the data from the map + map_type::iterator it = map.find(event); -data_map_entry -data_map::get(cl_event event) -{ - data_map_entry result; - - // get data from the map - { - // Lock - //lock_type::scoped_lock l(lock); - std::lock_guard lock(this->m); - - // Retrieve the data from the map - map_type::iterator it = map.find(event); - - // Make sure the data actually exists - HPX_ASSERT(it != map.end()); - - // Get the data entry - result = it->second; - } - - return result; -} + // Make sure the data actually exists + HPX_ASSERT(it != map.end()); -bool -data_map::has_data(cl_event event) -{ + // Get the data entry + result = it->second; + } - bool result = true; - { - // Lock - //lock_type::scoped_lock l(lock); - std::lock_guard lock(this->m); + return result; +} +bool data_map::has_data(cl_event event) { + bool result = true; + { + // Lock + // lock_type::scoped_lock l(lock); + std::lock_guard lock(this->m); - // Try to find the entry - map_type::iterator it = map.find(event); + // Try to find the entry + map_type::iterator it = map.find(event); - // Check wether or not we found the entry - if(it == map.end()) - result = false; - } + // Check wether or not we found the entry + if (it == map.end()) result = false; + } - return result; + return result; } -void -data_map_entry::send_data_to_client(const hpx::naming::id_type& client_event) -{ - // no synchronization necessary, should only get called once - // (at least the client event has to make sure this only gets called once) - send_callback(client_event); +void data_map_entry::send_data_to_client( + const hpx::naming::id_type& client_event) { + // no synchronization necessary, should only get called once + // (at least the client event has to make sure this only gets called once) + send_callback(client_event); } - - diff --git a/opencl/server/util/data_map.hpp b/opencl/server/util/data_map.hpp index f82ad61c..15251ae5 100644 --- a/opencl/server/util/data_map.hpp +++ b/opencl/server/util/data_map.hpp @@ -16,103 +16,98 @@ #include //////////////////////////////////////////////////////////////// -namespace hpx { namespace opencl{ namespace server{ namespace util{ +namespace hpx { +namespace opencl { +namespace server { +namespace util { +//////////////////////////////////////////////////////// +// This class is used to hide the template parameter from serialize_buffer. +// +class data_map_entry { + private: + template + static void send_to_client_impl( + hpx::serialization::serialize_buffer data, + const hpx::naming::id_type& event_id) { + hpx::set_lco_value(event_id, data, false); + } + + public: + template + void set_data(hpx::serialization::serialize_buffer data) { + // The data itself does not need to explicitely get kept alive, + // it gets kept alive inside of the bind. + + send_callback = hpx::util::bind(&send_to_client_impl, data, + hpx::util::placeholders::_1); + } + + // Sends the data to the client event (to trigger client future) + HPX_OPENCL_EXPORT void send_data_to_client( + const hpx::naming::id_type& client_event); + + private: + hpx::util::function_nonser send_callback; +}; + +//////////////////////////////////////////////////////// +// This class is used for keeping data associated with cl_events alive. +// +class data_map { + typedef hpx::lcos::local::spinlock lock_type; + + public: + // Constructor + HPX_OPENCL_EXPORT data_map(); + HPX_OPENCL_EXPORT ~data_map(); + + ////////////////////////////////////////////////// + /// Local public functions + /// + + // Registers a data chunk to a cl_event + template + void add(cl_event event, + hpx::serialization::serialize_buffer data) { + // Strip the template from the buffer + data_map_entry entry; + entry.set_data(data); - //////////////////////////////////////////////////////// - // This class is used to hide the template parameter from serialize_buffer. - // - class data_map_entry - { - private: - template - static void - send_to_client_impl( hpx::serialization::serialize_buffer data, - const hpx::naming::id_type& event_id ) - { - hpx::set_lco_value(event_id, data, false); - } - - public: - template - void set_data(hpx::serialization::serialize_buffer data) - { - - // The data itself does not need to explicitely get kept alive, - // it gets kept alive inside of the bind. - - send_callback = hpx::util::bind(&send_to_client_impl, data, - hpx::util::placeholders::_1); - } - - // Sends the data to the client event (to trigger client future) - HPX_OPENCL_EXPORT void send_data_to_client(const hpx::naming::id_type& client_event); - - private: - hpx::util::function_nonser - send_callback; - }; - - - //////////////////////////////////////////////////////// - // This class is used for keeping data associated with cl_events alive. - // - class data_map { - typedef hpx::lcos::local::spinlock lock_type; - public: - // Constructor - HPX_OPENCL_EXPORT data_map(); - HPX_OPENCL_EXPORT ~data_map(); - - ////////////////////////////////////////////////// - /// Local public functions - /// - - // Registers a data chunk to a cl_event - template - void add( cl_event event, - hpx::serialization::serialize_buffer data ) - { - // Strip the template from the buffer - data_map_entry entry; - entry.set_data(data); - - { - // Lock the map - std::lock_guard lock(this->m); - - // Insert the data into the map - map.insert(std::move( - map_type::value_type(event, std::move(entry)) - )); - } - } - - // Returns the data entry associated with the event. - // Undefined behaviour if no data is available. - HPX_OPENCL_EXPORT data_map_entry get(cl_event event); - - // Returns bool if data is registered, and false if not - HPX_OPENCL_EXPORT bool has_data(cl_event event); - - // Deletes the data - HPX_OPENCL_EXPORT void remove(cl_event event); - - private: - /////////////////////////////////////////////// - // Private Member Variables - // - - // The actual internal datastructure - typedef std::map map_type; - map_type map; - - // Lock for synchronization - hpx::compat::mutex m; - - - }; -}}}} + // Lock the map + std::lock_guard lock(this->m); + + // Insert the data into the map + map.insert(std::move(map_type::value_type(event, std::move(entry)))); + } + } + + // Returns the data entry associated with the event. + // Undefined behaviour if no data is available. + HPX_OPENCL_EXPORT data_map_entry get(cl_event event); + + // Returns bool if data is registered, and false if not + HPX_OPENCL_EXPORT bool has_data(cl_event event); + + // Deletes the data + HPX_OPENCL_EXPORT void remove(cl_event event); + + private: + /////////////////////////////////////////////// + // Private Member Variables + // + + // The actual internal datastructure + typedef std::map map_type; + map_type map; + + // Lock for synchronization + hpx::compat::mutex m; +}; +} // namespace util +} // namespace server +} // namespace opencl +} // namespace hpx #endif diff --git a/opencl/server/util/event_dependencies.cpp b/opencl/server/util/event_dependencies.cpp index 4371a9e8..dc943c78 100644 --- a/opencl/server/util/event_dependencies.cpp +++ b/opencl/server/util/event_dependencies.cpp @@ -9,44 +9,24 @@ using hpx::opencl::server::util::event_dependencies; -event_dependencies:: -event_dependencies(const std::vector & event_ids, - hpx::opencl::server::device* parent_device) -{ - if(event_ids.size() != 0){ - - events.reserve(event_ids.size()); - - for(const auto & id : event_ids){ - events.push_back( parent_device->retrieve_event(id) ); - } - +event_dependencies::event_dependencies( + const std::vector& event_ids, + hpx::opencl::server::device* parent_device) { + if (event_ids.size() != 0) { + events.reserve(event_ids.size()); + + for (const auto& id : event_ids) { + events.push_back(parent_device->retrieve_event(id)); } + } } -event_dependencies:: -~event_dependencies() -{ - -} - -std::size_t -event_dependencies:: -size() -{ - - return events.size(); - -} +event_dependencies::~event_dependencies() {} -cl_event* -event_dependencies:: -get_cl_events() -{ +std::size_t event_dependencies::size() { return events.size(); } - if(events.size() == 0) - return NULL; - - return events.data(); +cl_event* event_dependencies::get_cl_events() { + if (events.size() == 0) return NULL; + return events.data(); } diff --git a/opencl/server/util/event_dependencies.hpp b/opencl/server/util/event_dependencies.hpp index 76a2965b..f6ba3c05 100644 --- a/opencl/server/util/event_dependencies.hpp +++ b/opencl/server/util/event_dependencies.hpp @@ -14,37 +14,40 @@ #include "../../fwd_declarations.hpp" //////////////////////////////////////////////////////////////// -namespace hpx { namespace opencl{ namespace server{ namespace util{ - - - //////////////////////////////////////////////////////// - // This class is used to convert event ids to cl_events - // - class HPX_OPENCL_EXPORT event_dependencies - { - public: - // Constructor - event_dependencies(const std::vector & event_ids, - hpx::opencl::server::device* parent_device); - ~event_dependencies(); - - ////////////////////////////////////////////////// - /// Local public functions - /// - - // Returns a pointer to a list of cl_events. Ensure that deallocation - // of this class only happens when this pointer is not needed any more! - // - // Returns NULL if size() == 0 - cl_event* get_cl_events(); - - // Returns the number of events in this list - std::size_t size(); - - private: - std::vector events; - - }; -}}}} +namespace hpx { +namespace opencl { +namespace server { +namespace util { + +//////////////////////////////////////////////////////// +// This class is used to convert event ids to cl_events +// +class HPX_OPENCL_EXPORT event_dependencies { + public: + // Constructor + event_dependencies(const std::vector& event_ids, + hpx::opencl::server::device* parent_device); + ~event_dependencies(); + + ////////////////////////////////////////////////// + /// Local public functions + /// + + // Returns a pointer to a list of cl_events. Ensure that deallocation + // of this class only happens when this pointer is not needed any more! + // + // Returns NULL if size() == 0 + cl_event* get_cl_events(); + + // Returns the number of events in this list + std::size_t size(); + + private: + std::vector events; +}; +} // namespace util +} // namespace server +} // namespace opencl +} // namespace hpx #endif diff --git a/opencl/server/util/event_map.cpp b/opencl/server/util/event_map.cpp index a4d9fdcb..0ba22236 100644 --- a/opencl/server/util/event_map.cpp +++ b/opencl/server/util/event_map.cpp @@ -10,139 +10,119 @@ using hpx::opencl::server::util::event_map; -event_map::event_map() -{ -} +event_map::event_map() {} -event_map::~event_map(){ - // Correct use removes all entries from this map before deletion - HPX_ASSERT(events.empty()); - HPX_ASSERT(waits.empty()); +event_map::~event_map() { + // Correct use removes all entries from this map before deletion + HPX_ASSERT(events.empty()); + HPX_ASSERT(waits.empty()); } +void event_map::add(const hpx::naming::id_type& gid, cl_event event) { + hpx::naming::gid_type key = gid.get_gid(); -void -event_map::add(const hpx::naming::id_type & gid, cl_event event){ - hpx::naming::gid_type key = gid.get_gid(); - - { - // Lock - std::lock_guard l(lock); + { + // Lock + std::lock_guard l(lock); - // Insert - events.insert(map_type::value_type(key, event)); - HPX_ASSERT(events.at(key) == event); + // Insert + events.insert(map_type::value_type(key, event)); + HPX_ASSERT(events.at(key) == event); - // Retrieve end delete condition variable if exists - waitmap_type::iterator it = waits.find(key); - if(it != waits.end()){ - // Notify waiting threads - it->second->notify_all(); - waits.erase(it); - } + // Retrieve end delete condition variable if exists + waitmap_type::iterator it = waits.find(key); + if (it != waits.end()) { + // Notify waiting threads + it->second->notify_all(); + waits.erase(it); } - + } } -cl_event -event_map::get(const hpx::naming::id_type& id) -{ - - return this->get(id.get_gid()); - +cl_event event_map::get(const hpx::naming::id_type& id) { + return this->get(id.get_gid()); } -cl_event -event_map::get(const hpx::naming::gid_type& key){ - - cl_event result = NULL; - { - map_type::iterator it; +cl_event event_map::get(const hpx::naming::gid_type& key) { + cl_event result = NULL; + { + map_type::iterator it; - // Lock - std::lock_guard l(lock); + // Lock + std::lock_guard l(lock); - // Try to retrieve - it = events.find(key); + // Try to retrieve + it = events.find(key); - // On success, return - if(it != events.end()){ - result = it->second; - } + // On success, return + if (it != events.end()) { + result = it->second; } + } - // Return if event found - if(result) return result; - - // On failure, try again and register callback - waitmap_type::value_type waits_entry(key, std::make_shared()); - { - map_type::iterator it; - - // Lock - std::unique_lock l(lock); + // Return if event found + if (result) return result; - // Try to retrieve - it = events.find(key); + // On failure, try again and register callback + waitmap_type::value_type waits_entry(key, std::make_shared()); + { + map_type::iterator it; - // On success, return - if(it != events.end()){ - return it->second; - } + // Lock + std::unique_lock l(lock); - // On failure, register condition variable (or retrieve existing one) - auto inserted_condvar = waits.insert(std::move(waits_entry)); + // Try to retrieve + it = events.find(key); - // Unwrap the condition variable - std::shared_ptr condition - = inserted_condvar.first->second; + // On success, return + if (it != events.end()) { + return it->second; + } - // Wait for some other thread to add() the missing key - condition->wait(l); + // On failure, register condition variable (or retrieve existing one) + auto inserted_condvar = waits.insert(std::move(waits_entry)); - // This should now definitely return the requested item. - it = events.find(key); - HPX_ASSERT(it != events.end()); + // Unwrap the condition variable + std::shared_ptr condition = inserted_condvar.first->second; - return it->second; + // Wait for some other thread to add() the missing key + condition->wait(l); - } + // This should now definitely return the requested item. + it = events.find(key); + HPX_ASSERT(it != events.end()); + return it->second; + } } -void -event_map::remove(const hpx::naming::gid_type &gid) -{ - - cl_event event; - { - // Lock - std::lock_guard l(lock); - - // Find Element - auto it = events.find(gid); - HPX_ASSERT(it != events.end()); +void event_map::remove(const hpx::naming::gid_type& gid) { + cl_event event; + { + // Lock + std::lock_guard l(lock); - // Unwrap event - event = it->second; - } + // Find Element + auto it = events.find(gid); + HPX_ASSERT(it != events.end()); - // run deletion callback - deletion_callback(event); + // Unwrap event + event = it->second; + } - { - // Lock - std::lock_guard l(lock); + // run deletion callback + deletion_callback(event); - // Remove element - events.erase(gid); - } + { + // Lock + std::lock_guard l(lock); + // Remove element + events.erase(gid); + } } -void -event_map::register_deletion_callback(std::function && callback){ - this->deletion_callback = callback; +void event_map::register_deletion_callback( + std::function&& callback) { + this->deletion_callback = callback; } - - diff --git a/opencl/server/util/event_map.hpp b/opencl/server/util/event_map.hpp index cbb9ca55..fb75f5d3 100644 --- a/opencl/server/util/event_map.hpp +++ b/opencl/server/util/event_map.hpp @@ -14,71 +14,74 @@ #include "../../cl_headers.hpp" //////////////////////////////////////////////////////////////// -namespace hpx { namespace opencl{ namespace server{ namespace util{ - - - //////////////////////////////////////////////////////// - // This class is used for the mapping between gid's and cl_events. - // - class event_map - { - typedef hpx::lcos::local::spinlock lock_type; - typedef hpx::lcos::local::condition_variable_any condition_type; - - public: - // Constructor - HPX_OPENCL_EXPORT event_map(); - HPX_OPENCL_EXPORT ~event_map(); - - ////////////////////////////////////////////////// - /// Local public functions - /// - - // Adds a new gid-cl_event pair - // !! add does not have any sequential consistency guarantee to get(). - // It might happen that get() gets called before the referencing GID - // is registered with add(). - // (=> Fancy synchronization needed inside of event_map) - HPX_OPENCL_EXPORT void add(const hpx::naming::id_type&, cl_event); - - // Retrieves the cl_event associated with the gid. - // !! BLOCKS if gid is not present until gid gets added - // with 'add()'. - HPX_OPENCL_EXPORT cl_event get(const hpx::naming::id_type&); - HPX_OPENCL_EXPORT cl_event get(const hpx::naming::gid_type&); - - // Registers a function that will get called upon gid removal - // (Used to delete associated cl_event) - HPX_OPENCL_EXPORT void register_deletion_callback(std::function &&); - - // Removes a GID. - // !! This function is the only one of this class with a consistency - // guarantee. remove() will ALWAYS be called AFTER all other calls - // involving the given GID are finished. (i.e. add() and get()) - HPX_OPENCL_EXPORT void remove(const hpx::naming::gid_type&); - - private: - /////////////////////////////////////////////// - // Private Member Variables - // - - // The actual internal datastructure - typedef std::map - map_type; - map_type events; - - // Threads that called get() and are waiting for a corresponding add() - typedef std::map > - waitmap_type; - waitmap_type waits; - - // Lock for synchronization - lock_type lock; - - // Callback function for cl_event cleanup - std::function deletion_callback; - - }; -}}}} +namespace hpx { +namespace opencl { +namespace server { +namespace util { + +//////////////////////////////////////////////////////// +// This class is used for the mapping between gid's and cl_events. +// +class event_map { + typedef hpx::lcos::local::spinlock lock_type; + typedef hpx::lcos::local::condition_variable_any condition_type; + + public: + // Constructor + HPX_OPENCL_EXPORT event_map(); + HPX_OPENCL_EXPORT ~event_map(); + + ////////////////////////////////////////////////// + /// Local public functions + /// + + // Adds a new gid-cl_event pair + // !! add does not have any sequential consistency guarantee to get(). + // It might happen that get() gets called before the referencing GID + // is registered with add(). + // (=> Fancy synchronization needed inside of event_map) + HPX_OPENCL_EXPORT void add(const hpx::naming::id_type &, cl_event); + + // Retrieves the cl_event associated with the gid. + // !! BLOCKS if gid is not present until gid gets added + // with 'add()'. + HPX_OPENCL_EXPORT cl_event get(const hpx::naming::id_type &); + HPX_OPENCL_EXPORT cl_event get(const hpx::naming::gid_type &); + + // Registers a function that will get called upon gid removal + // (Used to delete associated cl_event) + HPX_OPENCL_EXPORT void register_deletion_callback( + std::function &&); + + // Removes a GID. + // !! This function is the only one of this class with a consistency + // guarantee. remove() will ALWAYS be called AFTER all other calls + // involving the given GID are finished. (i.e. add() and get()) + HPX_OPENCL_EXPORT void remove(const hpx::naming::gid_type &); + + private: + /////////////////////////////////////////////// + // Private Member Variables + // + + // The actual internal datastructure + typedef std::map map_type; + map_type events; + + // Threads that called get() and are waiting for a corresponding add() + typedef std::map > + waitmap_type; + waitmap_type waits; + + // Lock for synchronization + lock_type lock; + + // Callback function for cl_event cleanup + std::function deletion_callback; +}; +} // namespace util +} // namespace server +} // namespace opencl +} // namespace hpx #endif diff --git a/opencl/server/util/hpx_cl_interop.cpp b/opencl/server/util/hpx_cl_interop.cpp index c56485b5..99b9ea88 100644 --- a/opencl/server/util/hpx_cl_interop.cpp +++ b/opencl/server/util/hpx_cl_interop.cpp @@ -14,80 +14,66 @@ static std::atomic opencl_thread_num(0); // This function triggers an hpx::lcos::local::event from an external thread -void -hpx::opencl::server::util -::set_promise_from_external( hpx::runtime * rt, - hpx::lcos::local::promise * promise, - cl_int value ) -{ - - // If we are on an hpx thread we don't need any special treatment - if(rt->get_thread_name() != "") - { - promise->set_value(value); - return; - } - - // if we're on an OS thread, register it temporarily. - // add the thread id to its name, as there could potentially - // be multiple OpenCL threads in this function at the same time - rt->register_thread("opencl", - opencl_thread_num.fetch_add(1, - std::memory_order_relaxed), - false); - //BOOST_ASSERT(succeeded); - - // trigger the event lock +void hpx::opencl::server::util ::set_promise_from_external( + hpx::runtime* rt, hpx::lcos::local::promise* promise, + cl_int value) { + // If we are on an hpx thread we don't need any special treatment + if (rt->get_thread_name() != "") { promise->set_value(value); - - // unregister the thread from hpx as we don't have any control over it - // any more. ever. (probably) - // /* this line is currently commented out. - // * is unregistering necessary? - // * there should be huge speed improvements if we don't unregister. - // * although it would be a potential memory leak. - // * but it would be kind of a memory leak as well, if we register - // * every single callback-call on a different thread name ... - // */ - //rt->unregister_thread(); - + return; + } + + // if we're on an OS thread, register it temporarily. + // add the thread id to its name, as there could potentially + // be multiple OpenCL threads in this function at the same time + rt->register_thread("opencl", + opencl_thread_num.fetch_add(1, std::memory_order_relaxed), + false); + // BOOST_ASSERT(succeeded); + + // trigger the event lock + promise->set_value(value); + + // unregister the thread from hpx as we don't have any control over it + // any more. ever. (probably) + // /* this line is currently commented out. + // * is unregistering necessary? + // * there should be huge speed improvements if we don't unregister. + // * although it would be a potential memory leak. + // * but it would be kind of a memory leak as well, if we register + // * every single callback-call on a different thread name ... + // */ + // rt->unregister_thread(); } // This function triggers an hpx::lcos::local::event from an external thread -void -hpx::opencl::server::util -::set_promise_from_external( hpx::runtime * rt, - hpx::lcos::local::promise * promise ) -{ - - // If we are on an hpx thread we don't need any special treatment - if(rt->get_thread_name() != "") - { - promise->set_value(); - return; - } - - // if we're on an OS thread, register it temporarily. - // add the thread id to its name, as there could potentially - // be multiple OpenCL threads in this function at the same time - rt->register_thread("opencl", - opencl_thread_num.fetch_add(1, - std::memory_order_relaxed), - false); - //BOOST_ASSERT(succeeded); - - // trigger the event lock +void hpx::opencl::server::util ::set_promise_from_external( + hpx::runtime* rt, hpx::lcos::local::promise* promise) { + // If we are on an hpx thread we don't need any special treatment + if (rt->get_thread_name() != "") { promise->set_value(); - - // unregister the thread from hpx as we don't have any control over it - // any more. ever. (probably) - // /* this line is currently commented out. - // * is unregistering necessary? - // * there should be huge speed improvements if we don't unregister. - // * although it would be a potential memory leak. - // * but it would be kind of a memory leak as well, if we register - // * every single callback-call on a different thread name ... - // */ - //rt->unregister_thread(); - + return; + } + + // if we're on an OS thread, register it temporarily. + // add the thread id to its name, as there could potentially + // be multiple OpenCL threads in this function at the same time + rt->register_thread("opencl", + opencl_thread_num.fetch_add(1, std::memory_order_relaxed), + false); + // BOOST_ASSERT(succeeded); + + // trigger the event lock + promise->set_value(); + + // unregister the thread from hpx as we don't have any control over it + // any more. ever. (probably) + // /* this line is currently commented out. + // * is unregistering necessary? + // * there should be huge speed improvements if we don't unregister. + // * although it would be a potential memory leak. + // * but it would be kind of a memory leak as well, if we register + // * every single callback-call on a different thread name ... + // */ + // rt->unregister_thread(); } diff --git a/opencl/server/util/hpx_cl_interop.hpp b/opencl/server/util/hpx_cl_interop.hpp index e40cb91b..2fe18e7f 100644 --- a/opencl/server/util/hpx_cl_interop.hpp +++ b/opencl/server/util/hpx_cl_interop.hpp @@ -9,16 +9,21 @@ #include "../../cl_headers.hpp" -namespace hpx { namespace opencl { namespace server { namespace util { +namespace hpx { +namespace opencl { +namespace server { +namespace util { // This function triggers an hpx::lcos::local::event from an external thread -void set_promise_from_external( hpx::runtime * rt, - hpx::lcos::local::promise * promise, - cl_int value ); +void set_promise_from_external(hpx::runtime* rt, + hpx::lcos::local::promise* promise, + cl_int value); // This function triggers an hpx::lcos::local::event from an external thread -void set_promise_from_external( hpx::runtime * rt, - hpx::lcos::local::promise * promise ); +void set_promise_from_external(hpx::runtime* rt, + hpx::lcos::local::promise* promise); - -}}}} +} // namespace util +} // namespace server +} // namespace opencl +} // namespace hpx diff --git a/opencl/server/util/server_definitions.hpp b/opencl/server/util/server_definitions.hpp index 4f455bca..fb98d62f 100644 --- a/opencl/server/util/server_definitions.hpp +++ b/opencl/server/util/server_definitions.hpp @@ -7,29 +7,26 @@ #ifndef HPX_OPENCL_SERVER_UTIL_SERVER_DEFINITIONS_HPP_ #define HPX_OPENCL_SERVER_UTIL_SERVER_DEFINITIONS_HPP_ -#define HPX_OPENCL_REGISTER_ACTION_DECLARATION(component_name, action_name) \ - HPX_ACTION_USES_MEDIUM_STACK( \ - hpx::opencl::server::component_name::action_name##_action); \ - HPX_REGISTER_ACTION_DECLARATION( \ - hpx::opencl::server::component_name::action_name##_action, \ - hpx_opencl_##component_name##_##action_name##_action) - -#define HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(component_name, action_name)\ -namespace hpx { namespace traits \ -{ \ - template \ - struct action_stacksize< \ - hpx::opencl::server::component_name::action_name##_action, \ - typename util::always_void< \ - typename \ - hpx::opencl::server::component_name::action_name##_action::type \ - >::type \ - > \ - { \ - enum { value = hpx::threads::thread_stacksize_medium }; \ - }; \ -}} - +#define HPX_OPENCL_REGISTER_ACTION_DECLARATION(component_name, action_name) \ + HPX_ACTION_USES_MEDIUM_STACK( \ + hpx::opencl::server::component_name::action_name##_action); \ + HPX_REGISTER_ACTION_DECLARATION( \ + hpx::opencl::server::component_name::action_name##_action, \ + hpx_opencl_##component_name##_##action_name##_action) +#define HPX_OPENCL_TEMPLATE_ACTION_USES_MEDIUM_STACK(component_name, \ + action_name) \ + namespace hpx { \ + namespace traits { \ + template \ + struct action_stacksize< \ + hpx::opencl::server::component_name::action_name##_action, \ + typename util::always_void< \ + typename hpx::opencl::server::component_name::action_name##_action< \ + T>::type>::type> { \ + enum { value = hpx::threads::thread_stacksize_medium }; \ + }; \ + } \ + } #endif diff --git a/opencl/tools.cpp b/opencl/tools.cpp index 1b575ef3..ae55d5ce 100644 --- a/opencl/tools.cpp +++ b/opencl/tools.cpp @@ -9,95 +9,152 @@ // Dependencies #include -namespace hpx { namespace opencl { namespace tools { +namespace hpx { +namespace opencl { +namespace tools { -bool runs_on_medium_stack() -{ +bool runs_on_medium_stack() { + // Get current stack size + std::size_t current_stack_size = hpx::threads::get_ctx_ptr()->get_stacksize(); - // Get current stack size - std::size_t current_stack_size = hpx::threads::get_ctx_ptr()->get_stacksize(); + // Get large stack size + std::size_t medium_stack_size = + hpx::get_runtime().get_config().get_stack_size( + hpx::threads::thread_stacksize_medium); - // Get large stack size - std::size_t medium_stack_size = - hpx::get_runtime().get_config().get_stack_size( - hpx::threads::thread_stacksize_medium); - - /* - hpx::cout << "Stack: " << std::hex << current_stack_size << " / " - << std::hex << large_stack_size << hpx::endl; - */ - - return current_stack_size == medium_stack_size; + /* + hpx::cout << "Stack: " << std::hex << current_stack_size << " / " + << std::hex << large_stack_size << hpx::endl; + */ + return current_stack_size == medium_stack_size; } -const char* cl_err_to_str(cl_int errCode) -{ - switch(errCode) - { - case 0 : return "CL_SUCCESS"; - case -1 : return "CL_DEVICE_NOT_FOUND"; - case -2 : return "CL_DEVICE_NOT_AVAILABLE"; - case -3 : return "CL_COMPILER_NOT_AVAILABLE"; - case -4 : return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; - case -5 : return "CL_OUT_OF_RESOURCES"; - case -6 : return "CL_OUT_OF_HOST_MEMORY"; - case -7 : return "CL_PROFILING_INFO_NOT_AVAILABLE"; - case -8 : return "CL_MEM_COPY_OVERLAP"; - case -9 : return "CL_IMAGE_FORMAT_MISMATCH"; - case -10: return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; - case -11: return "CL_BUILD_PROGRAM_FAILURE"; - case -12: return "CL_MAP_FAILURE"; - case -13: return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; - case -14: return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; - case -15: return "CL_COMPILE_PROGRAM_FAILURE"; - case -16: return "CL_LINKER_NOT_AVAILABLE"; - case -17: return "CL_LINK_PROGRAM_FAILURE"; - case -18: return "CL_DEVICE_PARTITION_FAILED"; - case -19: return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; - case -30: return "CL_INVALID_VALUE"; - case -31: return "CL_INVALID_DEVICE_TYPE"; - case -32: return "CL_INVALID_PLATFORM"; - case -33: return "CL_INVALID_DEVICE"; - case -34: return "CL_INVALID_CONTEXT"; - case -35: return "CL_INVALID_QUEUE_PROPERTIES"; - case -36: return "CL_INVALID_COMMAND_QUEUE"; - case -37: return "CL_INVALID_HOST_PTR"; - case -38: return "CL_INVALID_MEM_OBJECT"; - case -39: return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; - case -40: return "CL_INVALID_IMAGE_SIZE"; - case -41: return "CL_INVALID_SAMPLER"; - case -42: return "CL_INVALID_BINARY"; - case -43: return "CL_INVALID_BUILD_OPTIONS"; - case -44: return "CL_INVALID_PROGRAM"; - case -45: return "CL_INVALID_PROGRAM_EXECUTABLE"; - case -46: return "CL_INVALID_KERNEL_NAME"; - case -47: return "CL_INVALID_KERNEL_DEFINITION"; - case -48: return "CL_INVALID_KERNEL"; - case -49: return "CL_INVALID_ARG_INDEX"; - case -50: return "CL_INVALID_ARG_VALUE"; - case -51: return "CL_INVALID_ARG_SIZE"; - case -52: return "CL_INVALID_KERNEL_ARGS"; - case -53: return "CL_INVALID_WORK_DIMENSION"; - case -54: return "CL_INVALID_WORK_GROUP_SIZE"; - case -55: return "CL_INVALID_WORK_ITEM_SIZE"; - case -56: return "CL_INVALID_GLOBAL_OFFSET"; - case -57: return "CL_INVALID_EVENT_WAIT_LIST"; - case -58: return "CL_INVALID_EVENT"; - case -59: return "CL_INVALID_OPERATION "; - case -60: return "CL_INVALID_GL_OBJECT"; - case -61: return "CL_INVALID_BUFFER_SIZE"; - case -62: return "CL_INVALID_MIP_LEVEL"; - case -63: return "CL_INVALID_GLOBAL_WORK_SIZE"; - case -64: return "CL_INVALID_PROPERTY"; - case -65: return "CL_INVALID_IMAGE_DESCRIPTOR"; - case -66: return "CL_INVALID_COMPILER_OPTIONS"; - case -67: return "CL_INVALID_LINKER_OPTIONS"; - case -68: return "CL_INVALID_DEVICE_PARTITION_COUNT"; - default : return "UNKNOWN ERROR"; - } +const char* cl_err_to_str(cl_int errCode) { + switch (errCode) { + case 0: + return "CL_SUCCESS"; + case -1: + return "CL_DEVICE_NOT_FOUND"; + case -2: + return "CL_DEVICE_NOT_AVAILABLE"; + case -3: + return "CL_COMPILER_NOT_AVAILABLE"; + case -4: + return "CL_MEM_OBJECT_ALLOCATION_FAILURE"; + case -5: + return "CL_OUT_OF_RESOURCES"; + case -6: + return "CL_OUT_OF_HOST_MEMORY"; + case -7: + return "CL_PROFILING_INFO_NOT_AVAILABLE"; + case -8: + return "CL_MEM_COPY_OVERLAP"; + case -9: + return "CL_IMAGE_FORMAT_MISMATCH"; + case -10: + return "CL_IMAGE_FORMAT_NOT_SUPPORTED"; + case -11: + return "CL_BUILD_PROGRAM_FAILURE"; + case -12: + return "CL_MAP_FAILURE"; + case -13: + return "CL_MISALIGNED_SUB_BUFFER_OFFSET"; + case -14: + return "CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST"; + case -15: + return "CL_COMPILE_PROGRAM_FAILURE"; + case -16: + return "CL_LINKER_NOT_AVAILABLE"; + case -17: + return "CL_LINK_PROGRAM_FAILURE"; + case -18: + return "CL_DEVICE_PARTITION_FAILED"; + case -19: + return "CL_KERNEL_ARG_INFO_NOT_AVAILABLE"; + case -30: + return "CL_INVALID_VALUE"; + case -31: + return "CL_INVALID_DEVICE_TYPE"; + case -32: + return "CL_INVALID_PLATFORM"; + case -33: + return "CL_INVALID_DEVICE"; + case -34: + return "CL_INVALID_CONTEXT"; + case -35: + return "CL_INVALID_QUEUE_PROPERTIES"; + case -36: + return "CL_INVALID_COMMAND_QUEUE"; + case -37: + return "CL_INVALID_HOST_PTR"; + case -38: + return "CL_INVALID_MEM_OBJECT"; + case -39: + return "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR"; + case -40: + return "CL_INVALID_IMAGE_SIZE"; + case -41: + return "CL_INVALID_SAMPLER"; + case -42: + return "CL_INVALID_BINARY"; + case -43: + return "CL_INVALID_BUILD_OPTIONS"; + case -44: + return "CL_INVALID_PROGRAM"; + case -45: + return "CL_INVALID_PROGRAM_EXECUTABLE"; + case -46: + return "CL_INVALID_KERNEL_NAME"; + case -47: + return "CL_INVALID_KERNEL_DEFINITION"; + case -48: + return "CL_INVALID_KERNEL"; + case -49: + return "CL_INVALID_ARG_INDEX"; + case -50: + return "CL_INVALID_ARG_VALUE"; + case -51: + return "CL_INVALID_ARG_SIZE"; + case -52: + return "CL_INVALID_KERNEL_ARGS"; + case -53: + return "CL_INVALID_WORK_DIMENSION"; + case -54: + return "CL_INVALID_WORK_GROUP_SIZE"; + case -55: + return "CL_INVALID_WORK_ITEM_SIZE"; + case -56: + return "CL_INVALID_GLOBAL_OFFSET"; + case -57: + return "CL_INVALID_EVENT_WAIT_LIST"; + case -58: + return "CL_INVALID_EVENT"; + case -59: + return "CL_INVALID_OPERATION "; + case -60: + return "CL_INVALID_GL_OBJECT"; + case -61: + return "CL_INVALID_BUFFER_SIZE"; + case -62: + return "CL_INVALID_MIP_LEVEL"; + case -63: + return "CL_INVALID_GLOBAL_WORK_SIZE"; + case -64: + return "CL_INVALID_PROPERTY"; + case -65: + return "CL_INVALID_IMAGE_DESCRIPTOR"; + case -66: + return "CL_INVALID_COMPILER_OPTIONS"; + case -67: + return "CL_INVALID_LINKER_OPTIONS"; + case -68: + return "CL_INVALID_DEVICE_PARTITION_COUNT"; + default: + return "UNKNOWN ERROR"; + } } - -}}} - +} // namespace tools +} // namespace opencl +} // namespace hpx diff --git a/opencl/tools.hpp b/opencl/tools.hpp index a24e7667..0561057f 100644 --- a/opencl/tools.hpp +++ b/opencl/tools.hpp @@ -20,53 +20,47 @@ #error "OpenCL 1.1 required!" #endif -namespace hpx { namespace opencl { namespace tools { - - // Used to disable the empty constructor of classes - #define CL_FORBID_EMPTY_CONSTRUCTOR(classname) \ - classname::classname() \ - { \ - HPX_THROW_EXCEPTION(hpx::no_success, #classname "()", \ - "Empty constructor is not defined!"); \ - } - - // To be called on OpenCL errorcodes, throws an exception on OpenCL Error - #define cl_ensure(errCode, functionname){ \ - if(errCode != CL_SUCCESS) \ - { \ - std::stringstream errorMessage; \ - errorMessage << "CL_ERROR(" \ - << (errCode) \ - << "): " \ - << hpx::opencl::tools::cl_err_to_str(errCode);\ - HPX_THROW_EXCEPTION(hpx::no_success, \ - (functionname), \ - errorMessage.str()); \ - } \ - } - - // To be called on OpenCL errorcodes in destructors, does not throw - #define cl_ensure_nothrow(errCode, functionname){ \ - if(errCode != CL_SUCCESS) \ - { \ - hpx::cerr << (functionname) \ - << ": CL_ERROR(" \ - << (errCode) \ - << "): " \ - << hpx::opencl::tools::cl_err_to_str(errCode) \ - << hpx::endl; \ - } \ - } - - - // Translates CL errorcode to descriptive string - HPX_OPENCL_EXPORT const char* cl_err_to_str(cl_int errCode); - - // Returns true if curren thread runs on a large stack - HPX_OPENCL_EXPORT bool runs_on_medium_stack(); - -}}} - -#endif//HPX_OPENCL_TOOLS_HPP_ - - +namespace hpx { +namespace opencl { +namespace tools { + +// Used to disable the empty constructor of classes +#define CL_FORBID_EMPTY_CONSTRUCTOR(classname) \ + classname::classname() { \ + HPX_THROW_EXCEPTION(hpx::no_success, #classname "()", \ + "Empty constructor is not defined!"); \ + } + +// To be called on OpenCL errorcodes, throws an exception on OpenCL Error +#define cl_ensure(errCode, functionname) \ + { \ + if (errCode != CL_SUCCESS) { \ + std::stringstream errorMessage; \ + errorMessage << "CL_ERROR(" << (errCode) \ + << "): " << hpx::opencl::tools::cl_err_to_str(errCode); \ + HPX_THROW_EXCEPTION(hpx::no_success, (functionname), \ + errorMessage.str()); \ + } \ + } + +// To be called on OpenCL errorcodes in destructors, does not throw +#define cl_ensure_nothrow(errCode, functionname) \ + { \ + if (errCode != CL_SUCCESS) { \ + hpx::cerr << (functionname) << ": CL_ERROR(" << (errCode) \ + << "): " << hpx::opencl::tools::cl_err_to_str(errCode) \ + << hpx::endl; \ + } \ + } + +// Translates CL errorcode to descriptive string +HPX_OPENCL_EXPORT const char* cl_err_to_str(cl_int errCode); + +// Returns true if curren thread runs on a large stack +HPX_OPENCL_EXPORT bool runs_on_medium_stack(); + +} // namespace tools +} // namespace opencl +} // namespace hpx + +#endif // HPX_OPENCL_TOOLS_HPP_ diff --git a/opencl/util/enqueue_overloads.cpp b/opencl/util/enqueue_overloads.cpp index d29a1ef6..96814bca 100644 --- a/opencl/util/enqueue_overloads.cpp +++ b/opencl/util/enqueue_overloads.cpp @@ -5,9 +5,6 @@ #include "enqueue_overloads.hpp" -void -hpx::opencl::util::enqueue_overloads::resolver_impl( - hpx::naming::gid_type device_id, - std::vector&, - std::vector&){ -}; +void hpx::opencl::util::enqueue_overloads::resolver_impl( + hpx::naming::gid_type device_id, std::vector&, + std::vector&){}; diff --git a/opencl/util/enqueue_overloads.hpp b/opencl/util/enqueue_overloads.hpp index 76f7f87d..75205360 100644 --- a/opencl/util/enqueue_overloads.hpp +++ b/opencl/util/enqueue_overloads.hpp @@ -14,166 +14,148 @@ #include "../lcos/event.hpp" -namespace hpx { namespace opencl { namespace util -{ - struct resolved_events - { - public: - std::vector event_ids; - std::vector device_ids; - bool are_from_device(const hpx::naming::id_type& device_id) - { - hpx::naming::gid_type device_gid = device_id.get_gid(); - for(const auto& id : device_ids){ - if(device_gid != id) - return false; - } - return true; - } - bool are_from_devices( const hpx::naming::id_type& device1, - const hpx::naming::id_type& device2 ) - { - hpx::naming::gid_type device_gid1 = device1.get_gid(); - hpx::naming::gid_type device_gid2 = device2.get_gid(); - for(const auto& id : device_ids){ - if((device_gid1 != id) && (device_gid2 != id)) - return false; - } - return true; - } - }; -}}} - -namespace hpx { namespace opencl { namespace util { namespace enqueue_overloads -{ - // TODO implement check for correct device - // This is the function that actually extrudes the GID from the futures. - template - hpx::naming::id_type - extrude_id(const Future & fut, hpx::naming::gid_type& device_id) - { - typedef typename std::remove_reference::type::result_type - result_type; - typedef typename hpx::opencl::lcos::event::shared_state_type - event_type; - - auto shared_state = hpx::traits::detail::get_shared_state(fut); - - HPX_ASSERT(boost::dynamic_pointer_cast(shared_state).get()); - auto ev = boost::static_pointer_cast(shared_state); - - std::stringstream str; - str << "devide_id=" << device_id << " and ev_id=" << ev->get_device_gid(); - HPX_ASSERT_MSG(device_id == ev->get_device_gid(), str.str().c_str()); - - auto event_id = ev->get_event_id(); - return event_id; +namespace hpx { +namespace opencl { +namespace util { +struct resolved_events { + public: + std::vector event_ids; + std::vector device_ids; + bool are_from_device(const hpx::naming::id_type& device_id) { + hpx::naming::gid_type device_gid = device_id.get_gid(); + for (const auto& id : device_ids) { + if (device_gid != id) return false; } - - namespace detail - { - BOOST_MPL_HAS_XXX_TRAIT_DEF(value_type) - BOOST_MPL_HAS_XXX_TRAIT_DEF(iterator) - BOOST_MPL_HAS_XXX_TRAIT_DEF(size_type) - BOOST_MPL_HAS_XXX_TRAIT_DEF(reference) - - template - struct is_container - : boost::mpl::bool_< - has_value_type::value && has_iterator::value && - has_size_type::value && has_reference::value> - {}; - - template - struct is_container - : is_container - {}; - } - - // This function object switches its implementation depending on whether - // the given value is a container or not - template - struct extrude_all_ids - { - }; - - template<> - struct extrude_all_ids - { - template - void - operator()(hpx::naming::gid_type device_id,const T & t, - std::vector &event_ids, - std::vector &device_ids) const - { - //hpx::naming::gid_type device_id; - event_ids.push_back(std::move(extrude_id(t, device_id))); - device_ids.push_back(device_id); - } - }; - - template<> - struct extrude_all_ids - { - template - void - operator()(hpx::naming::gid_type device_id,const std::vector & t_vec, - std::vector &event_ids, - std::vector &device_ids) const - { - for(const T & t : t_vec){ - //hpx::naming::gid_type device_id; - event_ids.push_back(std::move(extrude_id(t, device_id))); - device_ids.push_back(device_id); - } - } - }; - - - // The resolver recursive template functions are here to convert - // an arbitrary number of future and std::vector to - // one single std::vector. - HPX_OPENCL_EXPORT void - resolver_impl(hpx::naming::gid_type device_id,std::vector&, - std::vector&); - - template - void - resolver_impl(hpx::naming::gid_type device_id,std::vector& event_ids, - std::vector& device_ids, - Dep&& dep) - { - extrude_all_ids::value>()(device_id, dep, event_ids, - device_ids); + return true; + } + bool are_from_devices(const hpx::naming::id_type& device1, + const hpx::naming::id_type& device2) { + hpx::naming::gid_type device_gid1 = device1.get_gid(); + hpx::naming::gid_type device_gid2 = device2.get_gid(); + for (const auto& id : device_ids) { + if ((device_gid1 != id) && (device_gid2 != id)) return false; } - - template - void - resolver_impl(hpx::naming::gid_type device_id,std::vector& event_ids, - std::vector& device_ids, - Dep&& dep, Deps&&... deps) - { - // process current dep - extrude_all_ids::value>()(device_id, dep, event_ids, - device_ids ); - - // recursive call - resolver_impl(device_id,event_ids, device_ids, std::forward(deps)...); + return true; + } +}; +} // namespace util +} // namespace opencl +} // namespace hpx + +namespace hpx { +namespace opencl { +namespace util { +namespace enqueue_overloads { +// TODO implement check for correct device +// This is the function that actually extrudes the GID from the futures. +template +hpx::naming::id_type extrude_id(const Future& fut, + hpx::naming::gid_type& device_id) { + typedef typename std::remove_reference::type::result_type result_type; + typedef typename hpx::opencl::lcos::event::shared_state_type + event_type; + + auto shared_state = hpx::traits::detail::get_shared_state(fut); + + HPX_ASSERT(boost::dynamic_pointer_cast(shared_state).get()); + auto ev = boost::static_pointer_cast(shared_state); + + std::stringstream str; + str << "devide_id=" << device_id << " and ev_id=" << ev->get_device_gid(); + HPX_ASSERT_MSG(device_id == ev->get_device_gid(), str.str().c_str()); + + auto event_id = ev->get_event_id(); + return event_id; +} + +namespace detail { +BOOST_MPL_HAS_XXX_TRAIT_DEF(value_type) +BOOST_MPL_HAS_XXX_TRAIT_DEF(iterator) +BOOST_MPL_HAS_XXX_TRAIT_DEF(size_type) +BOOST_MPL_HAS_XXX_TRAIT_DEF(reference) + +template +struct is_container + : boost::mpl::bool_::value && has_iterator::value && + has_size_type::value && has_reference::value> {}; + +template +struct is_container : is_container {}; +} // namespace detail + +// This function object switches its implementation depending on whether +// the given value is a container or not +template +struct extrude_all_ids {}; + +template <> +struct extrude_all_ids { + template + void operator()(hpx::naming::gid_type device_id, const T& t, + std::vector& event_ids, + std::vector& device_ids) const { + // hpx::naming::gid_type device_id; + event_ids.push_back(std::move(extrude_id(t, device_id))); + device_ids.push_back(device_id); + } +}; + +template <> +struct extrude_all_ids { + template + void operator()(hpx::naming::gid_type device_id, const std::vector& t_vec, + std::vector& event_ids, + std::vector& device_ids) const { + for (const T& t : t_vec) { + // hpx::naming::gid_type device_id; + event_ids.push_back(std::move(extrude_id(t, device_id))); + device_ids.push_back(device_id); } - - template - resolved_events - resolver(hpx::naming::gid_type device_id,Deps&&... deps) - { - resolved_events res; - res.event_ids.reserve(sizeof...(deps)); - res.device_ids.reserve(sizeof...(deps)); - resolver_impl(device_id, res.event_ids, res.device_ids, - std::forward(deps)... ); - return res; - } - -}}}} + } +}; + +// The resolver recursive template functions are here to convert +// an arbitrary number of future and std::vector to +// one single std::vector. +HPX_OPENCL_EXPORT void resolver_impl(hpx::naming::gid_type device_id, + std::vector&, + std::vector&); + +template +void resolver_impl(hpx::naming::gid_type device_id, + std::vector& event_ids, + std::vector& device_ids, Dep&& dep) { + extrude_all_ids::value>()(device_id, dep, event_ids, + device_ids); +} + +template +void resolver_impl(hpx::naming::gid_type device_id, + std::vector& event_ids, + std::vector& device_ids, Dep&& dep, + Deps&&... deps) { + // process current dep + extrude_all_ids::value>()(device_id, dep, event_ids, + device_ids); + + // recursive call + resolver_impl(device_id, event_ids, device_ids, std::forward(deps)...); +} + +template +resolved_events resolver(hpx::naming::gid_type device_id, Deps&&... deps) { + resolved_events res; + res.event_ids.reserve(sizeof...(deps)); + res.device_ids.reserve(sizeof...(deps)); + resolver_impl(device_id, res.event_ids, res.device_ids, + std::forward(deps)...); + return res; +} + +} // namespace enqueue_overloads +} // namespace util +} // namespace opencl +} // namespace hpx // #define HPX_OPENCL_GENERATE_ENQUEUE_OVERLOADS(return_value, name, ...) \ // \ diff --git a/opencl/util/generic_buffer.cpp b/opencl/util/generic_buffer.cpp index 2b6bc32e..c73856a1 100644 --- a/opencl/util/generic_buffer.cpp +++ b/opencl/util/generic_buffer.cpp @@ -9,28 +9,20 @@ using hpx::opencl::util::generic_buffer; using hpx::opencl::util::detail::generic_buffer_impl; - -hpx::future -generic_buffer_impl::get(data_type && data) -{ - return data.then( - [] (data_type && data) -> std::string - { - hpx::serialization::serialize_buffer char_array = - data.get(); - - // Calculate length of string. Cut short if it has a 0-Termination - // (Some queries like CL_DEVICE_NAME always return a size of 64, but - // contain a 0-terminated string) - std::size_t length = 0; - while(length < char_array.size()) - { - if(char_array[length] == '\0') break; - length++; - } - - return std::string(char_array.data(), char_array.data() + length); - }); +hpx::future generic_buffer_impl::get( + data_type&& data) { + return data.then([](data_type&& data) -> std::string { + hpx::serialization::serialize_buffer char_array = data.get(); + + // Calculate length of string. Cut short if it has a 0-Termination + // (Some queries like CL_DEVICE_NAME always return a size of 64, but + // contain a 0-terminated string) + std::size_t length = 0; + while (length < char_array.size()) { + if (char_array[length] == '\0') break; + length++; + } + + return std::string(char_array.data(), char_array.data() + length); + }); } - - diff --git a/opencl/util/generic_buffer.hpp b/opencl/util/generic_buffer.hpp index 124c0659..6310cb63 100644 --- a/opencl/util/generic_buffer.hpp +++ b/opencl/util/generic_buffer.hpp @@ -18,105 +18,89 @@ namespace hpx { namespace opencl { namespace util { - namespace detail{ - - template - struct generic_buffer_impl{ - typedef hpx::future > - data_type; - public: - static hpx::future - get(data_type && data){ - return data.then( - [] (data_type && data) -> T - { - hpx::serialization::serialize_buffer - raw_data = data.get(); - - // Compare lengths - HPX_ASSERT(sizeof(T) == raw_data.size()); - - return * reinterpret_cast(raw_data.data()); - }); - }; - }; - - template - struct generic_buffer_impl>{ - typedef hpx::future > - data_type; - public: - static hpx::future> - get(data_type && data){ - return data.then( - [] (data_type && data) -> std::vector - { - - hpx::serialization::serialize_buffer - raw_data = data.get(); - - // Compute number of elements - std::size_t num_elements = raw_data.size() / sizeof(T); - - // Initialize result vector - std::vector result; - result.reserve(num_elements); - - // Fill result vector - for(std::size_t i = 0; i + sizeof(T) <= raw_data.size(); - i+=sizeof(T)) - { - result.push_back( - *reinterpret_cast(&raw_data.data()[i]) ); - } - - /* Compare lengths */ - HPX_ASSERT(result.size() == num_elements); - - return result; - }); - }; - }; - - template<> - struct generic_buffer_impl{ - typedef hpx::future > - data_type; - public: - HPX_OPENCL_EXPORT static hpx::future - get(data_type && data); - }; - }; - - ///////////////////////////////////////// - /// @brief An accelerator device. - /// - class HPX_OPENCL_EXPORT generic_buffer - { - typedef hpx::future > - data_type; - public: - generic_buffer(data_type && data_) : data(std::move(data_)){} - - /** - * @brief Converts the info to a generic datatype. - * - * @return The converted result - */ - template - hpx::future get() - { - return detail::generic_buffer_impl::get(std::move(data)); - } - - private: - data_type data; - - }; - -}}} - - -#endif// HPX_OPENCL_UTIL_GENERIC_BUFFER_HPP_ - - +namespace detail { + +template +struct generic_buffer_impl { + typedef hpx::future> data_type; + + public: + static hpx::future get(data_type&& data) { + return data.then([](data_type&& data) -> T { + hpx::serialization::serialize_buffer raw_data = data.get(); + + // Compare lengths + HPX_ASSERT(sizeof(T) == raw_data.size()); + + return *reinterpret_cast(raw_data.data()); + }); + }; +}; + +template +struct generic_buffer_impl> { + typedef hpx::future> data_type; + + public: + static hpx::future> get(data_type&& data) { + return data.then([](data_type&& data) -> std::vector { + hpx::serialization::serialize_buffer raw_data = data.get(); + + // Compute number of elements + std::size_t num_elements = raw_data.size() / sizeof(T); + + // Initialize result vector + std::vector result; + result.reserve(num_elements); + + // Fill result vector + for (std::size_t i = 0; i + sizeof(T) <= raw_data.size(); + i += sizeof(T)) { + result.push_back(*reinterpret_cast(&raw_data.data()[i])); + } + + /* Compare lengths */ + HPX_ASSERT(result.size() == num_elements); + + return result; + }); + }; +}; + +template <> +struct generic_buffer_impl { + typedef hpx::future> data_type; + + public: + HPX_OPENCL_EXPORT static hpx::future get(data_type&& data); +}; +}; // namespace detail + +///////////////////////////////////////// +/// @brief An accelerator device. +/// +class HPX_OPENCL_EXPORT generic_buffer { + typedef hpx::future> data_type; + + public: + generic_buffer(data_type&& data_) : data(std::move(data_)) {} + + /** + * @brief Converts the info to a generic datatype. + * + * @return The converted result + */ + template + hpx::future get() { + return detail::generic_buffer_impl::get(std::move(data)); + } + + private: + data_type data; +}; + +} // namespace util +} // namespace opencl +} // namespace hpx + +#endif // HPX_OPENCL_UTIL_GENERIC_BUFFER_HPP_ diff --git a/opencl/util/rect_props.hpp b/opencl/util/rect_props.hpp index 47713c99..7d79c137 100644 --- a/opencl/util/rect_props.hpp +++ b/opencl/util/rect_props.hpp @@ -17,137 +17,120 @@ namespace hpx { namespace opencl { - ////////////////////////////////////// - /// @brief Metadata vector for _rect copy operations - /// - /// This structure is used for Rect data copy functions. - /// - struct rect_props - { - public: - std::size_t src_x; - std::size_t src_y; - std::size_t src_z; - std::size_t dst_x; - std::size_t dst_y; - std::size_t dst_z; - std::size_t size_x; - std::size_t size_y; - std::size_t size_z; - std::size_t src_stride_y; - std::size_t src_stride_z; - std::size_t dst_stride_y; - std::size_t dst_stride_z; +////////////////////////////////////// +/// @brief Metadata vector for _rect copy operations +/// +/// This structure is used for Rect data copy functions. +/// +struct rect_props { + public: + std::size_t src_x; + std::size_t src_y; + std::size_t src_z; + std::size_t dst_x; + std::size_t dst_y; + std::size_t dst_z; + std::size_t size_x; + std::size_t size_y; + std::size_t size_z; + std::size_t src_stride_y; + std::size_t src_stride_z; + std::size_t dst_stride_y; + std::size_t dst_stride_z; - rect_props( std::size_t src_x_, - std::size_t src_y_, - std::size_t src_z_, - std::size_t dst_x_, - std::size_t dst_y_, - std::size_t dst_z_, - std::size_t size_x_, - std::size_t size_y_, - std::size_t size_z_, - std::size_t src_stride_y_, - std::size_t src_stride_z_, - std::size_t dst_stride_y_, - std::size_t dst_stride_z_ ) - : src_x(src_x_), - src_y(src_y_), - src_z(src_z_), - dst_x(dst_x_), - dst_y(dst_y_), - dst_z(dst_z_), - size_x(size_x_), - size_y(size_y_), - size_z(size_z_), - src_stride_y(src_stride_y_), - src_stride_z(src_stride_z_), - dst_stride_y(dst_stride_y_), - dst_stride_z(dst_stride_z_){} - rect_props( std::size_t src_x_, - std::size_t src_y_, - std::size_t dst_x_, - std::size_t dst_y_, - std::size_t size_x_, - std::size_t size_y_, - std::size_t src_stride_y_, - std::size_t dst_stride_y_) - : src_x(src_x_), - src_y(src_y_), - src_z(0), - dst_x(dst_x_), - dst_y(dst_y_), - dst_z(0), - size_x(size_x_), - size_y(size_y_), - size_z(1), - src_stride_y(src_stride_y_), - src_stride_z(0), - dst_stride_y(dst_stride_y_), - dst_stride_z(0){} - rect_props() - : src_x(0), - src_y(0), - src_z(0), - dst_x(0), - dst_y(0), - dst_z(0), - size_x(1), - size_y(1), - size_z(1), - src_stride_y(0), - src_stride_z(0), - dst_stride_y(0), - dst_stride_z(0){} - private: - // serialization support - friend class hpx::serialization::access; + rect_props(std::size_t src_x_, std::size_t src_y_, std::size_t src_z_, + std::size_t dst_x_, std::size_t dst_y_, std::size_t dst_z_, + std::size_t size_x_, std::size_t size_y_, std::size_t size_z_, + std::size_t src_stride_y_, std::size_t src_stride_z_, + std::size_t dst_stride_y_, std::size_t dst_stride_z_) + : src_x(src_x_), + src_y(src_y_), + src_z(src_z_), + dst_x(dst_x_), + dst_y(dst_y_), + dst_z(dst_z_), + size_x(size_x_), + size_y(size_y_), + size_z(size_z_), + src_stride_y(src_stride_y_), + src_stride_z(src_stride_z_), + dst_stride_y(dst_stride_y_), + dst_stride_z(dst_stride_z_) {} + rect_props(std::size_t src_x_, std::size_t src_y_, std::size_t dst_x_, + std::size_t dst_y_, std::size_t size_x_, std::size_t size_y_, + std::size_t src_stride_y_, std::size_t dst_stride_y_) + : src_x(src_x_), + src_y(src_y_), + src_z(0), + dst_x(dst_x_), + dst_y(dst_y_), + dst_z(0), + size_x(size_x_), + size_y(size_y_), + size_z(1), + src_stride_y(src_stride_y_), + src_stride_z(0), + dst_stride_y(dst_stride_y_), + dst_stride_z(0) {} + rect_props() + : src_x(0), + src_y(0), + src_z(0), + dst_x(0), + dst_y(0), + dst_z(0), + size_x(1), + size_y(1), + size_z(1), + src_stride_y(0), + src_stride_z(0), + dst_stride_y(0), + dst_stride_z(0) {} - /////////////////////////////////////////////////////////////////////// - template - void save(Archive& ar, const unsigned int version) const - { - ar << src_x; - ar << src_y; - ar << src_z; - ar << dst_x; - ar << dst_y; - ar << dst_z; - ar << size_x; - ar << size_y; - ar << size_z; - ar << src_stride_y; - ar << src_stride_z; - ar << dst_stride_y; - ar << dst_stride_z; - } + private: + // serialization support + friend class hpx::serialization::access; - /////////////////////////////////////////////////////////////////////// - template - void load(Archive& ar, const unsigned int version) - { - ar >> src_x; - ar >> src_y; - ar >> src_z; - ar >> dst_x; - ar >> dst_y; - ar >> dst_z; - ar >> size_x; - ar >> size_y; - ar >> size_z; - ar >> src_stride_y; - ar >> src_stride_z; - ar >> dst_stride_y; - ar >> dst_stride_z; - } + /////////////////////////////////////////////////////////////////////// + template + void save(Archive& ar, const unsigned int version) const { + ar << src_x; + ar << src_y; + ar << src_z; + ar << dst_x; + ar << dst_y; + ar << dst_z; + ar << size_x; + ar << size_y; + ar << size_z; + ar << src_stride_y; + ar << src_stride_z; + ar << dst_stride_y; + ar << dst_stride_z; + } - HPX_SERIALIZATION_SPLIT_MEMBER() - }; + /////////////////////////////////////////////////////////////////////// + template + void load(Archive& ar, const unsigned int version) { + ar >> src_x; + ar >> src_y; + ar >> src_z; + ar >> dst_x; + ar >> dst_y; + ar >> dst_z; + ar >> size_x; + ar >> size_y; + ar >> size_z; + ar >> src_stride_y; + ar >> src_stride_z; + ar >> dst_stride_y; + ar >> dst_stride_z; + } -}} - - -#endif// HPX_OPENCL_UTIL_RECT_PROPS_HPP_ + HPX_SERIALIZATION_SPLIT_MEMBER() +}; +} // namespace opencl +} // namespace hpx +#endif // HPX_OPENCL_UTIL_RECT_PROPS_HPP_ diff --git a/tests/performance/opencl/bandwith.cpp b/tests/performance/opencl/bandwith.cpp index ab84c87b..55cc6e0c 100644 --- a/tests/performance/opencl/bandwith.cpp +++ b/tests/performance/opencl/bandwith.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "util/cl_tests.hpp" #include "util/testresults.hpp" @@ -14,460 +13,368 @@ typedef hpx::serialization::serialize_buffer buffer_type; - // global variables static buffer_type test_data; -buffer_type -loopback(buffer_type buf){ - return buf; -} +buffer_type loopback(buffer_type buf) { return buf; } HPX_PLAIN_ACTION(loopback, loopback_action); -static void ensure_valid( buffer_type result ) -{ - if( result.size() != test_data.size() ){ - die("result size is wrong!"); - } +static void ensure_valid(buffer_type result) { + if (result.size() != test_data.size()) { + die("result size is wrong!"); + } - for( std::size_t i = 0; i < result.size(); i++ ){ - if(test_data[i] != result[i]) - die("result is wrong!"); - } + for (std::size_t i = 0; i < result.size(); i++) { + if (test_data[i] != result[i]) die("result is wrong!"); + } } - -static void run_opencl_local_test( hpx::opencl::device device ) -{ - - hpx::opencl::buffer buffer = - device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - - auto device_ptr = - hpx::get_ptr(device.get_id()).get(); - auto buffer_ptr = - hpx::get_ptr(buffer.get_id()).get(); - - - cl_context context = device_ptr->get_context(); - cl_command_queue write_command_queue = device_ptr->get_write_command_queue(); - cl_command_queue read_command_queue = device_ptr->get_read_command_queue(); - cl_mem buffer_id = buffer_ptr->get_cl_mem(); - - std::map atts; - atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - results.start_test("OpenCL_local_host_to_local_device", "GB/s", atts); - - const std::size_t data_transfer_per_test = - test_data.size() * 2 * num_iterations; - - - double throughput_gbps = 0.0; - while(results.needs_more_testing()) - { - // initialize the buffer - buffer_type buf ( test_data.size() ); - std::copy(test_data.data(), test_data.data()+test_data.size(), buf.data()); - - cl_int err; - - err = clFinish(read_command_queue); - cl_ensure(err, "clFinish()"); - err = clFinish(write_command_queue); - cl_ensure(err, "clFinish()"); - - hpx::util::high_resolution_timer walltime; - for(std::size_t it = 0; it < num_iterations; it ++) - { - err = clEnqueueWriteBuffer( write_command_queue, - buffer_id, - CL_TRUE, - 0, - buf.size(), - buf.data(), - 0, NULL, NULL ); - cl_ensure(err, "clEnqueueWriteBuffer()"); - - err = clEnqueueReadBuffer( read_command_queue, - buffer_id, - CL_TRUE, - 0, - buf.size(), - buf.data(), - 0, NULL, NULL ); - cl_ensure(err, "clEnqueueReadBuffer()"); - } - - - const double duration = walltime.elapsed(); - ensure_valid(buf); - - const double throughput = data_transfer_per_test / duration; - throughput_gbps = throughput/(1024.0*1024.0*1024.0); - - results.add(throughput_gbps); +static void run_opencl_local_test(hpx::opencl::device device) { + hpx::opencl::buffer buffer = + device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + + auto device_ptr = + hpx::get_ptr(device.get_id()).get(); + auto buffer_ptr = + hpx::get_ptr(buffer.get_id()).get(); + + cl_context context = device_ptr->get_context(); + cl_command_queue write_command_queue = device_ptr->get_write_command_queue(); + cl_command_queue read_command_queue = device_ptr->get_read_command_queue(); + cl_mem buffer_id = buffer_ptr->get_cl_mem(); + + std::map atts; + atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + results.start_test("OpenCL_local_host_to_local_device", "GB/s", atts); + + const std::size_t data_transfer_per_test = + test_data.size() * 2 * num_iterations; + + double throughput_gbps = 0.0; + while (results.needs_more_testing()) { + // initialize the buffer + buffer_type buf(test_data.size()); + std::copy(test_data.data(), test_data.data() + test_data.size(), + buf.data()); + + cl_int err; + + err = clFinish(read_command_queue); + cl_ensure(err, "clFinish()"); + err = clFinish(write_command_queue); + cl_ensure(err, "clFinish()"); + + hpx::util::high_resolution_timer walltime; + for (std::size_t it = 0; it < num_iterations; it++) { + err = clEnqueueWriteBuffer(write_command_queue, buffer_id, CL_TRUE, 0, + buf.size(), buf.data(), 0, NULL, NULL); + cl_ensure(err, "clEnqueueWriteBuffer()"); + + err = clEnqueueReadBuffer(read_command_queue, buffer_id, CL_TRUE, 0, + buf.size(), buf.data(), 0, NULL, NULL); + cl_ensure(err, "clEnqueueReadBuffer()"); } + const double duration = walltime.elapsed(); + ensure_valid(buf); + const double throughput = data_transfer_per_test / duration; + throughput_gbps = throughput / (1024.0 * 1024.0 * 1024.0); + results.add(throughput_gbps); + } } - -static void run_opencl_local_send_test( hpx::opencl::device device ) -{ - - hpx::opencl::buffer buffer1 = - device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - hpx::opencl::buffer buffer2 = - device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - - auto device_ptr = - hpx::get_ptr(device.get_id()).get(); - auto buffer1_ptr = - hpx::get_ptr(buffer1.get_id()).get(); - auto buffer2_ptr = - hpx::get_ptr(buffer2.get_id()).get(); - - - cl_context context = device_ptr->get_context(); - cl_command_queue command_queue = device_ptr->get_write_command_queue(); - cl_mem buffer1_id = buffer1_ptr->get_cl_mem(); - cl_mem buffer2_id = buffer2_ptr->get_cl_mem(); - - std::map atts; - atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - results.start_test("OpenCL_local_device_to_local_device", "GB/s", atts); - - const std::size_t data_transfer_per_test = - test_data.size() * 2 * num_iterations; - - - double throughput_gbps = 0.0; - while(results.needs_more_testing()) - { - // initialize the buffer - buffer_type buf ( test_data.size() ); - std::copy(test_data.data(), test_data.data()+test_data.size(), buf.data()); - - cl_int err; - cl_event event, new_event; - - err = clEnqueueWriteBuffer( command_queue, - buffer1_id, - CL_TRUE, - 0, - buf.size(), - buf.data(), - 0, NULL, &event ); - cl_ensure(err, "clEnqueueWriteBuffer()"); - - err = clFinish(command_queue); - cl_ensure(err, "clFinish()"); - - hpx::util::high_resolution_timer walltime; - for(std::size_t it = 0; it < num_iterations; it ++) - { - err = clEnqueueCopyBuffer( command_queue, - buffer1_id, - buffer2_id, - 0, 0, - buf.size(), - 1, &event, - &new_event ); - cl_ensure(err, "clEnqueueCopyBuffer()"); - - err = clReleaseEvent(event); - cl_ensure(err, "clReleaseEvent()"); - event = new_event; - - err = clEnqueueCopyBuffer( command_queue, - buffer2_id, - buffer1_id, - 0, 0, - buf.size(), - 1, &event, - &new_event ); - cl_ensure(err, "clEnqueueCopyBuffer()"); - - err = clReleaseEvent(event); - cl_ensure(err, "clReleaseEvent()"); - event = new_event; - } - - err = clWaitForEvents(1, &event); - cl_ensure(err, "clWaitForEvents()"); - - const double duration = walltime.elapsed(); - - err = clReleaseEvent(event); - cl_ensure(err, "clReleaseEvent()"); - - err = clEnqueueReadBuffer( command_queue, - buffer1_id, - CL_TRUE, - 0, - buf.size(), - buf.data(), - 0, NULL, NULL ); - cl_ensure(err, "clEnqueueReadBuffer()"); - ensure_valid(buf); - - err = clEnqueueReadBuffer( command_queue, - buffer2_id, - CL_TRUE, - 0, - buf.size(), - buf.data(), - 0, NULL, NULL ); - cl_ensure(err, "clEnqueueReadBuffer()"); - ensure_valid(buf); - - const double throughput = data_transfer_per_test / duration; - throughput_gbps = throughput/(1024.0*1024.0*1024.0); - - results.add(throughput_gbps); +static void run_opencl_local_send_test(hpx::opencl::device device) { + hpx::opencl::buffer buffer1 = + device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + hpx::opencl::buffer buffer2 = + device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + + auto device_ptr = + hpx::get_ptr(device.get_id()).get(); + auto buffer1_ptr = + hpx::get_ptr(buffer1.get_id()).get(); + auto buffer2_ptr = + hpx::get_ptr(buffer2.get_id()).get(); + + cl_context context = device_ptr->get_context(); + cl_command_queue command_queue = device_ptr->get_write_command_queue(); + cl_mem buffer1_id = buffer1_ptr->get_cl_mem(); + cl_mem buffer2_id = buffer2_ptr->get_cl_mem(); + + std::map atts; + atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + results.start_test("OpenCL_local_device_to_local_device", "GB/s", atts); + + const std::size_t data_transfer_per_test = + test_data.size() * 2 * num_iterations; + + double throughput_gbps = 0.0; + while (results.needs_more_testing()) { + // initialize the buffer + buffer_type buf(test_data.size()); + std::copy(test_data.data(), test_data.data() + test_data.size(), + buf.data()); + + cl_int err; + cl_event event, new_event; + + err = clEnqueueWriteBuffer(command_queue, buffer1_id, CL_TRUE, 0, + buf.size(), buf.data(), 0, NULL, &event); + cl_ensure(err, "clEnqueueWriteBuffer()"); + + err = clFinish(command_queue); + cl_ensure(err, "clFinish()"); + + hpx::util::high_resolution_timer walltime; + for (std::size_t it = 0; it < num_iterations; it++) { + err = clEnqueueCopyBuffer(command_queue, buffer1_id, buffer2_id, 0, 0, + buf.size(), 1, &event, &new_event); + cl_ensure(err, "clEnqueueCopyBuffer()"); + + err = clReleaseEvent(event); + cl_ensure(err, "clReleaseEvent()"); + event = new_event; + + err = clEnqueueCopyBuffer(command_queue, buffer2_id, buffer1_id, 0, 0, + buf.size(), 1, &event, &new_event); + cl_ensure(err, "clEnqueueCopyBuffer()"); + + err = clReleaseEvent(event); + cl_ensure(err, "clReleaseEvent()"); + event = new_event; } + err = clWaitForEvents(1, &event); + cl_ensure(err, "clWaitForEvents()"); + const double duration = walltime.elapsed(); -} - - -static void run_hpxcl_send_test( hpx::opencl::device device1, - hpx::opencl::device device2 ) -{ - - hpx::opencl::buffer buffer1 = - device1.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - hpx::opencl::buffer buffer2 = - device2.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - - - - std::string device1_location = "remote"; - if(hpx::get_colocation_id(hpx::launch::sync, device1.get_id()) == hpx::find_here()) - device1_location = "local"; - std::string device2_location = "remote"; - if(hpx::get_colocation_id(hpx::launch::sync, device2.get_id()) == hpx::find_here()) - device2_location = "local"; - - std::map atts; - atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - results.start_test("HPXCL_" + device1_location + "_device_to_" + - device2_location + "_device", - "GB/s", atts); - - const std::size_t data_transfer_per_test = - test_data.size() * 2 * num_iterations; - - double throughput_gbps = 0.0; - while(results.needs_more_testing()) - { - // initialize the buffer - hpx::future fut = buffer1.enqueue_write(0, test_data); - - fut.wait(); - - // RUN! - hpx::util::high_resolution_timer walltime; - for(std::size_t it = 0; it < num_iterations; it ++) - { - // Copy from buffer1 to buffer2 - auto send_result = - buffer1.enqueue_send(buffer2, 0, 0, test_data.size(), fut); - - // Copy from buffer2 to buffer1 - auto send_result2 = - buffer2.enqueue_send(buffer1, 0, 0, test_data.size(), - send_result.dst_future); - fut = std::move(send_result2.dst_future); - } - - // wait for last send to finish - fut.get(); + err = clReleaseEvent(event); + cl_ensure(err, "clReleaseEvent()"); - // Measure elapsed time - const double duration = walltime.elapsed(); + err = clEnqueueReadBuffer(command_queue, buffer1_id, CL_TRUE, 0, buf.size(), + buf.data(), 0, NULL, NULL); + cl_ensure(err, "clEnqueueReadBuffer()"); + ensure_valid(buf); - // Check if data is still valid - ensure_valid(buffer1.enqueue_read(0, test_data.size()).get()); + err = clEnqueueReadBuffer(command_queue, buffer2_id, CL_TRUE, 0, buf.size(), + buf.data(), 0, NULL, NULL); + cl_ensure(err, "clEnqueueReadBuffer()"); + ensure_valid(buf); - // Calculate throughput - const double throughput = data_transfer_per_test / duration; - throughput_gbps = throughput/(1024.0*1024.0*1024.0); - - results.add(throughput_gbps); - } + const double throughput = data_transfer_per_test / duration; + throughput_gbps = throughput / (1024.0 * 1024.0 * 1024.0); + results.add(throughput_gbps); + } } -static void run_hpxcl_read_write_test( hpx::opencl::device device ) -{ - - hpx::opencl::buffer buffer = - device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - - - - std::map atts; - atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - if(hpx::get_colocation_id(hpx::launch::sync, device.get_id()) == hpx::find_here()) - results.start_test("HPXCL_local_host_to_local_device", "GB/s", atts); - else - results.start_test("HPXCL_local_host_to_remote_device", "GB/s", atts); - - const std::size_t data_transfer_per_test = - test_data.size() * 2 * num_iterations; - - double throughput_gbps = 0.0; - while(results.needs_more_testing()) - { - // initialize the buffer - buffer_type read_buf ( test_data.size() ); - buffer_type write_buf ( test_data.size() ); - std::copy( test_data.data(), test_data.data()+test_data.size(), - write_buf.data() ); - - // RUN! - hpx::util::high_resolution_timer walltime; - for(std::size_t it = 0; it < num_iterations; it ++) - { - // Copy to device - auto fut_tmp = buffer.enqueue_write(0, write_buf); - - // Copy from device - auto fut = buffer.enqueue_read(0, read_buf, fut_tmp); - - // Swap read and write buffer - fut.get(); - std::swap(read_buf, write_buf); - } - - // Measure elapsed time - const double duration = walltime.elapsed(); - - // Check if data is still valid - ensure_valid(write_buf); - - // Calculate throughput - const double throughput = data_transfer_per_test / duration; - throughput_gbps = throughput/(1024.0*1024.0*1024.0); - - results.add(throughput_gbps); +static void run_hpxcl_send_test(hpx::opencl::device device1, + hpx::opencl::device device2) { + hpx::opencl::buffer buffer1 = + device1.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + hpx::opencl::buffer buffer2 = + device2.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + + std::string device1_location = "remote"; + if (hpx::get_colocation_id(hpx::launch::sync, device1.get_id()) == + hpx::find_here()) + device1_location = "local"; + std::string device2_location = "remote"; + if (hpx::get_colocation_id(hpx::launch::sync, device2.get_id()) == + hpx::find_here()) + device2_location = "local"; + + std::map atts; + atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + results.start_test("HPXCL_" + device1_location + "_device_to_" + + device2_location + "_device", + "GB/s", atts); + + const std::size_t data_transfer_per_test = + test_data.size() * 2 * num_iterations; + + double throughput_gbps = 0.0; + while (results.needs_more_testing()) { + // initialize the buffer + hpx::future fut = buffer1.enqueue_write(0, test_data); + + fut.wait(); + + // RUN! + hpx::util::high_resolution_timer walltime; + for (std::size_t it = 0; it < num_iterations; it++) { + // Copy from buffer1 to buffer2 + auto send_result = + buffer1.enqueue_send(buffer2, 0, 0, test_data.size(), fut); + + // Copy from buffer2 to buffer1 + auto send_result2 = buffer2.enqueue_send(buffer1, 0, 0, test_data.size(), + send_result.dst_future); + fut = std::move(send_result2.dst_future); } -} - -static void run_hpx_loopback_test( hpx::naming::id_type target_location ) -{ - - std::map atts; - atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - results.start_test("HPX_local_host_to_remote_host", "GB/s", atts); + // wait for last send to finish + fut.get(); - const std::size_t data_transfer_per_test = - test_data.size() * 2 * num_iterations; + // Measure elapsed time + const double duration = walltime.elapsed(); + // Check if data is still valid + ensure_valid(buffer1.enqueue_read(0, test_data.size()).get()); - double throughput_gbps = 0.0; - while(results.needs_more_testing()) - { - hpx::util::high_resolution_timer walltime; - - buffer_type test_data_copy = test_data; - hpx::future tmp_result = - hpx::make_ready_future(std::move(test_data_copy)); - for(std::size_t it = 0; it < num_iterations; it ++) - { - tmp_result = tmp_result.then( - [&target_location](hpx::future && result){ - return hpx::async(target_location, result.get()); - }); - } - buffer_type result_data = tmp_result.get(); - - // measure time - const double duration = walltime.elapsed(); - - // make sure result is valid - ensure_valid(result_data); - - // calculate the throughput - const double throughput = data_transfer_per_test / duration; - throughput_gbps = throughput/(1024.0*1024.0*1024.0); - - results.add(throughput_gbps); - } + // Calculate throughput + const double throughput = data_transfer_per_test / duration; + throughput_gbps = throughput / (1024.0 * 1024.0 * 1024.0); + results.add(throughput_gbps); + } } - - -static void cl_test(hpx::opencl::device local_device, - hpx::opencl::device remote_device, - bool distributed) -{ - - if(testdata_size == 0) - testdata_size = static_cast(1) << 20; - if(num_iterations == 0) - num_iterations = 50; - - // Get localities - hpx::naming::id_type remote_location = - hpx::get_colocation_id(hpx::launch::sync, remote_device.get_id()); - hpx::naming::id_type local_location = - hpx::get_colocation_id(hpx::launch::sync, local_device.get_id()); - if(local_location != hpx::find_here()) - die("Internal ERROR! local_location is not here."); - - // Generate random vector - std::cerr << "Generating test data ..." << std::endl; - test_data = buffer_type ( testdata_size ); - std::cerr << "Test data generated." << std::endl; - for(std::size_t i = 0; i < testdata_size; i++){ - test_data[i] = static_cast(rand()); +static void run_hpxcl_read_write_test(hpx::opencl::device device) { + hpx::opencl::buffer buffer = + device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + + std::map atts; + atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + if (hpx::get_colocation_id(hpx::launch::sync, device.get_id()) == + hpx::find_here()) + results.start_test("HPXCL_local_host_to_local_device", "GB/s", atts); + else + results.start_test("HPXCL_local_host_to_remote_device", "GB/s", atts); + + const std::size_t data_transfer_per_test = + test_data.size() * 2 * num_iterations; + + double throughput_gbps = 0.0; + while (results.needs_more_testing()) { + // initialize the buffer + buffer_type read_buf(test_data.size()); + buffer_type write_buf(test_data.size()); + std::copy(test_data.data(), test_data.data() + test_data.size(), + write_buf.data()); + + // RUN! + hpx::util::high_resolution_timer walltime; + for (std::size_t it = 0; it < num_iterations; it++) { + // Copy to device + auto fut_tmp = buffer.enqueue_write(0, write_buf); + + // Copy from device + auto fut = buffer.enqueue_read(0, read_buf, fut_tmp); + + // Swap read and write buffer + fut.get(); + std::swap(read_buf, write_buf); } + // Measure elapsed time + const double duration = walltime.elapsed(); - // Run local opencl test - run_opencl_local_test(local_device); + // Check if data is still valid + ensure_valid(write_buf); - // Run local opencl send test - run_opencl_local_send_test(local_device); + // Calculate throughput + const double throughput = data_transfer_per_test / duration; + throughput_gbps = throughput / (1024.0 * 1024.0 * 1024.0); - if(distributed){ - // Run hpx loopback test - run_hpx_loopback_test(remote_location); - } - - // Run local hpxcl test - run_hpxcl_read_write_test(local_device); + results.add(throughput_gbps); + } +} - if(distributed){ - // Run remote hpxcl test - run_hpxcl_read_write_test(remote_device); +static void run_hpx_loopback_test(hpx::naming::id_type target_location) { + std::map atts; + atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + results.start_test("HPX_local_host_to_remote_host", "GB/s", atts); + + const std::size_t data_transfer_per_test = + test_data.size() * 2 * num_iterations; + + double throughput_gbps = 0.0; + while (results.needs_more_testing()) { + hpx::util::high_resolution_timer walltime; + + buffer_type test_data_copy = test_data; + hpx::future tmp_result = + hpx::make_ready_future(std::move(test_data_copy)); + for (std::size_t it = 0; it < num_iterations; it++) { + tmp_result = tmp_result.then( + [&target_location](hpx::future&& result) { + return hpx::async(target_location, result.get()); + }); } + buffer_type result_data = tmp_result.get(); - // Run hpxcl send local-local test - run_hpxcl_send_test(local_device, local_device); + // measure time + const double duration = walltime.elapsed(); - if(distributed){ - // Run hpxcl send remote-remote test - run_hpxcl_send_test(remote_device, remote_device); - - // Run hpxcl send local-remote test - run_hpxcl_send_test(local_device, remote_device); - } + // make sure result is valid + ensure_valid(result_data); + // calculate the throughput + const double throughput = data_transfer_per_test / duration; + throughput_gbps = throughput / (1024.0 * 1024.0 * 1024.0); + results.add(throughput_gbps); + } } - - +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device remote_device, bool distributed) { + if (testdata_size == 0) testdata_size = static_cast(1) << 20; + if (num_iterations == 0) num_iterations = 50; + + // Get localities + hpx::naming::id_type remote_location = + hpx::get_colocation_id(hpx::launch::sync, remote_device.get_id()); + hpx::naming::id_type local_location = + hpx::get_colocation_id(hpx::launch::sync, local_device.get_id()); + if (local_location != hpx::find_here()) + die("Internal ERROR! local_location is not here."); + + // Generate random vector + std::cerr << "Generating test data ..." << std::endl; + test_data = buffer_type(testdata_size); + std::cerr << "Test data generated." << std::endl; + for (std::size_t i = 0; i < testdata_size; i++) { + test_data[i] = static_cast(rand()); + } + + // Run local opencl test + run_opencl_local_test(local_device); + + // Run local opencl send test + run_opencl_local_send_test(local_device); + + if (distributed) { + // Run hpx loopback test + run_hpx_loopback_test(remote_location); + } + + // Run local hpxcl test + run_hpxcl_read_write_test(local_device); + + if (distributed) { + // Run remote hpxcl test + run_hpxcl_read_write_test(remote_device); + } + + // Run hpxcl send local-local test + run_hpxcl_send_test(local_device, local_device); + + if (distributed) { + // Run hpxcl send remote-remote test + run_hpxcl_send_test(remote_device, remote_device); + + // Run hpxcl send local-remote test + run_hpxcl_send_test(local_device, remote_device); + } +} diff --git a/tests/performance/opencl/overhead.cpp b/tests/performance/opencl/overhead.cpp index e8a8f92a..788947c3 100644 --- a/tests/performance/opencl/overhead.cpp +++ b/tests/performance/opencl/overhead.cpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "util/cl_tests.hpp" #include "util/testresults.hpp" @@ -14,364 +13,315 @@ typedef hpx::serialization::serialize_buffer buffer_type; - // global variables static buffer_type test_data; -buffer_type -loopback(buffer_type buf){ - return buf; -} +buffer_type loopback(buffer_type buf) { return buf; } HPX_PLAIN_ACTION(loopback, loopback_action); -static void ensure_valid( buffer_type result ) -{ - if( result.size() != test_data.size() ){ - die("result size is wrong!"); - } +static void ensure_valid(buffer_type result) { + if (result.size() != test_data.size()) { + die("result size is wrong!"); + } - for( std::size_t i = 0; i < result.size(); i++ ){ - if(test_data[i] != result[i]) - die("result is wrong!"); - } + for (std::size_t i = 0; i < result.size(); i++) { + if (test_data[i] != result[i]) die("result is wrong!"); + } } -static void send_test( hpx::opencl::device device1, - hpx::opencl::device device2, - bool sync ) -{ - - hpx::opencl::buffer buffer1 = - device1.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - hpx::opencl::buffer buffer2 = - device2.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - - std::string name = "send_"; - - if(hpx::get_colocation_id(hpx::launch::sync, device1.get_id()) == hpx::find_here()) - name += "local_"; - else - name += "remote_"; - - if(hpx::get_colocation_id(hpx::launch::sync, device2.get_id()) == hpx::find_here()) - name += "local"; - else - name += "remote"; +static void send_test(hpx::opencl::device device1, hpx::opencl::device device2, + bool sync) { + hpx::opencl::buffer buffer1 = + device1.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + hpx::opencl::buffer buffer2 = + device2.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - if(sync) - name += "_sync"; + std::string name = "send_"; - std::map atts; -// atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - results.start_test(name, "ms", atts); + if (hpx::get_colocation_id(hpx::launch::sync, device1.get_id()) == + hpx::find_here()) + name += "local_"; + else + name += "remote_"; + if (hpx::get_colocation_id(hpx::launch::sync, device2.get_id()) == + hpx::find_here()) + name += "local"; + else + name += "remote"; - while(results.needs_more_testing()) - { - // initialize the buffer - hpx::future fut = buffer1.enqueue_write(0, test_data); + if (sync) name += "_sync"; - fut.wait(); + std::map atts; + // atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + results.start_test(name, "ms", atts); - // RUN! - hpx::util::high_resolution_timer walltime; - for(std::size_t it = 0; it < num_iterations; it ++) - { - // Copy from buffer1 to buffer2 - auto send_result = - buffer1.enqueue_send(buffer2, 0, 0, test_data.size(), fut); + while (results.needs_more_testing()) { + // initialize the buffer + hpx::future fut = buffer1.enqueue_write(0, test_data); - fut = std::move(send_result.dst_future); + fut.wait(); - if(sync) - fut.wait(); - } + // RUN! + hpx::util::high_resolution_timer walltime; + for (std::size_t it = 0; it < num_iterations; it++) { + // Copy from buffer1 to buffer2 + auto send_result = + buffer1.enqueue_send(buffer2, 0, 0, test_data.size(), fut); - // wait for last send to finish - fut.get(); + fut = std::move(send_result.dst_future); - // Measure elapsed time - const double duration = walltime.elapsed(); - - // Check if data is still valid - ensure_valid(buffer2.enqueue_read(0, test_data.size()).get()); - - // Calculate overhead - const double overhead = duration * 1000.0 / num_iterations; - - results.add(overhead); + if (sync) fut.wait(); } -} - -static void wait_test( hpx::opencl::device device ) -{ - - hpx::opencl::buffer buffer = - device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - - - std::string name = "wait_"; - + // wait for last send to finish + fut.get(); - if(hpx::get_colocation_id(hpx::launch::sync, device.get_id()) == hpx::find_here()) - name += "local"; - else - name += "remote"; + // Measure elapsed time + const double duration = walltime.elapsed(); - std::map atts; -// atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - results.start_test(name, "ms", atts); + // Check if data is still valid + ensure_valid(buffer2.enqueue_read(0, test_data.size()).get()); - while(results.needs_more_testing()) - { - // initialize the buffer - buffer_type write_buf1 ( test_data.size() ); - buffer_type write_buf2 ( test_data.size() ); - std::copy( test_data.data(), test_data.data()+test_data.size(), - write_buf1.data() ); - std::copy( test_data.data(), test_data.data()+test_data.size(), - write_buf2.data() ); - - double duration = 0.0; - - // RUN! - for(std::size_t it = 0; it < num_iterations; it ++) - { - // Copy to device - auto fut1 = buffer.enqueue_write(0, write_buf1); - - // Copy to device again, with dependency to fut1 - auto fut2 = buffer.enqueue_write(0, write_buf2, fut1); - - // wait for fut2 - fut2.get(); - - // fut1 is definitely ready now, but unchecked. - // now measure how long it takes to check fut1 - hpx::util::high_resolution_timer walltime; - fut1.get(); - duration += walltime.elapsed(); - - } - - // Calculate overhead - const double overhead = duration * 1000.0 / num_iterations; - - results.add(overhead); - } + // Calculate overhead + const double overhead = duration * 1000.0 / num_iterations; + results.add(overhead); + } } -static void write_test( hpx::opencl::device device , bool sync ) -{ - - hpx::opencl::buffer buffer = - device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - - - std::string name = "write_"; - - - if(hpx::get_colocation_id(hpx::launch::sync, device.get_id()) == hpx::find_here()) - name += "local"; - else - name += "remote"; - - if(sync) - name += "_sync"; - - std::map atts; -// atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - results.start_test(name, "ms", atts); - - while(results.needs_more_testing()) - { - // initialize the buffer - buffer_type write_buf ( test_data.size() ); - std::copy( test_data.data(), test_data.data()+test_data.size(), - write_buf.data() ); - - hpx::future fut; - bool is_first_iteration = true; - - // RUN! - hpx::util::high_resolution_timer walltime; - for(std::size_t it = 0; it < num_iterations; it ++) - { - if(is_first_iteration){ - // Copy to device - fut = buffer.enqueue_write(0, write_buf); - is_first_iteration = false; - } else { - // Copy to device - fut = buffer.enqueue_write(0, write_buf, fut); - } - - // wait - if(sync) - fut.wait(); - } - - // wait for finish - fut.get(); - - // Measure elapsed time - const double duration = walltime.elapsed(); - - // Check if data is still valid - ensure_valid(buffer.enqueue_read(0, test_data.size()).get()); - - // Calculate overhead - const double overhead = duration * 1000.0 / num_iterations; - - results.add(overhead); +static void wait_test(hpx::opencl::device device) { + hpx::opencl::buffer buffer = + device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + + std::string name = "wait_"; + + if (hpx::get_colocation_id(hpx::launch::sync, device.get_id()) == + hpx::find_here()) + name += "local"; + else + name += "remote"; + + std::map atts; + // atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + results.start_test(name, "ms", atts); + + while (results.needs_more_testing()) { + // initialize the buffer + buffer_type write_buf1(test_data.size()); + buffer_type write_buf2(test_data.size()); + std::copy(test_data.data(), test_data.data() + test_data.size(), + write_buf1.data()); + std::copy(test_data.data(), test_data.data() + test_data.size(), + write_buf2.data()); + + double duration = 0.0; + + // RUN! + for (std::size_t it = 0; it < num_iterations; it++) { + // Copy to device + auto fut1 = buffer.enqueue_write(0, write_buf1); + + // Copy to device again, with dependency to fut1 + auto fut2 = buffer.enqueue_write(0, write_buf2, fut1); + + // wait for fut2 + fut2.get(); + + // fut1 is definitely ready now, but unchecked. + // now measure how long it takes to check fut1 + hpx::util::high_resolution_timer walltime; + fut1.get(); + duration += walltime.elapsed(); } -} - + // Calculate overhead + const double overhead = duration * 1000.0 / num_iterations; -static void read_test( hpx::opencl::device device , bool sync ) -{ - - hpx::opencl::buffer buffer = - device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); - - - std::string name = "read_"; - - - if(hpx::get_colocation_id(hpx::launch::sync, device.get_id()) == hpx::find_here()) - name += "local"; - else - name += "remote"; - - if(sync) - name += "_sync"; + results.add(overhead); + } +} - std::map atts; -// atts["size"] = std::to_string(test_data.size()); - atts["iterations"] = std::to_string(num_iterations); - results.start_test(name, "ms", atts); +static void write_test(hpx::opencl::device device, bool sync) { + hpx::opencl::buffer buffer = + device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + + std::string name = "write_"; + + if (hpx::get_colocation_id(hpx::launch::sync, device.get_id()) == + hpx::find_here()) + name += "local"; + else + name += "remote"; + + if (sync) name += "_sync"; + + std::map atts; + // atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + results.start_test(name, "ms", atts); + + while (results.needs_more_testing()) { + // initialize the buffer + buffer_type write_buf(test_data.size()); + std::copy(test_data.data(), test_data.data() + test_data.size(), + write_buf.data()); + + hpx::future fut; + bool is_first_iteration = true; + + // RUN! + hpx::util::high_resolution_timer walltime; + for (std::size_t it = 0; it < num_iterations; it++) { + if (is_first_iteration) { + // Copy to device + fut = buffer.enqueue_write(0, write_buf); + is_first_iteration = false; + } else { + // Copy to device + fut = buffer.enqueue_write(0, write_buf, fut); + } + + // wait + if (sync) fut.wait(); + } - while(results.needs_more_testing()) - { - // initialize the buffer - buffer_type write_buf ( test_data.size() ); - std::copy( test_data.data(), test_data.data()+test_data.size(), - write_buf.data() ); + // wait for finish + fut.get(); - buffer.enqueue_write(0, write_buf).get(); + // Measure elapsed time + const double duration = walltime.elapsed(); - hpx::future fut; - bool is_first_iteration = true; + // Check if data is still valid + ensure_valid(buffer.enqueue_read(0, test_data.size()).get()); - // RUN! - hpx::util::high_resolution_timer walltime; - for(std::size_t it = 0; it < num_iterations; it ++) - { - if(is_first_iteration){ - // Copy from device - fut = buffer.enqueue_read(0, write_buf); - is_first_iteration = false; - } else { - // Copy from device - fut = buffer.enqueue_read(0, write_buf, fut); - } + // Calculate overhead + const double overhead = duration * 1000.0 / num_iterations; - // wait - if(sync) - fut.wait(); - } + results.add(overhead); + } +} - // wait for finish - write_buf = fut.get(); +static void read_test(hpx::opencl::device device, bool sync) { + hpx::opencl::buffer buffer = + device.create_buffer(CL_MEM_READ_WRITE, test_data.size()); + + std::string name = "read_"; + + if (hpx::get_colocation_id(hpx::launch::sync, device.get_id()) == + hpx::find_here()) + name += "local"; + else + name += "remote"; + + if (sync) name += "_sync"; + + std::map atts; + // atts["size"] = std::to_string(test_data.size()); + atts["iterations"] = std::to_string(num_iterations); + results.start_test(name, "ms", atts); + + while (results.needs_more_testing()) { + // initialize the buffer + buffer_type write_buf(test_data.size()); + std::copy(test_data.data(), test_data.data() + test_data.size(), + write_buf.data()); + + buffer.enqueue_write(0, write_buf).get(); + + hpx::future fut; + bool is_first_iteration = true; + + // RUN! + hpx::util::high_resolution_timer walltime; + for (std::size_t it = 0; it < num_iterations; it++) { + if (is_first_iteration) { + // Copy from device + fut = buffer.enqueue_read(0, write_buf); + is_first_iteration = false; + } else { + // Copy from device + fut = buffer.enqueue_read(0, write_buf, fut); + } + + // wait + if (sync) fut.wait(); + } - // Measure elapsed time - const double duration = walltime.elapsed(); + // wait for finish + write_buf = fut.get(); - // Check if data is still valid - ensure_valid(write_buf); + // Measure elapsed time + const double duration = walltime.elapsed(); - // Calculate throughput - const double overhead = duration * 1000.0 / num_iterations; + // Check if data is still valid + ensure_valid(write_buf); - results.add(overhead); - } + // Calculate throughput + const double overhead = duration * 1000.0 / num_iterations; + results.add(overhead); + } } static void cl_test(hpx::opencl::device local_device, - hpx::opencl::device remote_device, - bool distributed ) -{ - - testdata_size = 1; - - if(num_iterations == 0) - num_iterations = 50; - - // Get localities - hpx::naming::id_type remote_location = - hpx::get_colocation_id(hpx::launch::sync, remote_device.get_id()); - hpx::naming::id_type local_location = - hpx::get_colocation_id(hpx::launch::sync, local_device.get_id()); - if(local_location != hpx::find_here()) - die("Internal ERROR! local_location is not here."); - - // Generate random vector - std::cerr << "Generating test data ..." << std::endl; - test_data = buffer_type ( testdata_size ); - std::cerr << "Test data generated." << std::endl; - for(std::size_t i = 0; i < testdata_size; i++){ - test_data[i] = static_cast(rand()); - } - - + hpx::opencl::device remote_device, bool distributed) { + testdata_size = 1; + + if (num_iterations == 0) num_iterations = 50; + + // Get localities + hpx::naming::id_type remote_location = + hpx::get_colocation_id(hpx::launch::sync, remote_device.get_id()); + hpx::naming::id_type local_location = + hpx::get_colocation_id(hpx::launch::sync, local_device.get_id()); + if (local_location != hpx::find_here()) + die("Internal ERROR! local_location is not here."); + + // Generate random vector + std::cerr << "Generating test data ..." << std::endl; + test_data = buffer_type(testdata_size); + std::cerr << "Test data generated." << std::endl; + for (std::size_t i = 0; i < testdata_size; i++) { + test_data[i] = static_cast(rand()); + } + + // Run write test + write_test(local_device, false); + write_test(local_device, true); + + // Run read test + read_test(local_device, false); + read_test(local_device, true); + + // Run read test + send_test(local_device, local_device, false); + send_test(local_device, local_device, true); + + // Run wait test + wait_test(local_device); + + if (distributed) { // Run write test - write_test(local_device, false); - write_test(local_device, true); + write_test(remote_device, false); + write_test(remote_device, true); // Run read test - read_test(local_device, false); - read_test(local_device, true); + read_test(remote_device, false); + read_test(remote_device, true); // Run read test - send_test(local_device, local_device, false); - send_test(local_device, local_device, true); + send_test(local_device, remote_device, false); + send_test(local_device, remote_device, true); + send_test(remote_device, local_device, false); + send_test(remote_device, local_device, true); + send_test(remote_device, remote_device, false); + send_test(remote_device, remote_device, true); // Run wait test - wait_test(local_device); - - if(distributed){ - - // Run write test - write_test(remote_device, false); - write_test(remote_device, true); - - // Run read test - read_test(remote_device, false); - read_test(remote_device, true); - - // Run read test - send_test(local_device, remote_device, false); - send_test(local_device, remote_device, true); - send_test(remote_device, local_device, false); - send_test(remote_device, local_device, true); - send_test(remote_device, remote_device, false); - send_test(remote_device, remote_device, true); - - // Run wait test - wait_test(remote_device); - - } - - + wait_test(remote_device); + } } - - - diff --git a/tests/performance/opencl/util/cl_tests.hpp b/tests/performance/opencl/util/cl_tests.hpp index 3a8d3e80..f791bac4 100644 --- a/tests/performance/opencl/util/cl_tests.hpp +++ b/tests/performance/opencl/util/cl_tests.hpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include #include #include @@ -15,9 +14,9 @@ #include "testresults.hpp" -using boost::program_options::variables_map; using boost::program_options::options_description; using boost::program_options::value; +using boost::program_options::variables_map; // the formatter for the test results hpx::opencl::tests::performance::testresults results; @@ -26,190 +25,164 @@ hpx::opencl::tests::performance::testresults results; static std::size_t num_iterations = 0; static std::size_t testdata_size = 0; - - // the main test function static void cl_test(hpx::opencl::device, hpx::opencl::device, bool distributed); -#define die( message ) \ -{ \ - HPX_THROW_EXCEPTION(hpx::no_success, "die()", (message)); \ -} - -#define CREATE_BUFFER(name, data) \ - static const buffer_type name(data, sizeof(data), \ - buffer_type::init_mode::reference) - -#define COMPARE_RESULT_INT( result_data, correct_result ) \ -{ \ - auto lhs = result_data; \ - auto rhs = correct_result; \ - if(lhs.size() != rhs.size()){ \ - die("Result is incorrect! (Sizes don't match)"); \ - } \ - for(std::size_t i = 0; i < lhs.size(); i++){ \ - std::cerr << std::hex << lhs[i] << "-" << rhs[i] << std::endl; \ - if(lhs[i] != rhs[i]){ \ - die("Result is incorrect!"); \ - } \ -} +#define die(message) \ + { HPX_THROW_EXCEPTION(hpx::no_success, "die()", (message)); } + +#define CREATE_BUFFER(name, data) \ + static const buffer_type name(data, sizeof(data), \ + buffer_type::init_mode::reference) + +#define COMPARE_RESULT_INT(result_data, correct_result) \ + { \ + auto lhs = result_data; \ + auto rhs = correct_result; \ + if (lhs.size() != rhs.size()) { \ + die("Result is incorrect! (Sizes don't match)"); \ + } \ + for (std::size_t i = 0; i < lhs.size(); i++) { \ + std::cerr << std::hex << lhs[i] << "-" << rhs[i] << std::endl; \ + if (lhs[i] != rhs[i]) { \ + die("Result is incorrect!"); \ + } \ + } typedef hpx::serialization::serialize_buffer buffer_type; typedef hpx::serialization::serialize_buffer intbuffer_type; -std::string to_string(buffer_type buf){ - std::size_t length = 0; - while(length < buf.size()) - { - if(buf[length] == '\0') break; - length++; - } - return std::string(buf.data(), buf.data() + length); -} - -#define COMPARE_RESULT( result_data, correct_result ) \ -{ \ - auto lhs = result_data; \ - auto rhs = correct_result; \ - if(lhs.size() != rhs.size()){ \ - die("Result is incorrect! (Sizes don't match)"); \ - } \ - std::string correct_string = to_string(rhs); \ - std::string result_string = to_string(lhs); \ - if(correct_string != result_string){ \ - die("Result is incorrect!"); \ - } \ +std::string to_string(buffer_type buf) { + std::size_t length = 0; + while (length < buf.size()) { + if (buf[length] == '\0') break; + length++; + } + return std::string(buf.data(), buf.data() + length); } -static void print_testdevice_info(hpx::opencl::device & cldevice, +#define COMPARE_RESULT(result_data, correct_result) \ + { \ + auto lhs = result_data; \ + auto rhs = correct_result; \ + if (lhs.size() != rhs.size()) { \ + die("Result is incorrect! (Sizes don't match)"); \ + } \ + std::string correct_string = to_string(rhs); \ + std::string result_string = to_string(lhs); \ + if (correct_string != result_string) { \ + die("Result is incorrect!"); \ + } \ + } + +static void print_testdevice_info(hpx::opencl::device& cldevice, std::size_t device_id, - std::size_t num_devices){ - - // Test whether get_device_info works - std::string version = cldevice.get_device_info().get(); - - // Write Info Code - std::cerr << "Device ID: " << device_id << " / " << num_devices - << std::endl; - std::cerr << "Device GID: " << cldevice.get_id() << std::endl; - std::cerr << "Version: " << version << std::endl; - std::cerr << "Name: " << cldevice.get_device_info().get() - << std::endl; - std::cerr << "Vendor: " << cldevice.get_device_info().get() - << std::endl; - std::cerr << "Profile: " << cldevice.get_device_info().get() - << std::endl; + std::size_t num_devices) { + // Test whether get_device_info works + std::string version = cldevice.get_device_info().get(); + + // Write Info Code + std::cerr << "Device ID: " << device_id << " / " << num_devices << std::endl; + std::cerr << "Device GID: " << cldevice.get_id() << std::endl; + std::cerr << "Version: " << version << std::endl; + std::cerr << "Name: " + << cldevice.get_device_info().get() << std::endl; + std::cerr << "Vendor: " + << cldevice.get_device_info().get() << std::endl; + std::cerr << "Profile: " + << cldevice.get_device_info().get() << std::endl; } -static std::vector init(variables_map & vm) -{ - - std::size_t device_id = 0; - - if (vm.count("deviceid")) - device_id = vm["deviceid"].as(); - - // Try to get remote devices - std::vector remote_devices - = hpx::opencl::create_remote_devices( CL_DEVICE_TYPE_ALL, - "OpenCL 1.1" ).get(); - std::vector local_devices - = hpx::opencl::create_local_devices( CL_DEVICE_TYPE_ALL, - "OpenCL 1.1" ).get(); +static std::vector init(variables_map& vm) { + std::size_t device_id = 0; + + if (vm.count("deviceid")) device_id = vm["deviceid"].as(); + + // Try to get remote devices + std::vector remote_devices = + hpx::opencl::create_remote_devices(CL_DEVICE_TYPE_ALL, "OpenCL 1.1") + .get(); + std::vector local_devices = + hpx::opencl::create_local_devices(CL_DEVICE_TYPE_ALL, "OpenCL 1.1").get(); + + if (remote_devices.empty()) { + remote_devices = local_devices; + std::cerr << "WARNING: no remote devices found!" << std::endl; + } + if (local_devices.empty()) die("No local devices found!"); + if (remote_devices.empty()) die("No remote devices found!"); + if (local_devices.size() <= device_id || remote_devices.size() <= device_id) + die("deviceid is out of range!"); + + // Choose device + hpx::opencl::device local_device = local_devices[device_id]; + hpx::opencl::device remote_device = remote_devices[device_id]; + + // Print info + std::cerr << "Local device:" << std::endl; + print_testdevice_info(local_device, device_id, local_devices.size()); + if (local_device.get_id() != remote_device.get_id()) { + std::cerr << "Remote device:" << std::endl; + print_testdevice_info(remote_device, device_id, remote_devices.size()); + } + + // return the devices + std::vector devices; + devices.push_back(local_device); + devices.push_back(remote_device); + return devices; +} - if(remote_devices.empty()){ - remote_devices = local_devices; - std::cerr << "WARNING: no remote devices found!" << std::endl; +int hpx_main(variables_map& vm) { + { + if (vm.count("format")) { + std::string format = vm["format"].as(); + if (format == "json") + results.set_output_json(); + else if (format == "tabbed") + results.set_output_tabbed(); + else + die("Format '" + format + "' not supported!"); + } + if (vm.count("enable")) { + results.set_enabled_tests(vm["enable"].as >()); } - if(local_devices.empty()) die("No local devices found!"); - if(remote_devices.empty()) die("No remote devices found!"); - if(local_devices.size() <= device_id || remote_devices.size() <= device_id) - die("deviceid is out of range!"); - - // Choose device - hpx::opencl::device local_device = local_devices[device_id]; - hpx::opencl::device remote_device = remote_devices[device_id]; - - // Print info - std::cerr << "Local device:" << std::endl; - print_testdevice_info(local_device, device_id, local_devices.size()); - if(local_device.get_id() != remote_device.get_id()) - { - std::cerr << "Remote device:" << std::endl; - print_testdevice_info(remote_device, device_id, remote_devices.size()); + if (vm.count("size")) { + testdata_size = vm["size"].as(); + } + if (vm.count("iterations")) { + num_iterations = vm["iterations"].as(); } - // return the devices - std::vector devices; - devices.push_back(local_device); - devices.push_back(remote_device); - return devices; + auto devices = init(vm); -} + std::cerr << std::endl; + cl_test(devices[0], devices[1], devices[0].get_id() != devices[1].get_id()); + std::cerr << std::endl; -int hpx_main(variables_map & vm) -{ - { - if (vm.count("format")){ - std::string format = vm["format"].as(); - if(format == "json") - results.set_output_json(); - else if (format == "tabbed") - results.set_output_tabbed(); - else - die("Format '" + format + "' not supported!"); - } - if (vm.count("enable")){ - results.set_enabled_tests( vm["enable"] - .as >() ); - } - if (vm.count("size")){ - testdata_size = vm["size"].as(); - } - if (vm.count("iterations")){ - num_iterations = vm["iterations"].as(); - } - - auto devices = init(vm); - - std::cerr << std::endl; - cl_test( devices[0], devices[1], - devices[0].get_id() != devices[1].get_id()); - std::cerr << std::endl; - - std::cout << results; - - std::cerr << std::endl; - } - - hpx::finalize(); - return hpx::util::report_errors(); -} + std::cout << results; + std::cerr << std::endl; + } + hpx::finalize(); + return hpx::util::report_errors(); +} /////////////////////////////////////////////////////////////////////////////// -int main(int argc, char* argv[]) -{ - // Configure application-specific options - options_description cmdline("Usage: " HPX_APPLICATION_STRING " [options]"); - cmdline.add_options() - ( "deviceid" - , value()->default_value(0) - , "the ID of the device we will run our tests on" ) - ( "iterations" - , value()->default_value(0) - , "the number of iterations every test shall get executed" ) - ( "size" - , value()->default_value(0) - , "the size of the test data" ) - ( "format" - , value() - , "Formats the output in a certain way.\nSupports: json, tabbed" ) - ( "enable" - , value >() - , "only enables certain tests" ) - ; - - return hpx::init(cmdline, argc, argv); +int main(int argc, char* argv[]) { + // Configure application-specific options + options_description cmdline("Usage: " HPX_APPLICATION_STRING " [options]"); + cmdline.add_options()("deviceid", value()->default_value(0), + "the ID of the device we will run our tests on")( + "iterations", value()->default_value(0), + "the number of iterations every test shall get executed")( + "size", value()->default_value(0), + "the size of the test data")( + "format", value(), + "Formats the output in a certain way.\nSupports: json, tabbed")( + "enable", value >(), + "only enables certain tests"); + + return hpx::init(cmdline, argc, argv); } diff --git a/tests/performance/opencl/util/testresults.cpp b/tests/performance/opencl/util/testresults.cpp index 5c188b20..13a3fefa 100644 --- a/tests/performance/opencl/util/testresults.cpp +++ b/tests/performance/opencl/util/testresults.cpp @@ -13,411 +13,341 @@ using hpx::opencl::tests::performance::testresults; -void -testresults::set_enabled_tests( std::vector enabled_tests ) -{ - - enabled_tests_set.insert(enabled_tests.begin(), enabled_tests.end()); - +void testresults::set_enabled_tests(std::vector enabled_tests) { + enabled_tests_set.insert(enabled_tests.begin(), enabled_tests.end()); } -void -testresults::set_output_json() -{ - output_format = JSON; -} - -void -testresults::set_output_tabbed() -{ - output_format = TABBED; -} +void testresults::set_output_json() { output_format = JSON; } -void -testresults::start_test( std::string name, - std::string unit, - std::map atts ) -{ +void testresults::set_output_tabbed() { output_format = TABBED; } - std::cerr << "Running '" << name << "' " << std::flush; +void testresults::start_test(std::string name, std::string unit, + std::map atts) { + std::cerr << "Running '" << name << "' " << std::flush; - // If test is allowed to run - if( enabled_tests_set.empty() || - enabled_tests_set.find(name) != enabled_tests_set.end()) { + // If test is allowed to run + if (enabled_tests_set.empty() || + enabled_tests_set.find(name) != enabled_tests_set.end()) { + testseries new_test; - testseries new_test; - - new_test.series_name = name; - new_test.atts = atts; - new_test.unit = unit; - - results.push_back(new_test); + new_test.series_name = name; + new_test.atts = atts; + new_test.unit = unit; - current_test_valid = true; + results.push_back(new_test); - } else { - - std::cerr << "- disabled" << std::endl; - current_test_valid = false; + current_test_valid = true; - } + } else { + std::cerr << "- disabled" << std::endl; + current_test_valid = false; + } } -void -testresults::add( double result ) -{ - std::cerr << "." << std::flush; - results.back().test_entries.push_back(result); - if(results.back().test_entries.size() >= 10){ - std::cerr << std::endl; - } +void testresults::add(double result) { + std::cerr << "." << std::flush; + results.back().test_entries.push_back(result); + if (results.back().test_entries.size() >= 10) { + std::cerr << std::endl; + } } -bool -testresults::needs_more_testing() -{ - if(!current_test_valid) - return false; +bool testresults::needs_more_testing() { + if (!current_test_valid) return false; - if(results.back().test_entries.size() >= 10) - return false; + if (results.back().test_entries.size() >= 10) return false; - return true; + return true; } -static std::size_t -get_column_width( const std::vector > & matrix, - std::size_t column ) -{ - std::size_t max_width = 0; - - for(const auto& row : matrix){ - - if(row.size() <= column) - continue; - - if(row[column].length() > max_width) - max_width = row[column].length(); +static std::size_t get_column_width( + const std::vector >& matrix, std::size_t column) { + std::size_t max_width = 0; - } + for (const auto& row : matrix) { + if (row.size() <= column) continue; - return max_width; + if (row[column].length() > max_width) max_width = row[column].length(); + } + return max_width; } -static std::string -double_to_str(double num) -{ - std::stringstream str; - str << num; - return str.str(); +static std::string double_to_str(double num) { + std::stringstream str; + str << num; + return str.str(); } -void -testresults::print_default( std::ostream& os ) const -{ - std::vector > output; - - std::vector headline; - headline.push_back("test"); - headline.push_back("atts"); - headline.push_back("units"); - headline.push_back("median"); - headline.push_back("mean"); - headline.push_back("stddev"); - headline.push_back("min"); - headline.push_back("max"); - headline.push_back("trial0"); - headline.push_back("trial1"); - headline.push_back("trial2"); - headline.push_back("trial3"); - headline.push_back("trial4"); - headline.push_back("trial5"); - headline.push_back("trial6"); - headline.push_back("trial7"); - headline.push_back("trial8"); - headline.push_back("trial9"); - output.push_back(headline); - - // fill with results - for(const auto& row : results){ - - if(row.test_entries.size() != 10) - { - os << "ERROR! test_entries.size() != 10!" << std::endl; - break; - } - - std::vector line; - - line.push_back(row.series_name); - line.push_back(row.get_atts()); - line.push_back(row.unit); - line.push_back(double_to_str(row.get_median())); - line.push_back(double_to_str(row.get_mean())); - line.push_back(double_to_str(row.get_stddev())); - line.push_back(double_to_str(row.get_min())); - line.push_back(double_to_str(row.get_max())); - - for(const auto& res : row.test_entries){ - line.push_back(double_to_str(res)); - } - - output.push_back(line); - } - - // compute widths of columns - std::vector column_widths; - for(std::size_t col = 0; col < headline.size(); col++){ - column_widths.push_back(get_column_width(output,col)); +void testresults::print_default(std::ostream& os) const { + std::vector > output; + + std::vector headline; + headline.push_back("test"); + headline.push_back("atts"); + headline.push_back("units"); + headline.push_back("median"); + headline.push_back("mean"); + headline.push_back("stddev"); + headline.push_back("min"); + headline.push_back("max"); + headline.push_back("trial0"); + headline.push_back("trial1"); + headline.push_back("trial2"); + headline.push_back("trial3"); + headline.push_back("trial4"); + headline.push_back("trial5"); + headline.push_back("trial6"); + headline.push_back("trial7"); + headline.push_back("trial8"); + headline.push_back("trial9"); + output.push_back(headline); + + // fill with results + for (const auto& row : results) { + if (row.test_entries.size() != 10) { + os << "ERROR! test_entries.size() != 10!" << std::endl; + break; } - // print - for(const auto& row : output){ - for(std::size_t i = 0; i < row.size() && i < column_widths.size(); i++){ - if(i != 0) - os << " "; + std::vector line; - os << row[i]; + line.push_back(row.series_name); + line.push_back(row.get_atts()); + line.push_back(row.unit); + line.push_back(double_to_str(row.get_median())); + line.push_back(double_to_str(row.get_mean())); + line.push_back(double_to_str(row.get_stddev())); + line.push_back(double_to_str(row.get_min())); + line.push_back(double_to_str(row.get_max())); - for(std::size_t j = row[i].length(); j < column_widths[i]; j++){ - os << " "; - } - } - os << std::endl; + for (const auto& res : row.test_entries) { + line.push_back(double_to_str(res)); } -} + output.push_back(line); + } -void -testresults::print_tabbed( std::ostream& os ) const -{ - // print headline - os << "test\tatts\tunits\tmedian\tmean\tstddev\tmin\tmax\ttrial0\ttrial1\t" - << "trial2\ttrial3\ttrial4\ttrial5\ttrial6\ttrial7\ttrial8\ttrial9" - << std::endl; + // compute widths of columns + std::vector column_widths; + for (std::size_t col = 0; col < headline.size(); col++) { + column_widths.push_back(get_column_width(output, col)); + } - for(const auto& row : results){ + // print + for (const auto& row : output) { + for (std::size_t i = 0; i < row.size() && i < column_widths.size(); i++) { + if (i != 0) os << " "; - if(row.test_entries.size() != 10) - { - os << "ERROR! test_entries.size() != 10!" << std::endl; - break; - } + os << row[i]; - os << row.series_name << "\t"; + for (std::size_t j = row[i].length(); j < column_widths[i]; j++) { + os << " "; + } + } + os << std::endl; + } +} - os << row.get_atts() << "\t"; +void testresults::print_tabbed(std::ostream& os) const { + // print headline + os << "test\tatts\tunits\tmedian\tmean\tstddev\tmin\tmax\ttrial0\ttrial1\t" + << "trial2\ttrial3\ttrial4\ttrial5\ttrial6\ttrial7\ttrial8\ttrial9" + << std::endl; - os << row.unit << "\t"; + for (const auto& row : results) { + if (row.test_entries.size() != 10) { + os << "ERROR! test_entries.size() != 10!" << std::endl; + break; + } - os << row.get_median() << "\t"; - os << row.get_mean() << "\t"; - os << row.get_stddev() << "\t"; - os << row.get_min() << "\t"; - os << row.get_max() << "\t"; + os << row.series_name << "\t"; - bool is_first = true; - for(const auto& res : row.test_entries){ - if(is_first){ - is_first = false; - } else { - os << "\t"; - } - os << res; - } + os << row.get_atts() << "\t"; - os << std::endl; + os << row.unit << "\t"; + + os << row.get_median() << "\t"; + os << row.get_mean() << "\t"; + os << row.get_stddev() << "\t"; + os << row.get_min() << "\t"; + os << row.get_max() << "\t"; + bool is_first = true; + for (const auto& res : row.test_entries) { + if (is_first) { + is_first = false; + } else { + os << "\t"; + } + os << res; } + os << std::endl; + } } -void -testresults::print_json( std::ostream& os ) const -{ - os << "{" << std::endl; - os << " \"tests\": {" << std::endl; +void testresults::print_json(std::ostream& os) const { + os << "{" << std::endl; + os << " \"tests\": {" << std::endl; - for(std::size_t i = 0; i < results.size(); i++){ - const auto& row = results[i]; - os << " \"" << row.series_name << "\": {" << std::endl; + for (std::size_t i = 0; i < results.size(); i++) { + const auto& row = results[i]; + os << " \"" << row.series_name << "\": {" << std::endl; - HPX_ASSERT(row.test_entries.size() == 10); + HPX_ASSERT(row.test_entries.size() == 10); - // atts - os << " \"atts\": {" << std::endl; - for( auto it = row.atts.begin(); it != row.atts.end(); ){ - os << " \"" << it->first << "\": \"" << it->second << "\""; + // atts + os << " \"atts\": {" << std::endl; + for (auto it = row.atts.begin(); it != row.atts.end();) { + os << " \"" << it->first << "\": \"" << it->second << "\""; - it++; + it++; - if(it != row.atts.end()) - os << ","; + if (it != row.atts.end()) os << ","; - os << std::endl; - } - os << " }," << std::endl; - - // unit - os << " \"unit\": \"" << row.unit << "\"," << std::endl; + os << std::endl; + } + os << " }," << std::endl; - // stats - os << " \"median\": " << row.get_median() << "," << std::endl; - os << " \"mean\": " << row.get_mean() << "," << std::endl; - os << " \"stddev\": " << row.get_stddev() << "," << std::endl; - os << " \"min\": " << row.get_min() << "," << std::endl; - os << " \"max\": " << row.get_max() << "," << std::endl; + // unit + os << " \"unit\": \"" << row.unit << "\"," << std::endl; - // trials - os << " \"trials\": [" << std::endl; - for(std::size_t j = 0; j < row.test_entries.size(); j++){ - const auto& res = row.test_entries[j]; + // stats + os << " \"median\": " << row.get_median() << "," << std::endl; + os << " \"mean\": " << row.get_mean() << "," << std::endl; + os << " \"stddev\": " << row.get_stddev() << "," << std::endl; + os << " \"min\": " << row.get_min() << "," << std::endl; + os << " \"max\": " << row.get_max() << "," << std::endl; - os << " " << res; + // trials + os << " \"trials\": [" << std::endl; + for (std::size_t j = 0; j < row.test_entries.size(); j++) { + const auto& res = row.test_entries[j]; - if(j < row.test_entries.size() - 1) - os << "," << std::endl; - else - os << std::endl; - } - os << " ]" << std::endl; + os << " " << res; - if(i < results.size() - 1) - os << " }," << std::endl; - else - os << " }" << std::endl; + if (j < row.test_entries.size() - 1) + os << "," << std::endl; + else + os << std::endl; } + os << " ]" << std::endl; - os << " }" << std::endl; - os << "}" << std::endl; -} + if (i < results.size() - 1) + os << " }," << std::endl; + else + os << " }" << std::endl; + } -std::ostream& -hpx::opencl::tests::performance::operator<<(std::ostream& os, const testresults& result) -{ - - switch(result.output_format){ - case testresults::DEFAULT: result.print_default(os); break; - case testresults::TABBED: result.print_tabbed(os); break; - case testresults::JSON: result.print_json(os); break; - default: - HPX_ASSERT(false); - break; - } + os << " }" << std::endl; + os << "}" << std::endl; +} - return os; +std::ostream& hpx::opencl::tests::performance::operator<<( + std::ostream& os, const testresults& result) { + switch (result.output_format) { + case testresults::DEFAULT: + result.print_default(os); + break; + case testresults::TABBED: + result.print_tabbed(os); + break; + case testresults::JSON: + result.print_json(os); + break; + default: + HPX_ASSERT(false); + break; + } + + return os; } -double -testresults::testseries::get_min() const -{ - double min = test_entries[0]; - - for(const double& val : test_entries){ - if(val < min) - min = val; - } +double testresults::testseries::get_min() const { + double min = test_entries[0]; + + for (const double& val : test_entries) { + if (val < min) min = val; + } - return min; + return min; } -double -testresults::testseries::get_max() const -{ - double max = test_entries[0]; - - for(const double& val : test_entries){ - if(val > max) - max = val; - } +double testresults::testseries::get_max() const { + double max = test_entries[0]; + + for (const double& val : test_entries) { + if (val > max) max = val; + } - return max; + return max; } -double -testresults::testseries::get_stddev() const -{ - double mean = get_mean(); +double testresults::testseries::get_stddev() const { + double mean = get_mean(); - double sum = 0; + double sum = 0; - for(const double& val : test_entries){ - double diff = val - mean; - sum += (diff*diff); - } + for (const double& val : test_entries) { + double diff = val - mean; + sum += (diff * diff); + } - return sum/test_entries.size(); + return sum / test_entries.size(); } -double -testresults::testseries::get_mean() const -{ +double testresults::testseries::get_mean() const { + double sum = 0; - double sum = 0; - - for(const double& val : test_entries){ - sum += val; - } + for (const double& val : test_entries) { + sum += val; + } - return sum/test_entries.size(); + return sum / test_entries.size(); } -double -testresults::testseries::get_median() const -{ - - double median; - std::size_t size = test_entries.size(); +double testresults::testseries::get_median() const { + double median; + std::size_t size = test_entries.size(); - if(size == 0) - return 0; + if (size == 0) return 0; - if(size == 1) - return test_entries[0]; + if (size == 1) return test_entries[0]; - std::vector sorted_entries(test_entries); - std::sort(sorted_entries.begin(), sorted_entries.end()); + std::vector sorted_entries(test_entries); + std::sort(sorted_entries.begin(), sorted_entries.end()); - if (size % 2 == 0) - { - median = (sorted_entries[size / 2 - 1] + sorted_entries[size / 2]) / 2; - } - else - { - median = sorted_entries[size / 2]; - } + if (size % 2 == 0) { + median = (sorted_entries[size / 2 - 1] + sorted_entries[size / 2]) / 2; + } else { + median = sorted_entries[size / 2]; + } - return median; + return median; } -std::string -testresults::testseries::get_atts() const -{ +std::string testresults::testseries::get_atts() const { + if (atts.empty()) return "-"; - if(atts.empty()) - return "-"; + // get all keys + std::vector keys; + for (const auto& it : atts) { + keys.push_back(it.first); + } - // get all keys - std::vector keys; - for( const auto& it : atts ){ - keys.push_back(it.first); - } - - // sort keys - std::sort(keys.begin(), keys.end()); + // sort keys + std::sort(keys.begin(), keys.end()); - // print values - std::string result = ""; - for( const auto& key : keys ){ - if(result.size() > 0) - result += " "; - - result += key; - result += "="; - result += atts.at(key); - } + // print values + std::string result = ""; + for (const auto& key : keys) { + if (result.size() > 0) result += " "; - return result; + result += key; + result += "="; + result += atts.at(key); + } + return result; } diff --git a/tests/performance/opencl/util/testresults.hpp b/tests/performance/opencl/util/testresults.hpp index c5d0de02..e798d1ce 100644 --- a/tests/performance/opencl/util/testresults.hpp +++ b/tests/performance/opencl/util/testresults.hpp @@ -9,67 +9,67 @@ #include #include - #ifndef HPX_OPENCL_TESTS_PERFORMANCE_TESTRESULTS_HPP_ #define HPX_OPENCL_TESTS_PERFORMANCE_TESTRESULTS_HPP_ -namespace hpx{ namespace opencl{ namespace tests{ namespace performance{ +namespace hpx { +namespace opencl { +namespace tests { +namespace performance { - class testresults - { - private: - class testseries{ - public: - std::vector test_entries; - std::string series_name; - std::map atts; - std::string unit; - double get_median() const; - double get_mean() const; - double get_stddev() const; - double get_min() const; - double get_max() const; - std::string get_atts() const; - }; +class testresults { + private: + class testseries { + public: + std::vector test_entries; + std::string series_name; + std::map atts; + std::string unit; + double get_median() const; + double get_mean() const; + double get_stddev() const; + double get_min() const; + double get_max() const; + std::string get_atts() const; + }; - std::vector results; + std::vector results; - public: - void set_enabled_tests( std::vector enabled_tests ); + public: + void set_enabled_tests(std::vector enabled_tests); - void set_output_json(); - void set_output_tabbed(); + void set_output_json(); + void set_output_tabbed(); - void start_test( std::string name, - std::string unit, - std::map atts - = std::map() ); + void start_test(std::string name, std::string unit, + std::map atts = + std::map()); - void add( double result ); + void add(double result); - bool needs_more_testing(); + bool needs_more_testing(); - private: - void print_default( std::ostream& os ) const; - void print_tabbed( std::ostream& os ) const; - void print_json( std::ostream& os ) const; + private: + void print_default(std::ostream& os) const; + void print_tabbed(std::ostream& os) const; + void print_json(std::ostream& os) const; - friend std::ostream& operator<<( std::ostream& os, - const testresults& result ); + friend std::ostream& operator<<(std::ostream& os, const testresults& result); - bool current_test_valid = false; + bool current_test_valid = false; - // Settings - std::set enabled_tests_set; + // Settings + std::set enabled_tests_set; - enum output_formats { DEFAULT, TABBED, JSON }; - output_formats output_format = DEFAULT; + enum output_formats { DEFAULT, TABBED, JSON }; + output_formats output_format = DEFAULT; +}; - }; +std::ostream& operator<<(std::ostream& os, const testresults& result); - std::ostream& operator<<(std::ostream& os, const testresults& result); +} // namespace performance +} // namespace tests +} // namespace opencl +} // namespace hpx -}}}} - - #endif diff --git a/tests/unit/opencl/buffer_read_write.cpp b/tests/unit/opencl/buffer_read_write.cpp index 029f48b5..3396c63e 100644 --- a/tests/unit/opencl/buffer_read_write.cpp +++ b/tests/unit/opencl/buffer_read_write.cpp @@ -3,19 +3,14 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "cl_tests.hpp" - /* * This test is meant to verify the buffer read and buffer write functionality. */ - - #define DATASIZE (sizeof("Hello World!")) - CREATE_BUFFER(initdata, "Hello World!"); CREATE_BUFFER(refdata1, "Help, World!"); CREATE_BUFFER(refdata2, "World"); @@ -23,140 +18,126 @@ CREATE_BUFFER(refdata3, "Hello Wolp,!"); CREATE_BUFFER(refdata4, "HDEFGjihgld!"); CREATE_BUFFER(refdata5, "Helello rld!"); - static const buffer_type modifydata("p,", 2, buffer_type::init_mode::reference); static const uint32_t intarr[] = {0x47464544, 0x6768696a}; static const intbuffer_type modifydata2(intarr, 2, intbuffer_type::init_mode::reference); - -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device remote_device ) -{ - - hpx::opencl::buffer buffer = - remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - hpx::opencl::buffer buffer2 = - remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - hpx::opencl::buffer remote_buffer = - local_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - // test if buffer initialization worked - size_t buffer_size = buffer.size().get(); - HPX_TEST_EQ(buffer_size, DATASIZE); - - // test if buffer can be written to - { - auto data_write_future = buffer.enqueue_write(0, initdata); - data_write_future.get(); - } - - // test when_all - { - auto future1 = buffer.enqueue_write(0, initdata); - auto future2 = buffer2.enqueue_write(0, initdata); - - std::vector > futures; - futures.push_back(std::move(future1)); - futures.push_back(std::move(future2)); - - hpx::when_all(futures).get(); - } - - // test local continuation - { - auto data_write_future = buffer.enqueue_write(0, initdata); - auto future2 = data_write_future.then( - [](hpx::future fut){ - return true; - } - ); - HPX_TEST(future2.get()); - } - - // test read - { - auto data_read_future = buffer.enqueue_read(0, DATASIZE); - - COMPARE_RESULT(data_read_future.get(), initdata); - } - - // test remote continuation - { - auto data_write_future = buffer.enqueue_write(3, modifydata); - auto data_read_future = buffer.enqueue_read(0, DATASIZE, - data_write_future); - - COMPARE_RESULT(data_read_future.get(), refdata1); - } - - // test read continuation and non-char buffer writes and offsets - { - auto data_read_future1 = buffer.enqueue_read(3, 2); - auto data_write_future = buffer.enqueue_write(1, modifydata2, - data_read_future1); - auto data_read_future2 = buffer.enqueue_read(0, DATASIZE, - data_write_future); - - COMPARE_RESULT(data_read_future1.get(), modifydata); - COMPARE_RESULT(data_read_future2.get(), refdata4); - } - - // test read to buffer - { - intbuffer_type readbuffer( 2 ); - - auto data_read_future = buffer.enqueue_read(1, readbuffer); - - auto result_buffer = data_read_future.get(); - - HPX_TEST(readbuffer.data() == result_buffer.data()); - HPX_TEST(readbuffer.size() == result_buffer.size()); - - COMPARE_RESULT_INT(result_buffer, modifydata2); - } - - // test buffer-buffer copy local - { - // write to both src and dst buffer - auto data_write_future = buffer.enqueue_write(0, initdata); - auto data_write_future2 = buffer2.enqueue_write(0, initdata); - - // send src to dst buffer with offset - auto futures = buffer.enqueue_send( buffer2, 1, 3, 5, - data_write_future, - data_write_future2); - - // read the src and dst buffer - auto src_data = buffer.enqueue_read(0, DATASIZE, futures.src_future); - auto dst_data = buffer2.enqueue_read(0, DATASIZE, futures.dst_future); - - COMPARE_RESULT(src_data.get(), initdata); - COMPARE_RESULT(dst_data.get(), refdata5); - } - - // test buffer-buffer copy remote - { - // write to both src and dst buffer - auto data_write_future = buffer.enqueue_write(0, initdata); - auto data_write_future2 = remote_buffer.enqueue_write(0, initdata); - - // send src to dst buffer with offset - auto futures = buffer.enqueue_send( remote_buffer, 1, 3, 5, - data_write_future, - data_write_future2); - - // read the src and dst buffer - auto src_data = buffer.enqueue_read(0, DATASIZE, futures.src_future); - auto dst_data = remote_buffer.enqueue_read(0, DATASIZE, futures.dst_future); - - COMPARE_RESULT(src_data.get(), initdata); - COMPARE_RESULT(dst_data.get(), refdata5); - } - - +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device remote_device) { + hpx::opencl::buffer buffer = + remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + hpx::opencl::buffer buffer2 = + remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + hpx::opencl::buffer remote_buffer = + local_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + // test if buffer initialization worked + size_t buffer_size = buffer.size().get(); + HPX_TEST_EQ(buffer_size, DATASIZE); + + // test if buffer can be written to + { + auto data_write_future = buffer.enqueue_write(0, initdata); + data_write_future.get(); + } + + // test when_all + { + auto future1 = buffer.enqueue_write(0, initdata); + auto future2 = buffer2.enqueue_write(0, initdata); + + std::vector > futures; + futures.push_back(std::move(future1)); + futures.push_back(std::move(future2)); + + hpx::when_all(futures).get(); + } + + // test local continuation + { + auto data_write_future = buffer.enqueue_write(0, initdata); + auto future2 = + data_write_future.then([](hpx::future fut) { return true; }); + HPX_TEST(future2.get()); + } + + // test read + { + auto data_read_future = buffer.enqueue_read(0, DATASIZE); + + COMPARE_RESULT(data_read_future.get(), initdata); + } + + // test remote continuation + { + auto data_write_future = buffer.enqueue_write(3, modifydata); + auto data_read_future = buffer.enqueue_read(0, DATASIZE, data_write_future); + + COMPARE_RESULT(data_read_future.get(), refdata1); + } + + // test read continuation and non-char buffer writes and offsets + { + auto data_read_future1 = buffer.enqueue_read(3, 2); + auto data_write_future = + buffer.enqueue_write(1, modifydata2, data_read_future1); + auto data_read_future2 = + buffer.enqueue_read(0, DATASIZE, data_write_future); + + COMPARE_RESULT(data_read_future1.get(), modifydata); + COMPARE_RESULT(data_read_future2.get(), refdata4); + } + + // test read to buffer + { + intbuffer_type readbuffer(2); + + auto data_read_future = buffer.enqueue_read(1, readbuffer); + + auto result_buffer = data_read_future.get(); + + HPX_TEST(readbuffer.data() == result_buffer.data()); + HPX_TEST(readbuffer.size() == result_buffer.size()); + + COMPARE_RESULT_INT(result_buffer, modifydata2); + } + + // test buffer-buffer copy local + { + // write to both src and dst buffer + auto data_write_future = buffer.enqueue_write(0, initdata); + auto data_write_future2 = buffer2.enqueue_write(0, initdata); + + // send src to dst buffer with offset + auto futures = buffer.enqueue_send(buffer2, 1, 3, 5, data_write_future, + data_write_future2); + + // read the src and dst buffer + auto src_data = buffer.enqueue_read(0, DATASIZE, futures.src_future); + auto dst_data = buffer2.enqueue_read(0, DATASIZE, futures.dst_future); + + COMPARE_RESULT(src_data.get(), initdata); + COMPARE_RESULT(dst_data.get(), refdata5); + } + + // test buffer-buffer copy remote + { + // write to both src and dst buffer + auto data_write_future = buffer.enqueue_write(0, initdata); + auto data_write_future2 = remote_buffer.enqueue_write(0, initdata); + + // send src to dst buffer with offset + auto futures = buffer.enqueue_send(remote_buffer, 1, 3, 5, + data_write_future, data_write_future2); + + // read the src and dst buffer + auto src_data = buffer.enqueue_read(0, DATASIZE, futures.src_future); + auto dst_data = remote_buffer.enqueue_read(0, DATASIZE, futures.dst_future); + + COMPARE_RESULT(src_data.get(), initdata); + COMPARE_RESULT(dst_data.get(), refdata5); + } } - - diff --git a/tests/unit/opencl/buffer_rect_read.cpp b/tests/unit/opencl/buffer_rect_read.cpp index e1ca1490..113e9e03 100644 --- a/tests/unit/opencl/buffer_rect_read.cpp +++ b/tests/unit/opencl/buffer_rect_read.cpp @@ -3,58 +3,131 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "cl_tests.hpp" - /* * This test is meant to verify the buffer rect write functionality. */ - // 4x4x4 cube of data -#define INITDATA "0123" "4567" "8902" "4680" \ - "1357" "9152" "6374" "8597" \ - "9876" "5432" "1045" "3627" \ - "1894" "1928" "3465" "8709" - -#define TARGETDATA "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" - -#define REFDATA01 "0123" "4567" "8902" "4680" \ - "1357" "9152" "6374" "8597" \ - "9876" "5432" "1045" "3627" \ - "1894" "1928" "3465" "8709" -hpx::opencl::rect_props props01(0,0,0, 0,0,0, 4,4,4, 4,16, 4,16); - -#define REFDATA02 "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" \ - "1111" "4567" "1111" "1111" \ - "1111" "1111" "1111" "1111" -hpx::opencl::rect_props props02(0,1,0, 0,1,2, 4,1,1, 4,16, 4,16); - -#define REFDATA03 "1111" "1111" "1111" "1111" \ - "1112" "1114" "1116" "1118" \ - "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" -hpx::opencl::rect_props props03(2,0,0, 3,0,1, 1,4,1, 2,0, 4,16); - -#define REFDATA04 "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" \ - "1111" "1111" "1101" "1123" \ - "1111" "1111" "1145" "1167" -hpx::opencl::rect_props props04(0,0,0, 2,2,2, 2,2,2, 2,4, 4,16); +#define INITDATA \ + "0123" \ + "4567" \ + "8902" \ + "4680" \ + "1357" \ + "9152" \ + "6374" \ + "8597" \ + "9876" \ + "5432" \ + "1045" \ + "3627" \ + "1894" \ + "1928" \ + "3465" \ + "8709" + +#define TARGETDATA \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" + +#define REFDATA01 \ + "0123" \ + "4567" \ + "8902" \ + "4680" \ + "1357" \ + "9152" \ + "6374" \ + "8597" \ + "9876" \ + "5432" \ + "1045" \ + "3627" \ + "1894" \ + "1928" \ + "3465" \ + "8709" +hpx::opencl::rect_props props01(0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 16, 4, 16); + +#define REFDATA02 \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "4567" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" +hpx::opencl::rect_props props02(0, 1, 0, 0, 1, 2, 4, 1, 1, 4, 16, 4, 16); + +#define REFDATA03 \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1112" \ + "1114" \ + "1116" \ + "1118" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" +hpx::opencl::rect_props props03(2, 0, 0, 3, 0, 1, 1, 4, 1, 2, 0, 4, 16); + +#define REFDATA04 \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1101" \ + "1123" \ + "1111" \ + "1111" \ + "1145" \ + "1167" +hpx::opencl::rect_props props04(0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 4, 4, 16); static const uint16_t refdata05[] = {0x4141, 0x3233, 0x4141, 0x3832, 0x4141, 0x4141, 0x4141, 0x4141}; -hpx::opencl::rect_props props05(1,1,2, 0,1,0, 1,1,2, 2,8, 1,2); - +hpx::opencl::rect_props props05(1, 1, 2, 0, 1, 0, 1, 1, 2, 2, 8, 1, 2); #define DATASIZE (sizeof(INITDATA)) - CREATE_BUFFER(initdata, INITDATA); CREATE_BUFFER(targetdata, TARGETDATA); CREATE_BUFFER(refdata01, REFDATA01); @@ -65,50 +138,45 @@ CREATE_BUFFER(refdata04, REFDATA04); static const uint16_t INT16TARGETDATA[] = {0x4141, 0x4141, 0x4141, 0x4141, 0x4141, 0x4141, 0x4141, 0x4141}; -#define test_read(props, ref) { \ - buffer.enqueue_write(0, initdata).get(); \ - buffer_type target_buffer( TARGETDATA, sizeof(TARGETDATA), \ - buffer_type::init_mode::copy ); \ - auto data_read_future = buffer.enqueue_read_rect(props, target_buffer); \ - COMPARE_RESULT(data_read_future.get(), ref); \ -} - - -#define test_read_int16(props, ref) { \ - buffer.enqueue_write(0, initdata).get(); \ - int16buffer_type target_buffer( INT16TARGETDATA, 8, \ - int16buffer_type::init_mode::copy ); \ - auto data_read_future = buffer.enqueue_read_rect(props, target_buffer); \ - auto result_buffer = data_read_future.get(); \ - HPX_TEST_EQ(result_buffer.size(), target_buffer.size()); \ - for(std::size_t i = 0; i < result_buffer.size(); i++){ \ - HPX_TEST_EQ(result_buffer[i], target_buffer[i]); \ - HPX_TEST_EQ(result_buffer[i], ref[i]); \ - } \ -} - - -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device remote_device ) -{ - - hpx::opencl::buffer buffer = - remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - hpx::opencl::buffer remote_buffer = - local_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - // test if buffer initialization worked - size_t buffer_size = buffer.size().get(); - HPX_TEST_EQ(buffer_size, DATASIZE); - - // test if buffer can be read from to - test_read(props01, refdata01); - test_read(props02, refdata02); - test_read(props03, refdata03); - test_read(props04, refdata04); - test_read_int16(props05, refdata05); - +#define test_read(props, ref) \ + { \ + buffer.enqueue_write(0, initdata).get(); \ + buffer_type target_buffer(TARGETDATA, sizeof(TARGETDATA), \ + buffer_type::init_mode::copy); \ + auto data_read_future = buffer.enqueue_read_rect(props, target_buffer); \ + COMPARE_RESULT(data_read_future.get(), ref); \ + } + +#define test_read_int16(props, ref) \ + { \ + buffer.enqueue_write(0, initdata).get(); \ + int16buffer_type target_buffer(INT16TARGETDATA, 8, \ + int16buffer_type::init_mode::copy); \ + auto data_read_future = buffer.enqueue_read_rect(props, target_buffer); \ + auto result_buffer = data_read_future.get(); \ + HPX_TEST_EQ(result_buffer.size(), target_buffer.size()); \ + for (std::size_t i = 0; i < result_buffer.size(); i++) { \ + HPX_TEST_EQ(result_buffer[i], target_buffer[i]); \ + HPX_TEST_EQ(result_buffer[i], ref[i]); \ + } \ + } + +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device remote_device) { + hpx::opencl::buffer buffer = + remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + hpx::opencl::buffer remote_buffer = + local_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + // test if buffer initialization worked + size_t buffer_size = buffer.size().get(); + HPX_TEST_EQ(buffer_size, DATASIZE); + + // test if buffer can be read from to + test_read(props01, refdata01); + test_read(props02, refdata02); + test_read(props03, refdata03); + test_read(props04, refdata04); + test_read_int16(props05, refdata05); } - - diff --git a/tests/unit/opencl/buffer_rect_send.cpp b/tests/unit/opencl/buffer_rect_send.cpp index 89ef0316..c888d3ad 100644 --- a/tests/unit/opencl/buffer_rect_send.cpp +++ b/tests/unit/opencl/buffer_rect_send.cpp @@ -3,53 +3,127 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "cl_tests.hpp" - /* * This test is meant to verify the buffer rect write functionality. */ - // 4x4x4 cube of data -#define INITDATA "0123" "4567" "8902" "4680" \ - "1357" "9152" "6374" "8597" \ - "9876" "5432" "1045" "3627" \ - "1894" "1928" "3465" "8709" - -#define TARGETDATA "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" - -#define REFDATA01 "0123" "4567" "8902" "4680" \ - "1357" "9152" "6374" "8597" \ - "9876" "5432" "1045" "3627" \ - "1894" "1928" "3465" "8709" -hpx::opencl::rect_props props01(0,0,0, 0,0,0, 4,4,4, 4,16, 4,16); - -#define REFDATA02 "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" \ - "1111" "4567" "1111" "1111" \ - "1111" "1111" "1111" "1111" -hpx::opencl::rect_props props02(0,1,0, 0,1,2, 4,1,1, 4,16, 4,16); - -#define REFDATA03 "1111" "1111" "1111" "1111" \ - "1112" "1114" "1116" "1118" \ - "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" -hpx::opencl::rect_props props03(2,0,0, 3,0,1, 1,4,1, 2,0, 4,16); - -#define REFDATA04 "1111" "1111" "1111" "1111" \ - "1111" "1111" "1111" "1111" \ - "1111" "1111" "1101" "1123" \ - "1111" "1111" "1145" "1167" -hpx::opencl::rect_props props04(0,0,0, 2,2,2, 2,2,2, 2,4, 4,16); +#define INITDATA \ + "0123" \ + "4567" \ + "8902" \ + "4680" \ + "1357" \ + "9152" \ + "6374" \ + "8597" \ + "9876" \ + "5432" \ + "1045" \ + "3627" \ + "1894" \ + "1928" \ + "3465" \ + "8709" + +#define TARGETDATA \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" + +#define REFDATA01 \ + "0123" \ + "4567" \ + "8902" \ + "4680" \ + "1357" \ + "9152" \ + "6374" \ + "8597" \ + "9876" \ + "5432" \ + "1045" \ + "3627" \ + "1894" \ + "1928" \ + "3465" \ + "8709" +hpx::opencl::rect_props props01(0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 16, 4, 16); + +#define REFDATA02 \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "4567" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" +hpx::opencl::rect_props props02(0, 1, 0, 0, 1, 2, 4, 1, 1, 4, 16, 4, 16); + +#define REFDATA03 \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1112" \ + "1114" \ + "1116" \ + "1118" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" +hpx::opencl::rect_props props03(2, 0, 0, 3, 0, 1, 1, 4, 1, 2, 0, 4, 16); + +#define REFDATA04 \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1111" \ + "1101" \ + "1123" \ + "1111" \ + "1111" \ + "1145" \ + "1167" +hpx::opencl::rect_props props04(0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 4, 4, 16); #define DATASIZE (sizeof(INITDATA)) - CREATE_BUFFER(initdata, INITDATA); CREATE_BUFFER(targetdata, TARGETDATA); CREATE_BUFFER(refdata01, REFDATA01); @@ -57,36 +131,31 @@ CREATE_BUFFER(refdata02, REFDATA02); CREATE_BUFFER(refdata03, REFDATA03); CREATE_BUFFER(refdata04, REFDATA04); -#define test_send(props, ref) { \ - buffer.enqueue_write(0, initdata).get(); \ - remote_buffer.enqueue_write(0, targetdata).get(); \ - auto data_send_future = buffer.enqueue_send_rect(remote_buffer, props); \ - auto data_read_future = remote_buffer.enqueue_read( 0, DATASIZE, \ - data_send_future.dst_future );\ - COMPARE_RESULT(data_read_future.get(), ref); \ +#define test_send(props, ref) \ + { \ + buffer.enqueue_write(0, initdata).get(); \ + remote_buffer.enqueue_write(0, targetdata).get(); \ + auto data_send_future = buffer.enqueue_send_rect(remote_buffer, props); \ + auto data_read_future = \ + remote_buffer.enqueue_read(0, DATASIZE, data_send_future.dst_future); \ + COMPARE_RESULT(data_read_future.get(), ref); \ + } + +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device remote_device) { + hpx::opencl::buffer buffer = + remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + hpx::opencl::buffer remote_buffer = + local_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + // test if buffer initialization worked + size_t buffer_size = buffer.size().get(); + HPX_TEST_EQ(buffer_size, DATASIZE); + + // test if buffer can be read from to + test_send(props01, refdata01); + test_send(props02, refdata02); + test_send(props03, refdata03); + test_send(props04, refdata04); } - - -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device remote_device ) -{ - - hpx::opencl::buffer buffer = - remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - hpx::opencl::buffer remote_buffer = - local_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - // test if buffer initialization worked - size_t buffer_size = buffer.size().get(); - HPX_TEST_EQ(buffer_size, DATASIZE); - - // test if buffer can be read from to - test_send(props01, refdata01); - test_send(props02, refdata02); - test_send(props03, refdata03); - test_send(props04, refdata04); - -} - - diff --git a/tests/unit/opencl/buffer_rect_write.cpp b/tests/unit/opencl/buffer_rect_write.cpp index 4eec9792..56fae52f 100644 --- a/tests/unit/opencl/buffer_rect_write.cpp +++ b/tests/unit/opencl/buffer_rect_write.cpp @@ -3,55 +3,128 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "cl_tests.hpp" - /* * This test is meant to verify the buffer rect write functionality. */ - // 3x3x3 cube of "0" -#define INITDATA "0000" "0000" "0000" "0000" \ - "0000" "0000" "0000" "0000" \ - "0000" "0000" "0000" "0000" \ - "0000" "0000" "0000" "0000" - -#define REFDATA01 "0000" "0000" "0000" "0000" \ - "0000" "0200" "0000" "0000" \ - "0000" "0000" "0000" "0000" \ - "0000" "0000" "0000" "0000" -hpx::opencl::rect_props props01(1,0,0, 1,1,1, 1,1,1, 0,0, 4,16); - -#define REFDATA02 "0000" "0000" "0000" "0000" \ - "0000" "0000" "0000" "0000" \ - "0000" "5678" "0000" "0000" \ - "0000" "0000" "0000" "0000" -hpx::opencl::rect_props props02(0,1,0, 0,1,2, 4,1,1, 4,0, 4,16); - -#define REFDATA03 "0000" "0000" "0000" "0000" \ - "0002" "0004" "0006" "0008" \ - "0000" "0000" "0000" "0000" \ - "0000" "0000" "0000" "0000" -hpx::opencl::rect_props props03(1,0,0, 3,0,1, 1,4,1, 2,0, 4,16); - -#define REFDATA04 "0000" "0000" "0000" "0000" \ - "0000" "0000" "0000" "0000" \ - "0000" "0000" "0012" "0034" \ - "0000" "0000" "0056" "0078" -hpx::opencl::rect_props props04(0,0,0, 2,2,2, 2,2,2, 2,4, 4,16); - -#define REFDATA05 "0000" "0000" "0000" "0000" \ - "0000" "0000" "0000" "0000" \ - "0000" "00DE" "0000" "0000" \ - "0000" "00ji" "0000" "0000" -hpx::opencl::rect_props props05(0,1,0, 1,1,2, 1,1,2, 1,2, 2, 8); - +#define INITDATA \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" + +#define REFDATA01 \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0200" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" +hpx::opencl::rect_props props01(1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 4, 16); + +#define REFDATA02 \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "5678" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" +hpx::opencl::rect_props props02(0, 1, 0, 0, 1, 2, 4, 1, 1, 4, 0, 4, 16); + +#define REFDATA03 \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0002" \ + "0004" \ + "0006" \ + "0008" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" +hpx::opencl::rect_props props03(1, 0, 0, 3, 0, 1, 1, 4, 1, 2, 0, 4, 16); + +#define REFDATA04 \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0012" \ + "0034" \ + "0000" \ + "0000" \ + "0056" \ + "0078" +hpx::opencl::rect_props props04(0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 4, 4, 16); + +#define REFDATA05 \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "0000" \ + "00DE" \ + "0000" \ + "0000" \ + "0000" \ + "00ji" \ + "0000" \ + "0000" +hpx::opencl::rect_props props05(0, 1, 0, 1, 1, 2, 1, 1, 2, 1, 2, 2, 8); #define DATASIZE (sizeof(INITDATA)) - CREATE_BUFFER(initdata, INITDATA); CREATE_BUFFER(refdata01, REFDATA01); CREATE_BUFFER(refdata02, REFDATA02); @@ -62,42 +135,35 @@ CREATE_BUFFER(moddata, "12345678"); static const uint16_t intarr[] = {0x4746, 0x4544, 0x6768, 0x696a}; static const int16buffer_type moddata2(intarr, 4, - int16buffer_type::init_mode::reference); - - -#define test_write(props, modifydata, ref) { \ - buffer.enqueue_write(0, initdata).get(); \ - auto data_write_future = buffer.enqueue_write_rect(props, modifydata); \ - auto data_read_future = buffer.enqueue_read( 0, DATASIZE, \ - data_write_future ); \ - COMPARE_RESULT(data_read_future.get(), ref); \ + int16buffer_type::init_mode::reference); + +#define test_write(props, modifydata, ref) \ + { \ + buffer.enqueue_write(0, initdata).get(); \ + auto data_write_future = buffer.enqueue_write_rect(props, modifydata); \ + auto data_read_future = \ + buffer.enqueue_read(0, DATASIZE, data_write_future); \ + COMPARE_RESULT(data_read_future.get(), ref); \ + } + +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device remote_device) { + hpx::opencl::buffer buffer = + remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + hpx::opencl::buffer buffer2 = + remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + hpx::opencl::buffer remote_buffer = + local_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + // test if buffer initialization worked + size_t buffer_size = buffer.size().get(); + HPX_TEST_EQ(buffer_size, DATASIZE); + + // test if buffer can be written to + test_write(props01, moddata, refdata01); + test_write(props02, moddata, refdata02); + test_write(props03, moddata, refdata03); + test_write(props04, moddata, refdata04); + test_write(props05, moddata2, refdata05); } - - -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device remote_device ) -{ - - hpx::opencl::buffer buffer = - remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - hpx::opencl::buffer buffer2 = - remote_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - hpx::opencl::buffer remote_buffer = - local_device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - // test if buffer initialization worked - size_t buffer_size = buffer.size().get(); - HPX_TEST_EQ(buffer_size, DATASIZE); - - // test if buffer can be written to - test_write(props01, moddata, refdata01); - test_write(props02, moddata, refdata02); - test_write(props03, moddata, refdata03); - test_write(props04, moddata, refdata04); - test_write(props05, moddata2, refdata05); - - -} - - diff --git a/tests/unit/opencl/cl_tests.hpp b/tests/unit/opencl/cl_tests.hpp index 725586e8..8533b080 100644 --- a/tests/unit/opencl/cl_tests.hpp +++ b/tests/unit/opencl/cl_tests.hpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include #include #include @@ -13,9 +12,9 @@ #include -using boost::program_options::variables_map; using boost::program_options::options_description; using boost::program_options::value; +using boost::program_options::variables_map; // the main test function static void cl_test(hpx::opencl::device, hpx::opencl::device); @@ -25,145 +24,128 @@ static void cl_test(hpx::opencl::device, hpx::opencl::device); { \ std::shared_ptr> out1 = \ buffer.enqueue_read(0, DATASIZE).get().get_data().get();\ - HPX_TEST_EQ(std::string((const char*)(value)), std::string(out1->data())); \ + HPX_TEST_EQ(std::string((const char*)(value)), std::string(out1->data())); \ } */ -#define CREATE_BUFFER(name, data) \ - static const buffer_type name(data, sizeof(data), \ - buffer_type::init_mode::reference) - -#define COMPARE_RESULT_INT( result_data, correct_result ) \ -{ \ - auto lhs = result_data; \ - auto rhs = correct_result; \ - HPX_TEST_EQ(lhs.size(), rhs.size()); \ - for(std::size_t i = 0; i < lhs.size(); i++){ \ - std::cout << std::hex << lhs[i] << "-" << rhs[i] << std::endl; \ - HPX_TEST_EQ(lhs[i], rhs[i]); \ - } \ -} +#define CREATE_BUFFER(name, data) \ + static const buffer_type name(data, sizeof(data), \ + buffer_type::init_mode::reference) + +#define COMPARE_RESULT_INT(result_data, correct_result) \ + { \ + auto lhs = result_data; \ + auto rhs = correct_result; \ + HPX_TEST_EQ(lhs.size(), rhs.size()); \ + for (std::size_t i = 0; i < lhs.size(); i++) { \ + std::cout << std::hex << lhs[i] << "-" << rhs[i] << std::endl; \ + HPX_TEST_EQ(lhs[i], rhs[i]); \ + } \ + } typedef hpx::serialization::serialize_buffer buffer_type; typedef hpx::serialization::serialize_buffer intbuffer_type; typedef hpx::serialization::serialize_buffer int16buffer_type; -std::string to_string(buffer_type buf){ - std::size_t length = 0; - while(length < buf.size()) - { - if(buf[length] == '\0') break; - length++; - } - return std::string(buf.data(), buf.data() + length); +std::string to_string(buffer_type buf) { + std::size_t length = 0; + while (length < buf.size()) { + if (buf[length] == '\0') break; + length++; + } + return std::string(buf.data(), buf.data() + length); } -#define COMPARE_RESULT( result_data, correct_result ) \ -{ \ - auto lhs = result_data; \ - auto rhs = correct_result; \ - HPX_TEST_EQ(lhs.size(), rhs.size()); \ - std::string correct_string = to_string(rhs); \ - std::string result_string = to_string(lhs); \ - HPX_TEST_EQ(correct_string, result_string); \ -} - - -static void print_testdevice_info(hpx::opencl::device & cldevice, +#define COMPARE_RESULT(result_data, correct_result) \ + { \ + auto lhs = result_data; \ + auto rhs = correct_result; \ + HPX_TEST_EQ(lhs.size(), rhs.size()); \ + std::string correct_string = to_string(rhs); \ + std::string result_string = to_string(lhs); \ + HPX_TEST_EQ(correct_string, result_string); \ + } + +static void print_testdevice_info(hpx::opencl::device& cldevice, std::size_t device_id, - std::size_t num_devices){ - - // Test whether get_device_info works - std::string version = cldevice.get_device_info().get(); - - // Test whether version is a valid OpenCL version string - std::string versionstring = std::string("OpenCL "); - HPX_TEST(0 == version.compare(0, versionstring.length(), versionstring)); - - // Write Info Code - hpx::cout << "Device ID: " << device_id << " / " << num_devices - << hpx::endl; - hpx::cout << "Device GID: " << cldevice.get_id() << hpx::endl; - hpx::cout << "Version: " << version << hpx::endl; - hpx::cout << "Name: " << cldevice.get_device_info().get() - << hpx::endl; - hpx::cout << "Vendor: " << cldevice.get_device_info().get() - << hpx::endl; - hpx::cout << "Profile: " << cldevice.get_device_info().get() - << hpx::endl; - - // Test for valid device client - HPX_TEST(cldevice.get_id()); - - + std::size_t num_devices) { + // Test whether get_device_info works + std::string version = cldevice.get_device_info().get(); + + // Test whether version is a valid OpenCL version string + std::string versionstring = std::string("OpenCL "); + HPX_TEST(0 == version.compare(0, versionstring.length(), versionstring)); + + // Write Info Code + hpx::cout << "Device ID: " << device_id << " / " << num_devices << hpx::endl; + hpx::cout << "Device GID: " << cldevice.get_id() << hpx::endl; + hpx::cout << "Version: " << version << hpx::endl; + hpx::cout << "Name: " + << cldevice.get_device_info().get() << hpx::endl; + hpx::cout << "Vendor: " + << cldevice.get_device_info().get() << hpx::endl; + hpx::cout << "Profile: " + << cldevice.get_device_info().get() << hpx::endl; + + // Test for valid device client + HPX_TEST(cldevice.get_id()); } -static std::vector init(variables_map & vm) -{ - - std::size_t device_id = 0; - - if (vm.count("deviceid")) - device_id = vm["deviceid"].as(); - - // Try to get remote devices - std::vector remote_devices - = hpx::opencl::create_remote_devices( CL_DEVICE_TYPE_ALL, - "OpenCL 1.1" ).get(); - std::vector local_devices - = hpx::opencl::create_local_devices( CL_DEVICE_TYPE_ALL, - "OpenCL 1.1" ).get(); - // If no remote devices present, get local device - if(remote_devices.empty()){ - hpx::cout << "WARNING: No remote devices found." << hpx::endl; - remote_devices = local_devices; - } - HPX_ASSERT(!remote_devices.empty()); - HPX_ASSERT(!local_devices.empty()); - HPX_TEST(local_devices.size() > device_id); - HPX_TEST(remote_devices.size() > device_id); - - // Choose device - hpx::opencl::device local_device = local_devices[device_id]; - hpx::opencl::device remote_device = remote_devices[device_id]; - - // Print info - hpx::cout << "Local device:" << hpx::endl; - print_testdevice_info(local_device, device_id, local_devices.size()); - hpx::cout << "Remote device:" << hpx::endl; - print_testdevice_info(remote_device, device_id, remote_devices.size()); - - // return the devices - std::vector devices; - devices.push_back(local_device); - devices.push_back(remote_device); - return devices; - +static std::vector init(variables_map& vm) { + std::size_t device_id = 0; + + if (vm.count("deviceid")) device_id = vm["deviceid"].as(); + + // Try to get remote devices + std::vector remote_devices = + hpx::opencl::create_remote_devices(CL_DEVICE_TYPE_ALL, "OpenCL 1.1") + .get(); + std::vector local_devices = + hpx::opencl::create_local_devices(CL_DEVICE_TYPE_ALL, "OpenCL 1.1").get(); + // If no remote devices present, get local device + if (remote_devices.empty()) { + hpx::cout << "WARNING: No remote devices found." << hpx::endl; + remote_devices = local_devices; + } + HPX_ASSERT(!remote_devices.empty()); + HPX_ASSERT(!local_devices.empty()); + HPX_TEST(local_devices.size() > device_id); + HPX_TEST(remote_devices.size() > device_id); + + // Choose device + hpx::opencl::device local_device = local_devices[device_id]; + hpx::opencl::device remote_device = remote_devices[device_id]; + + // Print info + hpx::cout << "Local device:" << hpx::endl; + print_testdevice_info(local_device, device_id, local_devices.size()); + hpx::cout << "Remote device:" << hpx::endl; + print_testdevice_info(remote_device, device_id, remote_devices.size()); + + // return the devices + std::vector devices; + devices.push_back(local_device); + devices.push_back(remote_device); + return devices; } -int hpx_main(variables_map & vm) -{ - { - auto devices = init(vm); - hpx::cout << hpx::endl; - cl_test(devices[0], devices[1]); - } +int hpx_main(variables_map& vm) { + { + auto devices = init(vm); + hpx::cout << hpx::endl; + cl_test(devices[0], devices[1]); + } - hpx::finalize(); - return hpx::util::report_errors(); + hpx::finalize(); + return hpx::util::report_errors(); } - - /////////////////////////////////////////////////////////////////////////////// -int main(int argc, char* argv[]) -{ - // Configure application-specific options - options_description cmdline("Usage: " HPX_APPLICATION_STRING " [options]"); - cmdline.add_options() - ( "deviceid" - , value()->default_value(0) - , "the ID of the device we will run our tests on") ; - - return hpx::init(cmdline, argc, argv); +int main(int argc, char* argv[]) { + // Configure application-specific options + options_description cmdline("Usage: " HPX_APPLICATION_STRING " [options]"); + cmdline.add_options()("deviceid", value()->default_value(0), + "the ID of the device we will run our tests on"); + + return hpx::init(cmdline, argc, argv); } diff --git a/tests/unit/opencl/data_map.cpp b/tests/unit/opencl/data_map.cpp index ce8102d1..a8e81b38 100644 --- a/tests/unit/opencl/data_map.cpp +++ b/tests/unit/opencl/data_map.cpp @@ -7,62 +7,58 @@ #include "../../../opencl/server/util/data_map.hpp" -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device cldevice ) -{ - typedef hpx::serialization::serialize_buffer buffer_type; +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device cldevice) { + typedef hpx::serialization::serialize_buffer buffer_type; - // Create a data_map - hpx::opencl::server::util::data_map map; + // Create a data_map + hpx::opencl::server::util::data_map map; - // Create a promise - hpx::promise promise; + // Create a promise + hpx::promise promise; - // Create a future - auto future = promise.get_future(); + // Create a future + auto future = promise.get_future(); - // Create a cl_event - cl_event event = (cl_event)5; + // Create a cl_event + cl_event event = (cl_event)5; - // Make sure the promise did not get triggered yet - hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); - HPX_TEST(!future.is_ready()); + // Make sure the promise did not get triggered yet + hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); + HPX_TEST(!future.is_ready()); - { - // Create some data - buffer_type buffer("Test", sizeof("Test"), buffer_type::init_mode::copy); + { + // Create some data + buffer_type buffer("Test", sizeof("Test"), buffer_type::init_mode::copy); - // Make sure the data isn't registered yet - HPX_TEST(!map.has_data(event)); + // Make sure the data isn't registered yet + HPX_TEST(!map.has_data(event)); - // Register buffer in map - map.add(event, buffer); + // Register buffer in map + map.add(event, buffer); - // Make sure the data is now registered - HPX_TEST(map.has_data(event)); + // Make sure the data is now registered + HPX_TEST(map.has_data(event)); - // Deallocate the buffer(out of scope). - // Should get kept alive by the map. - } + // Deallocate the buffer(out of scope). + // Should get kept alive by the map. + } - // Make sure the promise did not get triggered yet - hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); - HPX_TEST(!future.is_ready()); + // Make sure the promise did not get triggered yet + hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); + HPX_TEST(!future.is_ready()); - // Trigger the promise - map.get(event).send_data_to_client(promise.get_id()); + // Trigger the promise + map.get(event).send_data_to_client(promise.get_id()); - // Make sure the promise got triggered - hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); - HPX_TEST(future.is_ready()); + // Make sure the promise got triggered + hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); + HPX_TEST(future.is_ready()); - // Make sure the data is correct - auto data = future.get(); - HPX_TEST( strcmp(data.data(), "Test") == 0 ); - - // Take the data out of the map - map.remove(event); + // Make sure the data is correct + auto data = future.get(); + HPX_TEST(strcmp(data.data(), "Test") == 0); + // Take the data out of the map + map.remove(event); } - - diff --git a/tests/unit/opencl/dynamic_overloads.cpp b/tests/unit/opencl/dynamic_overloads.cpp index 2c3447fb..aec34f24 100644 --- a/tests/unit/opencl/dynamic_overloads.cpp +++ b/tests/unit/opencl/dynamic_overloads.cpp @@ -10,64 +10,59 @@ #include "register_event.hpp" -class test_client{ - public: - template - hpx::future func(hpx::naming::gid_type device_id,int a, int b, Deps &&... dependencies ) - { - // combine dependency futures in one std::vector - using hpx::opencl::util::enqueue_overloads::resolver; - auto deps = resolver(device_id,std::forward(dependencies)...); - - return func_impl( std::move(a), std::move(b), std::move(deps) ); - } - - hpx::future func_impl( int && a, int && b, - hpx::opencl::util::resolved_events && ids ); +class test_client { + public: + template + hpx::future func(hpx::naming::gid_type device_id, int a, int b, + Deps &&...dependencies) { + // combine dependency futures in one std::vector + using hpx::opencl::util::enqueue_overloads::resolver; + auto deps = resolver(device_id, std::forward(dependencies)...); + + return func_impl(std::move(a), std::move(b), std::move(deps)); + } + + hpx::future func_impl(int &&a, int &&b, + hpx::opencl::util::resolved_events &&ids); }; - -hpx::future -test_client::func_impl( int && a, int && b, - hpx::opencl::util::resolved_events && ids){ - return hpx::make_ready_future((int)ids.event_ids.size() + 1000 * a + 100 * b); +hpx::future test_client::func_impl( + int &&a, int &&b, hpx::opencl::util::resolved_events &&ids) { + return hpx::make_ready_future((int)ids.event_ids.size() + 1000 * a + + 100 * b); }; -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device cldevice ){ - - hpx::opencl::lcos::event event(local_device.get_id()); - register_event(local_device, event.get_event_id()); - - hpx::shared_future sfut = event.get_future(); - std::vector> vsfut1; - vsfut1.push_back( sfut ); - std::vector> vsfut2; - vsfut2.push_back( sfut ); - vsfut2.push_back( sfut ); - - test_client t; - - hpx::naming::gid_type device_gid = cldevice.get_id().get_gid(); - - std::cout << cldevice.get_id().get_gid() << std::endl; - - HPX_TEST_EQ( 5300, t.func(device_gid,5, 3 ).get() ); - HPX_TEST_EQ( 5301, t.func(device_gid,5, 3, sfut ).get() ); - HPX_TEST_EQ( 5302, t.func(device_gid,5, 3, sfut, sfut ).get() ); - HPX_TEST_EQ( 5301, t.func(device_gid,5, 3, vsfut1 ).get() ); - HPX_TEST_EQ( 5302, t.func(device_gid,5, 3, vsfut2 ).get() ); - HPX_TEST_EQ( 5302, t.func(device_gid,5, 3, vsfut1, vsfut1 ).get() ); - HPX_TEST_EQ( 5303, t.func(device_gid,5, 3, vsfut1, vsfut2 ).get() ); - HPX_TEST_EQ( 5303, t.func(device_gid,5, 3, vsfut2, vsfut1 ).get() ); - HPX_TEST_EQ( 5304, t.func(device_gid,5, 3, vsfut2, vsfut2 ).get() ); - HPX_TEST_EQ( 5304, t.func(device_gid,5, 3, sfut, sfut, vsfut2 ).get() ); - HPX_TEST_EQ( 5304, t.func(device_gid,5, 3, sfut, vsfut2, sfut ).get() ); - HPX_TEST_EQ( 5304, t.func(device_gid,5, 3, vsfut2, sfut, sfut ).get() ); - HPX_TEST_EQ( 5305, t.func(device_gid,5, 3, sfut, vsfut2, vsfut2 ).get() ); - HPX_TEST_EQ( 5305, t.func(device_gid,5, 3, vsfut2, sfut, vsfut2 ).get() ); - HPX_TEST_EQ( 5305, t.func(device_gid,5, 3, vsfut2, vsfut2, sfut ).get() ); - - +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device cldevice) { + hpx::opencl::lcos::event event(local_device.get_id()); + register_event(local_device, event.get_event_id()); + + hpx::shared_future sfut = event.get_future(); + std::vector> vsfut1; + vsfut1.push_back(sfut); + std::vector> vsfut2; + vsfut2.push_back(sfut); + vsfut2.push_back(sfut); + + test_client t; + + hpx::naming::gid_type device_gid = cldevice.get_id().get_gid(); + + std::cout << cldevice.get_id().get_gid() << std::endl; + + HPX_TEST_EQ(5300, t.func(device_gid, 5, 3).get()); + HPX_TEST_EQ(5301, t.func(device_gid, 5, 3, sfut).get()); + HPX_TEST_EQ(5302, t.func(device_gid, 5, 3, sfut, sfut).get()); + HPX_TEST_EQ(5301, t.func(device_gid, 5, 3, vsfut1).get()); + HPX_TEST_EQ(5302, t.func(device_gid, 5, 3, vsfut2).get()); + HPX_TEST_EQ(5302, t.func(device_gid, 5, 3, vsfut1, vsfut1).get()); + HPX_TEST_EQ(5303, t.func(device_gid, 5, 3, vsfut1, vsfut2).get()); + HPX_TEST_EQ(5303, t.func(device_gid, 5, 3, vsfut2, vsfut1).get()); + HPX_TEST_EQ(5304, t.func(device_gid, 5, 3, vsfut2, vsfut2).get()); + HPX_TEST_EQ(5304, t.func(device_gid, 5, 3, sfut, sfut, vsfut2).get()); + HPX_TEST_EQ(5304, t.func(device_gid, 5, 3, sfut, vsfut2, sfut).get()); + HPX_TEST_EQ(5304, t.func(device_gid, 5, 3, vsfut2, sfut, sfut).get()); + HPX_TEST_EQ(5305, t.func(device_gid, 5, 3, sfut, vsfut2, vsfut2).get()); + HPX_TEST_EQ(5305, t.func(device_gid, 5, 3, vsfut2, sfut, vsfut2).get()); + HPX_TEST_EQ(5305, t.func(device_gid, 5, 3, vsfut2, vsfut2, sfut).get()); }; - diff --git a/tests/unit/opencl/event.cpp b/tests/unit/opencl/event.cpp index 61cc5d62..8c461bb0 100644 --- a/tests/unit/opencl/event.cpp +++ b/tests/unit/opencl/event.cpp @@ -6,28 +6,28 @@ #include "cl_tests.hpp" #include "register_event.hpp" -void cl_test( hpx::opencl::device cldevice, hpx::opencl::device ) -{ - typedef hpx::opencl::lcos::event event_type; - typedef typename event_type::shared_state_type shared_state_type; +void cl_test(hpx::opencl::device cldevice, hpx::opencl::device) { + typedef hpx::opencl::lcos::event event_type; + typedef typename event_type::shared_state_type shared_state_type; - event_type event(cldevice.get_id()); + event_type event(cldevice.get_id()); - auto future = event.get_future(); - auto future_data = hpx::traits::detail::get_shared_state(future); - auto shared_state = boost::static_pointer_cast(future_data); + auto future = event.get_future(); + auto future_data = hpx::traits::detail::get_shared_state(future); + auto shared_state = + boost::static_pointer_cast(future_data); - auto gid2 = shared_state->get_event_id(); - auto gid = event.get_event_id(); + auto gid2 = shared_state->get_event_id(); + auto gid = event.get_event_id(); - register_event(cldevice, gid); + register_event(cldevice, gid); - HPX_TEST_EQ(gid, gid2); + HPX_TEST_EQ(gid, gid2); - future.wait(); + future.wait(); - hpx::this_thread::sleep_for(std::chrono::milliseconds(10)); + hpx::this_thread::sleep_for(std::chrono::milliseconds(10)); - hpx::cout << gid << hpx::endl; - hpx::cout << gid2 << hpx::endl; + hpx::cout << gid << hpx::endl; + hpx::cout << gid2 << hpx::endl; } diff --git a/tests/unit/opencl/event_map.cpp b/tests/unit/opencl/event_map.cpp index 1e622735..6129bd01 100644 --- a/tests/unit/opencl/event_map.cpp +++ b/tests/unit/opencl/event_map.cpp @@ -11,102 +11,90 @@ using hpx::naming::id_type; - static hpx::opencl::server::util::event_map *map; static std::atomic id_counter(1); -static id_type next_id(){ - return id_type(0, id_counter++, id_type::management_type::unmanaged); +static id_type next_id() { + return id_type(0, id_counter++, id_type::management_type::unmanaged); } static std::atomic num_deleted; -static void deletion_callback(cl_event e){ - num_deleted++; - hpx::cout << "deletion_callback: " << (std::size_t) e << hpx::endl; +static void deletion_callback(cl_event e) { + num_deleted++; + hpx::cout << "deletion_callback: " << (std::size_t)e << hpx::endl; } -static std::size_t count_deleted(){ - return num_deleted.exchange(0); -} +static std::size_t count_deleted() { return num_deleted.exchange(0); } -hpx::future get_async(id_type id){ - return hpx::async( - [id](){ - return map->get(id); - }); +hpx::future get_async(id_type id) { + return hpx::async([id]() { return map->get(id); }); } -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device cldevice ) -{ - - // Usually: do not use new, use make_shared<>. But in this case, - // we also want to test the shutdown routine and therefore need - // explicit deletion - map = new hpx::opencl::server::util::event_map(); - - // Register the desctruction callback - map->register_deletion_callback(&deletion_callback); - - // Test default functionality - { - id_type id = next_id(); - cl_event event = (cl_event)id.get_lsb(); - - map->add(id, event); - - HPX_TEST_EQ(map->get(id), event); - - HPX_TEST_EQ(count_deleted(), 0); - map->remove(id.get_gid()); - HPX_TEST_EQ(count_deleted(), 1); - } - - // Test reverse get functionality - { - id_type id = next_id(); - cl_event event = (cl_event)id.get_lsb(); - id_type id2 = next_id(); - cl_event event2 = (cl_event)id2.get_lsb(); - - // Run asynchronous thread - hpx::future thread1_1 = get_async(id); - hpx::future thread1_2 = get_async(id); - hpx::future thread2_1 = get_async(id2); - - hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); - HPX_TEST(!thread1_1.is_ready()); - HPX_TEST(!thread1_2.is_ready()); - HPX_TEST(!thread2_1.is_ready()); - - map->add(id2, event2); - - hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); - HPX_TEST(!thread1_1.is_ready()); - HPX_TEST(!thread1_2.is_ready()); - HPX_TEST(thread2_1.is_ready()); - - map->add(id, event); - - hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); - HPX_TEST(thread1_1.is_ready()); - HPX_TEST(thread1_2.is_ready()); - HPX_TEST(thread2_1.is_ready()); - - HPX_TEST_EQ(thread1_1.get(), event); - HPX_TEST_EQ(thread1_2.get(), event); - HPX_TEST_EQ(thread2_1.get(), event2); - - HPX_TEST_EQ(count_deleted(), 0); - map->remove(id.get_gid()); - map->remove(id2.get_gid()); - HPX_TEST_EQ(count_deleted(), 2); - } - - - // Test deletion. - delete map; - +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device cldevice) { + // Usually: do not use new, use make_shared<>. But in this case, + // we also want to test the shutdown routine and therefore need + // explicit deletion + map = new hpx::opencl::server::util::event_map(); + + // Register the desctruction callback + map->register_deletion_callback(&deletion_callback); + + // Test default functionality + { + id_type id = next_id(); + cl_event event = (cl_event)id.get_lsb(); + + map->add(id, event); + + HPX_TEST_EQ(map->get(id), event); + + HPX_TEST_EQ(count_deleted(), 0); + map->remove(id.get_gid()); + HPX_TEST_EQ(count_deleted(), 1); + } + + // Test reverse get functionality + { + id_type id = next_id(); + cl_event event = (cl_event)id.get_lsb(); + id_type id2 = next_id(); + cl_event event2 = (cl_event)id2.get_lsb(); + + // Run asynchronous thread + hpx::future thread1_1 = get_async(id); + hpx::future thread1_2 = get_async(id); + hpx::future thread2_1 = get_async(id2); + + hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); + HPX_TEST(!thread1_1.is_ready()); + HPX_TEST(!thread1_2.is_ready()); + HPX_TEST(!thread2_1.is_ready()); + + map->add(id2, event2); + + hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); + HPX_TEST(!thread1_1.is_ready()); + HPX_TEST(!thread1_2.is_ready()); + HPX_TEST(thread2_1.is_ready()); + + map->add(id, event); + + hpx::this_thread::sleep_for(std::chrono::milliseconds(100)); + HPX_TEST(thread1_1.is_ready()); + HPX_TEST(thread1_2.is_ready()); + HPX_TEST(thread2_1.is_ready()); + + HPX_TEST_EQ(thread1_1.get(), event); + HPX_TEST_EQ(thread1_2.get(), event); + HPX_TEST_EQ(thread2_1.get(), event2); + + HPX_TEST_EQ(count_deleted(), 0); + map->remove(id.get_gid()); + map->remove(id2.get_gid()); + HPX_TEST_EQ(count_deleted(), 2); + } + + // Test deletion. + delete map; } - - diff --git a/tests/unit/opencl/info.cpp b/tests/unit/opencl/info.cpp index 150fa5d2..9248d86a 100644 --- a/tests/unit/opencl/info.cpp +++ b/tests/unit/opencl/info.cpp @@ -3,39 +3,30 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "cl_tests.hpp" +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device cldevice) { + //////////////////////////////////////////////////////////////////////////// + // Test if cast to string and cast to vector returns + // identical results + // -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device cldevice ) -{ - - //////////////////////////////////////////////////////////////////////////// - // Test if cast to string and cast to vector returns - // identical results - // - - std::string device_version = - cldevice.get_device_info().get(); - - HPX_TEST(device_version.find("OpenCL ") == 0); - + std::string device_version = + cldevice.get_device_info().get(); - //////////////////////////////////////////////////////////////////////////// - // Test if CL_DEVICE_MAX_WORK_ITEM_SIZES-Array returns as many items as - // specified by OpenCL - // + HPX_TEST(device_version.find("OpenCL ") == 0); - cl_uint work_dims = - cldevice.get_device_info().get(); + //////////////////////////////////////////////////////////////////////////// + // Test if CL_DEVICE_MAX_WORK_ITEM_SIZES-Array returns as many items as + // specified by OpenCL + // - std::vector work_items = - cldevice.get_device_info().get(); - - HPX_TEST_EQ(work_items.size(), work_dims); + cl_uint work_dims = + cldevice.get_device_info().get(); + std::vector work_items = + cldevice.get_device_info().get(); + HPX_TEST_EQ(work_items.size(), work_dims); } - - diff --git a/tests/unit/opencl/initialization.cpp b/tests/unit/opencl/initialization.cpp index 2a99ac38..c7829215 100644 --- a/tests/unit/opencl/initialization.cpp +++ b/tests/unit/opencl/initialization.cpp @@ -3,13 +3,9 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "cl_tests.hpp" static void cl_test(hpx::opencl::device local_device, - hpx::opencl::device remote_device) -{ - // do nothing. this file only tests the initialization + hpx::opencl::device remote_device) { + // do nothing. this file only tests the initialization } - - diff --git a/tests/unit/opencl/kernel.cpp b/tests/unit/opencl/kernel.cpp index 678267f0..7c7b780a 100644 --- a/tests/unit/opencl/kernel.cpp +++ b/tests/unit/opencl/kernel.cpp @@ -3,216 +3,207 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "cl_tests.hpp" - /* * This test is meant to verify the kernel creation and execution functionality. */ CREATE_BUFFER(invalid_program_src, -" \n" -" __kernel void hello_world(__global char * in, __global char * out) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = (char)(in[unknown_variable] + tid); \n" -" } \n" -" \n"); + " " + " \n" + " __kernel void hello_world(__global char * in, __global char " + "* out) \n" + " { " + " \n" + " size_t tid = get_global_id(0); " + " \n" + " out[tid] = (char)(in[unknown_variable] + tid); " + " \n" + " } " + " \n" + " " + " \n"); CREATE_BUFFER(program_src, -" \n" -" __kernel void hello_world(__global char * in, __global char * out) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = (char)(in[tid] + tid); \n" -" } \n" -" \n"); - - + " " + " \n" + " __kernel void hello_world(__global char * in, __global char " + "* out) \n" + " { " + " \n" + " size_t tid = get_global_id(0); " + " \n" + " out[tid] = (char)(in[tid] + tid); " + " \n" + " } " + " \n" + " " + " \n"); #define DATASIZE (sizeof("Hello, World!")) -const char initdata_arr[] = { ('H' - static_cast( 0)), - ('e' - static_cast( 1)), - ('l' - static_cast( 2)), - ('l' - static_cast( 3)), - ('o' - static_cast( 4)), - (',' - static_cast( 5)), - (' ' - static_cast( 6)), - ('W' - static_cast( 7)), - ('o' - static_cast( 8)), - ('r' - static_cast( 9)), - ('l' - static_cast(10)), - ('d' - static_cast(11)), - ('!' - static_cast(12)), - ('\0' - static_cast(13)) }; -const char refdata2_arr[] = { ('H' + static_cast( 0)), - ('e' + static_cast( 1)), - ('l' + static_cast( 2)), - ('l' + static_cast( 3)), - ('o' + static_cast( 4)), - (',' + static_cast( 5)), - (' ' + static_cast( 6)), - ('W' + static_cast( 7)), - ('o' + static_cast( 8)), - ('r' + static_cast( 9)), - ('l' + static_cast(10)), - ('d' + static_cast(11)), - ('!' + static_cast(12)), - ('\0' + static_cast(13)) }; +const char initdata_arr[] = { + ('H' - static_cast(0)), ('e' - static_cast(1)), + ('l' - static_cast(2)), ('l' - static_cast(3)), + ('o' - static_cast(4)), (',' - static_cast(5)), + (' ' - static_cast(6)), ('W' - static_cast(7)), + ('o' - static_cast(8)), ('r' - static_cast(9)), + ('l' - static_cast(10)), ('d' - static_cast(11)), + ('!' - static_cast(12)), ('\0' - static_cast(13))}; +const char refdata2_arr[] = { + ('H' + static_cast(0)), ('e' + static_cast(1)), + ('l' + static_cast(2)), ('l' + static_cast(3)), + ('o' + static_cast(4)), (',' + static_cast(5)), + (' ' + static_cast(6)), ('W' + static_cast(7)), + ('o' + static_cast(8)), ('r' + static_cast(9)), + ('l' + static_cast(10)), ('d' + static_cast(11)), + ('!' + static_cast(12)), ('\0' + static_cast(13))}; CREATE_BUFFER(initdata, initdata_arr); CREATE_BUFFER(refdata1, "Hello, World!"); CREATE_BUFFER(refdata2, refdata2_arr); -static void create_and_run_kernel( hpx::opencl::device cldevice, - hpx::opencl::program program ){ - - // test if kernel can be created - hpx::opencl::kernel kernel = program.create_kernel("hello_world"); - - // test if creation of invalid kernels throws - { - bool caught_exception = false; - try{ - hpx::opencl::kernel kernel = program.create_kernel("blub"); - kernel.get_id(); - } catch (hpx::exception e){ - caught_exception = true; - } - HPX_ASSERT(caught_exception); - } - - // create source and destination buffers - hpx::opencl::buffer buffer_src = - cldevice.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - hpx::opencl::buffer buffer_dst = - cldevice.create_buffer(CL_MEM_READ_WRITE, DATASIZE); - - // test if buffer initialization worked - { - size_t buffer_src_size = buffer_src.size().get(); - HPX_TEST_EQ(buffer_src_size, DATASIZE); - size_t buffer_dst_size = buffer_dst.size().get(); - HPX_TEST_EQ(buffer_dst_size, DATASIZE); - } - - - // set kernel arguments - { - auto future1 = kernel.set_arg_async(0, buffer_src); - kernel.set_arg(1, buffer_dst); - future1.get(); - } - - // set work dimensions - hpx::opencl::work_size<1> size; - size[0].offset = 0; - size[0].size = DATASIZE; - - // test if kernel can get executed (blocking) - { - // Initialize src buffer - buffer_src.enqueue_write(0, initdata).get(); - - // Execute - kernel.enqueue(size).get(); - - // Check for correct result - auto result_future = buffer_dst.enqueue_read(0, DATASIZE); - COMPARE_RESULT(result_future.get(), refdata1); +static void create_and_run_kernel(hpx::opencl::device cldevice, + hpx::opencl::program program) { + // test if kernel can be created + hpx::opencl::kernel kernel = program.create_kernel("hello_world"); + + // test if creation of invalid kernels throws + { + bool caught_exception = false; + try { + hpx::opencl::kernel kernel = program.create_kernel("blub"); + kernel.get_id(); + } catch (hpx::exception e) { + caught_exception = true; } - - // test if kernel can get executed (non-blocking) - { - // Send result of blocking execution to src buffer - auto fut1 = buffer_dst.enqueue_send(buffer_src, 0, 0, DATASIZE); - - // Execute - auto fut2 = kernel.enqueue(size, fut1.dst_future); - - // Read data - auto result_future = buffer_dst.enqueue_read(0, DATASIZE, fut2); - COMPARE_RESULT(result_future.get(), refdata2); - } - + HPX_ASSERT(caught_exception); + } + + // create source and destination buffers + hpx::opencl::buffer buffer_src = + cldevice.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + hpx::opencl::buffer buffer_dst = + cldevice.create_buffer(CL_MEM_READ_WRITE, DATASIZE); + + // test if buffer initialization worked + { + size_t buffer_src_size = buffer_src.size().get(); + HPX_TEST_EQ(buffer_src_size, DATASIZE); + size_t buffer_dst_size = buffer_dst.size().get(); + HPX_TEST_EQ(buffer_dst_size, DATASIZE); + } + + // set kernel arguments + { + auto future1 = kernel.set_arg_async(0, buffer_src); + kernel.set_arg(1, buffer_dst); + future1.get(); + } + + // set work dimensions + hpx::opencl::work_size<1> size; + size[0].offset = 0; + size[0].size = DATASIZE; + + // test if kernel can get executed (blocking) + { + // Initialize src buffer + buffer_src.enqueue_write(0, initdata).get(); + + // Execute + kernel.enqueue(size).get(); + + // Check for correct result + auto result_future = buffer_dst.enqueue_read(0, DATASIZE); + COMPARE_RESULT(result_future.get(), refdata1); + } + + // test if kernel can get executed (non-blocking) + { + // Send result of blocking execution to src buffer + auto fut1 = buffer_dst.enqueue_send(buffer_src, 0, 0, DATASIZE); + + // Execute + auto fut2 = kernel.enqueue(size, fut1.dst_future); + + // Read data + auto result_future = buffer_dst.enqueue_read(0, DATASIZE, fut2); + COMPARE_RESULT(result_future.get(), refdata2); + } } -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device cldevice ) -{ - - // standard hello-world test - { - // test if program can be created from source - hpx::opencl::program program = - cldevice.create_program_with_source(program_src); - - // test if program can be compiled - // IMPORTANT! use get(). wait() does not throw errors. - program.build_async().get(); - - // test if program can be used for computation - create_and_run_kernel(cldevice, program); +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device cldevice) { + // standard hello-world test + { + // test if program can be created from source + hpx::opencl::program program = + cldevice.create_program_with_source(program_src); + + // test if program can be compiled + // IMPORTANT! use get(). wait() does not throw errors. + program.build_async().get(); + + // test if program can be used for computation + create_and_run_kernel(cldevice, program); + } + + // same test with build arguments + { + // test if program can be created from source + hpx::opencl::program program = + cldevice.create_program_with_source(program_src); + + // test if program can be compiled + program.build_async("-cl-std=CL1.1").get(); + + // test if program can be used for computation + create_and_run_kernel(cldevice, program); + } + + // test with create_from_binary + { + // test if program can be created from source + hpx::opencl::program program1 = + cldevice.create_program_with_source(program_src); + + // test if program can be compiled + program1.build_async().get(); + + // retrieve binary of program1 + auto program_binary = program1.get_binary().get(); + + hpx::cout << "Binary:" << hpx::endl; + hpx::cout << to_string(program_binary) << hpx::endl << hpx::endl; + ; + + // test if program can be created from binary + hpx::opencl::program program2 = + cldevice.create_program_with_binary(program_binary); + + // test if program can be compiled + program2.build(); + + // test if program can be used for computation + create_and_run_kernel(cldevice, program2); + } + + // Test compiler error detection + { + // create program from source. this should not throw + hpx::opencl::program program = + cldevice.create_program_with_source(invalid_program_src); + + // Try to build. This should throw an error. + bool caught_exception = false; + try { + program.build_async().get(); + } catch (hpx::exception e) { + hpx::cout << "Build error:" << hpx::endl; + hpx::cout << e.what() << hpx::endl << hpx::endl; + caught_exception = true; } - - // same test with build arguments - { - // test if program can be created from source - hpx::opencl::program program = - cldevice.create_program_with_source(program_src); - - // test if program can be compiled - program.build_async("-cl-std=CL1.1").get(); - - // test if program can be used for computation - create_and_run_kernel(cldevice, program); - } - - // test with create_from_binary - { - // test if program can be created from source - hpx::opencl::program program1 = - cldevice.create_program_with_source(program_src); - - // test if program can be compiled - program1.build_async().get(); - - // retrieve binary of program1 - auto program_binary = program1.get_binary().get(); - - hpx::cout << "Binary:" << hpx::endl; - hpx::cout << to_string(program_binary) << hpx::endl << hpx::endl;; - - // test if program can be created from binary - hpx::opencl::program program2 = - cldevice.create_program_with_binary(program_binary); - - // test if program can be compiled - program2.build(); - - // test if program can be used for computation - create_and_run_kernel(cldevice, program2); - } - - // Test compiler error detection - { - // create program from source. this should not throw - hpx::opencl::program program = - cldevice.create_program_with_source(invalid_program_src); - - // Try to build. This should throw an error. - bool caught_exception = false; - try{ - program.build_async().get(); - } catch (hpx::exception e){ - hpx::cout << "Build error:" << hpx::endl; - hpx::cout << e.what() << hpx::endl << hpx::endl; - caught_exception = true; - } - HPX_TEST(caught_exception); - } - + HPX_TEST(caught_exception); + } } - - diff --git a/tests/unit/opencl/register_event.hpp b/tests/unit/opencl/register_event.hpp index 367400f6..17bd3e6f 100644 --- a/tests/unit/opencl/register_event.hpp +++ b/tests/unit/opencl/register_event.hpp @@ -3,7 +3,6 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include #include @@ -13,25 +12,18 @@ #include "../../../opencl/server/device.hpp" #include "../../../opencl/tools.hpp" - - static void register_event(hpx::opencl::device cldevice, - const hpx::naming::id_type & event_id) -{ - std::shared_ptr - parent_device = hpx::get_ptr - (cldevice.get_id()).get(); + const hpx::naming::id_type& event_id) { + std::shared_ptr parent_device = + hpx::get_ptr(cldevice.get_id()).get(); - // create a fake event - cl_int err; - cl_event event_cl = clCreateUserEvent ( - parent_device->get_context(), - &err); - cl_ensure(err, "clCreateUserEvent()"); + // create a fake event + cl_int err; + cl_event event_cl = clCreateUserEvent(parent_device->get_context(), &err); + cl_ensure(err, "clCreateUserEvent()"); - err = clSetUserEventStatus(event_cl, CL_COMPLETE); - cl_ensure(err, "clSetUserEventStatus()"); + err = clSetUserEventStatus(event_cl, CL_COMPLETE); + cl_ensure(err, "clSetUserEventStatus()"); - parent_device->register_event(event_id, event_cl); + parent_device->register_event(event_id, event_cl); } - diff --git a/tests/unit/opencl/serialize.cpp b/tests/unit/opencl/serialize.cpp index 93b48a12..63e879f5 100644 --- a/tests/unit/opencl/serialize.cpp +++ b/tests/unit/opencl/serialize.cpp @@ -3,171 +3,152 @@ // Distributed under the Boost Software License, Version 1.0. (See accompanying // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) - #include "cl_tests.hpp" - /* * This test is meant to verify the kernel creation and execution functionality. */ CREATE_BUFFER(invalid_program_src, -" \n" -" __kernel void hello_world(__global char * in, __global char * out) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = (char)(in[unknown_variable] + tid); \n" -" } \n" -" \n"); + " " + " \n" + " __kernel void hello_world(__global char * in, __global char " + "* out) \n" + " { " + " \n" + " size_t tid = get_global_id(0); " + " \n" + " out[tid] = (char)(in[unknown_variable] + tid); " + " \n" + " } " + " \n" + " " + " \n"); CREATE_BUFFER(program_src, -" \n" -" __kernel void hello_world(__global char * in, __global char * out) \n" -" { \n" -" size_t tid = get_global_id(0); \n" -" out[tid] = (char)(in[tid] + tid); \n" -" } \n" -" \n"); - - + " " + " \n" + " __kernel void hello_world(__global char * in, __global char " + "* out) \n" + " { " + " \n" + " size_t tid = get_global_id(0); " + " \n" + " out[tid] = (char)(in[tid] + tid); " + " \n" + " } " + " \n" + " " + " \n"); #define DATASIZE (sizeof("Hello, World!")) -const char initdata_arr[] = { ('H' - static_cast( 0)), - ('e' - static_cast( 1)), - ('l' - static_cast( 2)), - ('l' - static_cast( 3)), - ('o' - static_cast( 4)), - (',' - static_cast( 5)), - (' ' - static_cast( 6)), - ('W' - static_cast( 7)), - ('o' - static_cast( 8)), - ('r' - static_cast( 9)), - ('l' - static_cast(10)), - ('d' - static_cast(11)), - ('!' - static_cast(12)), - ('\0' - static_cast(13)) }; -const char refdata2_arr[] = { ('H' + static_cast( 0)), - ('e' + static_cast( 1)), - ('l' + static_cast( 2)), - ('l' + static_cast( 3)), - ('o' + static_cast( 4)), - (',' + static_cast( 5)), - (' ' + static_cast( 6)), - ('W' + static_cast( 7)), - ('o' + static_cast( 8)), - ('r' + static_cast( 9)), - ('l' + static_cast(10)), - ('d' + static_cast(11)), - ('!' + static_cast(12)), - ('\0' + static_cast(13)) }; +const char initdata_arr[] = { + ('H' - static_cast(0)), ('e' - static_cast(1)), + ('l' - static_cast(2)), ('l' - static_cast(3)), + ('o' - static_cast(4)), (',' - static_cast(5)), + (' ' - static_cast(6)), ('W' - static_cast(7)), + ('o' - static_cast(8)), ('r' - static_cast(9)), + ('l' - static_cast(10)), ('d' - static_cast(11)), + ('!' - static_cast(12)), ('\0' - static_cast(13))}; +const char refdata2_arr[] = { + ('H' + static_cast(0)), ('e' + static_cast(1)), + ('l' + static_cast(2)), ('l' + static_cast(3)), + ('o' + static_cast(4)), (',' + static_cast(5)), + (' ' + static_cast(6)), ('W' + static_cast(7)), + ('o' + static_cast(8)), ('r' + static_cast(9)), + ('l' + static_cast(10)), ('d' + static_cast(11)), + ('!' + static_cast(12)), ('\0' + static_cast(13))}; CREATE_BUFFER(initdata, initdata_arr); CREATE_BUFFER(refdata1, "Hello, World!"); CREATE_BUFFER(refdata2, refdata2_arr); - - -hpx::opencl::program -remotely_create_program ( hpx::opencl::device device ) -{ - return device.create_program_with_source(program_src); +hpx::opencl::program remotely_create_program(hpx::opencl::device device) { + return device.create_program_with_source(program_src); } -hpx::opencl::kernel -remotely_create_kernel ( hpx::opencl::program program ) -{ - return program.create_kernel("hello_world"); +hpx::opencl::kernel remotely_create_kernel(hpx::opencl::program program) { + return program.create_kernel("hello_world"); } -hpx::opencl::buffer -remotely_create_buffer ( hpx::opencl::device device ) -{ - return device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); +hpx::opencl::buffer remotely_create_buffer(hpx::opencl::device device) { + return device.create_buffer(CL_MEM_READ_WRITE, DATASIZE); } HPX_PLAIN_ACTION(remotely_create_program, create_program_action); HPX_PLAIN_ACTION(remotely_create_kernel, create_kernel_action); HPX_PLAIN_ACTION(remotely_create_buffer, create_buffer_action); - -static void remote_test( hpx::opencl::device cldevice ) -{ - - // get location id - auto locality = hpx::get_colocation_id(hpx::launch::sync, cldevice.get_id()); - - // remotely create a program - hpx::opencl::program program = - hpx::async(locality, cldevice).get(); - - // build program - program.build(); - - // remotely create a kernel - hpx::opencl::kernel kernel = - hpx::async(locality, program).get(); - - // remotely create buffers - hpx::opencl::buffer buffer_src = - hpx::async(locality, cldevice).get(); - hpx::opencl::buffer buffer_dst = - hpx::async(locality, cldevice).get(); - - // test if buffer initialization worked - { - size_t buffer_src_size = buffer_src.size().get(); - HPX_TEST_EQ(buffer_src_size, DATASIZE); - size_t buffer_dst_size = buffer_dst.size().get(); - HPX_TEST_EQ(buffer_dst_size, DATASIZE); - } - - // set kernel arguments - { - auto future1 = kernel.set_arg_async(0, buffer_src); - kernel.set_arg(1, buffer_dst); - future1.get(); - } - - // set work dimensions - hpx::opencl::work_size<1> size; - size[0].offset = 0; - size[0].size = DATASIZE; - - // test if kernel can get executed (blocking) - { - // Initialize src buffer - buffer_src.enqueue_write(0, initdata).get(); - - // Execute - kernel.enqueue(size).get(); - - // Check for correct result - auto result_future = buffer_dst.enqueue_read(0, DATASIZE); - COMPARE_RESULT(result_future.get(), refdata1); - } - - // test if kernel can get executed (non-blocking) - { - // Send result of blocking execution to src buffer - auto fut1 = buffer_dst.enqueue_send(buffer_src, 0, 0, DATASIZE); - - // Execute - auto fut2 = kernel.enqueue(size, fut1.dst_future); - - // Read data - auto result_future = buffer_dst.enqueue_read(0, DATASIZE, fut2); - COMPARE_RESULT(result_future.get(), refdata2); - } - +static void remote_test(hpx::opencl::device cldevice) { + // get location id + auto locality = hpx::get_colocation_id(hpx::launch::sync, cldevice.get_id()); + + // remotely create a program + hpx::opencl::program program = + hpx::async(locality, cldevice).get(); + + // build program + program.build(); + + // remotely create a kernel + hpx::opencl::kernel kernel = + hpx::async(locality, program).get(); + + // remotely create buffers + hpx::opencl::buffer buffer_src = + hpx::async(locality, cldevice).get(); + hpx::opencl::buffer buffer_dst = + hpx::async(locality, cldevice).get(); + + // test if buffer initialization worked + { + size_t buffer_src_size = buffer_src.size().get(); + HPX_TEST_EQ(buffer_src_size, DATASIZE); + size_t buffer_dst_size = buffer_dst.size().get(); + HPX_TEST_EQ(buffer_dst_size, DATASIZE); + } + + // set kernel arguments + { + auto future1 = kernel.set_arg_async(0, buffer_src); + kernel.set_arg(1, buffer_dst); + future1.get(); + } + + // set work dimensions + hpx::opencl::work_size<1> size; + size[0].offset = 0; + size[0].size = DATASIZE; + + // test if kernel can get executed (blocking) + { + // Initialize src buffer + buffer_src.enqueue_write(0, initdata).get(); + + // Execute + kernel.enqueue(size).get(); + + // Check for correct result + auto result_future = buffer_dst.enqueue_read(0, DATASIZE); + COMPARE_RESULT(result_future.get(), refdata1); + } + + // test if kernel can get executed (non-blocking) + { + // Send result of blocking execution to src buffer + auto fut1 = buffer_dst.enqueue_send(buffer_src, 0, 0, DATASIZE); + + // Execute + auto fut2 = kernel.enqueue(size, fut1.dst_future); + + // Read data + auto result_future = buffer_dst.enqueue_read(0, DATASIZE, fut2); + COMPARE_RESULT(result_future.get(), refdata2); + } } -static void cl_test( hpx::opencl::device local_device, - hpx::opencl::device cldevice ) -{ - - remote_test(cldevice); - remote_test(local_device); - +static void cl_test(hpx::opencl::device local_device, + hpx::opencl::device cldevice) { + remote_test(cldevice); + remote_test(local_device); } - -