Problem
Following code is valid, but the result is wrong:
[cling]$ __device__ int deviceGlobal;
[cling]$
[cling]$ .rawInput 1
[cling]! __global__ void gKernel1(int * output){
[cling]! ? *output = deviceGlobal;
[cling]! ? }
[cling]! .rawInput 0
[cling]$
[cling]$ int * deviceGlobalPtr = nullptr;
[cling]$ cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal)
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ nullptr != deviceGlobalPtr
(bool) true
[cling]$
[cling]$ int hostInput = 42, hostOutput = 1;
[cling]$ int * deviceOutput;
[cling]$ cudaMalloc( (void **) &deviceOutput, sizeof(int))
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$
[cling]$ cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice)
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ gKernel1<<<1,1>>>(deviceOutput)
[cling]$ cudaGetLastError()
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ cudaDeviceSynchronize()
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$
[cling]$ cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ // should be true
[cling]$ hostInput == hostOutput
(bool) false
[cling]$ hostInput
(int) 42
[cling]$ hostOutput
(int) 0
[cling]$ .q
A second problem, which could be the source of this miss behavior is, that the function cudaGetSymbolAddress(void **devPtr, const void *symbol) isn't C/C++ conform.
The usage of this function is:
__device__ int deviceGlobal;
int main(){
int * deviceGlobalPtr;
cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal);
}
The header cuda/include/cuda_runtime_api.h (l. 119) describes, that some function calls have to be handled by the nvcc, because it is no valid C/C++.
Test code for clang and nvcc
#include <iostream>
__device__ int deviceGlobal;
__global__ void gKernel1(int * output){
*output = deviceGlobal;
}
__global__ void getDeviceGlobal(int ** address){
*address = &deviceGlobal;
}
void printError(cudaError_t error, std::string errMsg){
if(error != cudaSuccess)
std::cout << errMsg << cudaGetErrorString(error) << std::endl;
}
int main(int argc, char const *argv[])
{
int * deviceGlobalPtr = nullptr;
printError(
cudaGetSymbolAddress((void **)&deviceGlobalPtr, deviceGlobal),
"cudaGetSymbolAddress: ");
if(deviceGlobalPtr == nullptr)
std::cerr << "address of devicePtr is a nullptr" << std::endl;
int ** deviceGlobalAddr;
int * hostDeviceGlobalAddr = nullptr;
printError(
cudaMalloc( (void **) &deviceGlobalAddr, sizeof(int *)),
"cudaMalloc deviceGlobalAddr: ");
getDeviceGlobal<<<1,1>>>(deviceGlobalAddr);
printError(
cudaGetLastError(),
"cuda kernel launch: ");
printError(
cudaDeviceSynchronize(),
"cuda kernel error: ");
printError(
cudaMemcpy(&hostDeviceGlobalAddr, deviceGlobalAddr, sizeof(int *), cudaMemcpyDeviceToHost),
"cudaMemcpy device to host: ");
if(deviceGlobalPtr == hostDeviceGlobalAddr){
std::cout << "deviceGlobalPtr are deviceGlobalAddr equals" << std::endl
<< "deviceGlobalPtr: " << deviceGlobalPtr << std::endl
<< "hostDeviceGlobalAddr: " << hostDeviceGlobalAddr << std::endl;
}else{
std::cerr << "error: deviceGlobalPtr are hostDeviceGlobalAddr different" << std::endl
<< "deviceGlobalPtr: " << deviceGlobalPtr << std::endl
<< "hostDeviceGlobalAddr: " << hostDeviceGlobalAddr << std::endl;
}
int hostInput = 42, hostOutput = 1;
int * deviceOutput;
printError(
cudaMalloc( (void **) &deviceOutput, sizeof(int)),
"cudaMalloc: ");
printError(
cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice),
"cudaMemcpy host to global device: ");
gKernel1<<<1,1>>>(deviceOutput);
printError(
cudaGetLastError(),
"cuda kernel launch: ");
printError(
cudaDeviceSynchronize(),
"cuda kernel error: ");
printError(
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost),
"cudaMemcpy device to host: ");
if(hostInput == hostOutput){
std::cout << "all is fine hostInput and hostOutput are equal: " << hostInput << std::endl;
}else {
std::cerr << "error: hostInput and hostOutput are different:" << std::endl <<
hostInput << " != " << hostOutput << std::endl;
}
return 0;
}
Test code for cling
[cling]$ __device__ int deviceGlobal;
[cling]$
[cling]$ .rawInput 1
[cling]! __global__ void getDeviceGlobal(int ** address){
[cling]! ? *address = &deviceGlobal;
[cling]! ? }
[cling]! .rawInput 0
[cling]$
[cling]$ int * deviceGlobalPtr = nullptr;
[cling]$ cudaGetSymbolAddress((void **)&deviceGlobalPtr, deviceGlobal)
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ nullptr != deviceGlobalPtr
(bool) true
[cling]$
[cling]$ int ** deviceGlobalAddr;
[cling]$ int * hostDeviceGlobalAddr = nullptr;
[cling]$ cudaMalloc( (void **) &deviceGlobalAddr, sizeof(int *))
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$
[cling]$ getDeviceGlobal<<<1,1>>>(deviceGlobalAddr);
[cling]$ cudaGetLastError()
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ cudaDeviceSynchronize()
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$
[cling]$ cudaMemcpy(&hostDeviceGlobalAddr, deviceGlobalAddr, sizeof(int *), cudaMemcpyDeviceToHost)
(cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ // should be true
[cling]$ deviceGlobalPtr == hostDeviceGlobalAddr
Result of 3 runs
| variable |
address of 1st run |
address of 2nd run |
address of 3rd run |
| deviceGlobalPtr |
0x7f089c400200 |
0x7f8bc0c00200 |
0x7f77f4200000 |
| hostDeviceGlobalAddr |
0x7f089c400000 |
0x7f8bc0c00000 |
0x7f77f4200300 |
CUDA clang input
Source code, which will passed from cling to the clang nvptx
// cling2.cu
int deviceGlobal __attribute__((device));
// cling3.cu
__global__ void gKernel1(int * output){
*output = deviceGlobal;
}
// cling4.cu
int *deviceGlobalPtr = nullptr;
// cling5.cu
void __cling_Un1Qu32(void* vpClingValue) {
cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal)
;
}
// cling5.cu
#include "cling/Interpreter/RuntimePrintValue.h"
// cling6.cu
#include "cling/Interpreter/RuntimePrintValue.h"
// cling7.cu
int hostInput = 42;
int hostOutput = 1;
// cling8.cu
int *deviceOutput;
// cling9.cu
void __cling_Un1Qu37(void* vpClingValue) {
cudaMalloc( (void **) &deviceOutput, sizeof(int))
;
}
// cling10.cu
void __cling_Un1Qu38(void* vpClingValue) {
cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice)
;
}
// cling11.cu
void __cling_Un1Qu310(void* vpClingValue) {
gKernel1<<<1,1>>>(deviceOutput)
;
}
// cling12.cu
void __cling_Un1Qu311(void* vpClingValue) {
cudaGetLastError()
;
}
// cling13.cu
void __cling_Un1Qu312(void* vpClingValue) {
cudaDeviceSynchronize()
;
}
// cling14.cu
void __cling_Un1Qu313(void* vpClingValue) {
cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
;
}
Thinks to check
*
__device__ int deviceGlobal;
int main(){
int * deviceGlobalPtr;
cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal);
}
Problem
Following code is valid, but the result is wrong:
A second problem, which could be the source of this miss behavior is, that the function
cudaGetSymbolAddress(void **devPtr, const void *symbol)isn't C/C++ conform.The usage of this function is:
The header cuda/include/cuda_runtime_api.h (l. 119) describes, that some function calls have to be handled by the nvcc, because it is no valid C/C++.
Test code for clang and nvcc
Test code for cling
Result of 3 runs
CUDA clang input
Source code, which will passed from cling to the clang nvptx
Thinks to check
*