Bug: CUDA global __device__ memory doesn't work

# Problem
Following code is valid, but the result is wrong:

``` C++
[cling]$ __device__ int deviceGlobal;
[cling]$ 
[cling]$ .rawInput 1
[cling]! __global__ void gKernel1(int * output){
[cling]! ?   *output = deviceGlobal;
[cling]! ?   }
[cling]! .rawInput 0
[cling]$ 
[cling]$ int * deviceGlobalPtr = nullptr;
[cling]$ cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ nullptr != deviceGlobalPtr
  (bool) true
[cling]$ 
[cling]$ int hostInput = 42, hostOutput = 1;
[cling]$ int * deviceOutput;
[cling]$ cudaMalloc( (void **) &deviceOutput, sizeof(int))
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ 
[cling]$ cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ gKernel1<<<1,1>>>(deviceOutput)
[cling]$ cudaGetLastError()
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ cudaDeviceSynchronize()
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ 
[cling]$ cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ // should be true
[cling]$ hostInput == hostOutput
  (bool) false
[cling]$ hostInput
  (int) 42
[cling]$ hostOutput
  (int) 0
[cling]$ .q
```

A second problem, which could be the source of this miss behavior is, that the function `cudaGetSymbolAddress(void **devPtr, const void *symbol)` isn't C/C++ conform. 

The usage of this function is:

``` C++
__device__ int deviceGlobal;

int main(){
  int * deviceGlobalPtr;
  cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal);
}
```
The header cuda/include/cuda_runtime_api.h (l. 119) describes, that some function calls have to be handled by the nvcc, because it is no valid C/C++.

# Test code for clang and nvcc 
``` C++
#include <iostream>

__device__ int deviceGlobal;

__global__ void gKernel1(int * output){
	*output = deviceGlobal;
} 

__global__ void getDeviceGlobal(int ** address){
	*address = &deviceGlobal;
}

void printError(cudaError_t error, std::string errMsg){
	if(error != cudaSuccess)
		std::cout << errMsg << cudaGetErrorString(error) << std::endl;
}

int main(int argc, char const *argv[])
{	
	int * deviceGlobalPtr = nullptr;
	printError(
		cudaGetSymbolAddress((void **)&deviceGlobalPtr, deviceGlobal), 
		"cudaGetSymbolAddress: ");	 
	
	if(deviceGlobalPtr == nullptr)
		std::cerr << "address of devicePtr is a nullptr" << std::endl;

	int ** deviceGlobalAddr;
	int * hostDeviceGlobalAddr = nullptr;
	printError(
		cudaMalloc( (void **) &deviceGlobalAddr, sizeof(int *)),
		"cudaMalloc deviceGlobalAddr: ");
	
	getDeviceGlobal<<<1,1>>>(deviceGlobalAddr);
	printError(
		cudaGetLastError(),
		"cuda kernel launch: ");
	printError(
		cudaDeviceSynchronize(),
		"cuda kernel error: ");

	printError(
		cudaMemcpy(&hostDeviceGlobalAddr, deviceGlobalAddr, sizeof(int *), cudaMemcpyDeviceToHost),
		"cudaMemcpy device to host: ");

	if(deviceGlobalPtr == hostDeviceGlobalAddr){
		std::cout << "deviceGlobalPtr are deviceGlobalAddr equals" << std::endl
		<< "deviceGlobalPtr: " << deviceGlobalPtr << std::endl
		<< "hostDeviceGlobalAddr: " << hostDeviceGlobalAddr << std::endl;
	}else{
		std::cerr << "error: deviceGlobalPtr are hostDeviceGlobalAddr different" << std::endl
		<< "deviceGlobalPtr: " << deviceGlobalPtr << std::endl
		<< "hostDeviceGlobalAddr: " << hostDeviceGlobalAddr << std::endl;
	}

	int hostInput = 42, hostOutput = 1;
	int * deviceOutput;
	printError(
		cudaMalloc( (void **) &deviceOutput, sizeof(int)),
		"cudaMalloc: ");

	printError(
		cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice),
		"cudaMemcpy host to global device: ");

	gKernel1<<<1,1>>>(deviceOutput);
	printError(
		cudaGetLastError(),
		"cuda kernel launch: ");
	printError(
		cudaDeviceSynchronize(),
		"cuda kernel error: ");
	
	printError(
		cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost),
		"cudaMemcpy device to host: ");

	if(hostInput == hostOutput){
		std::cout << "all is fine hostInput and hostOutput are equal: " << hostInput << std::endl;
	}else {
		std::cerr << "error: hostInput and hostOutput are different:" << std::endl <<
		hostInput << " != " << hostOutput << std::endl;
	}

	return 0;
}
```

# Test code for cling
``` C++
[cling]$ __device__ int deviceGlobal;
[cling]$ 
[cling]$ .rawInput 1
[cling]! __global__ void getDeviceGlobal(int ** address){
[cling]! ?   *address = &deviceGlobal;
[cling]! ?   }
[cling]! .rawInput 0
[cling]$ 
[cling]$ int * deviceGlobalPtr = nullptr;
[cling]$ cudaGetSymbolAddress((void **)&deviceGlobalPtr, deviceGlobal)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ nullptr != deviceGlobalPtr
  (bool) true
[cling]$ 
[cling]$ int ** deviceGlobalAddr;
[cling]$ int * hostDeviceGlobalAddr = nullptr;
[cling]$ cudaMalloc( (void **) &deviceGlobalAddr, sizeof(int *))
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ 
[cling]$ getDeviceGlobal<<<1,1>>>(deviceGlobalAddr);
[cling]$ cudaGetLastError()
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ cudaDeviceSynchronize()
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ 
[cling]$ cudaMemcpy(&hostDeviceGlobalAddr, deviceGlobalAddr, sizeof(int *), cudaMemcpyDeviceToHost)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ // should be true
[cling]$ deviceGlobalPtr == hostDeviceGlobalAddr
```

## Result of 3 runs

| variable | address of 1st run | address of 2nd run | address of 3rd run |
| ---------- | ----------------------- | ------------------------ | ----------------------- |
| deviceGlobalPtr | 0x7f089c400200 | 0x7f8bc0c00200 | 0x7f77f4200000 |
| hostDeviceGlobalAddr | 0x7f089c400000 | 0x7f8bc0c00000 | 0x7f77f4200300 |

# CUDA clang input
Source code, which will passed from cling to the clang nvptx

``` C++
// cling2.cu
int deviceGlobal __attribute__((device));
// cling3.cu
__global__ void gKernel1(int * output){
*output = deviceGlobal;
}
// cling4.cu
int *deviceGlobalPtr = nullptr;
// cling5.cu
void __cling_Un1Qu32(void* vpClingValue) {
 cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal)
;
}
// cling5.cu
#include "cling/Interpreter/RuntimePrintValue.h"
// cling6.cu
#include "cling/Interpreter/RuntimePrintValue.h"
// cling7.cu
int hostInput = 42;
int hostOutput = 1;
// cling8.cu
int *deviceOutput;
// cling9.cu
void __cling_Un1Qu37(void* vpClingValue) {
 cudaMalloc( (void **) &deviceOutput, sizeof(int))
;
}
// cling10.cu
void __cling_Un1Qu38(void* vpClingValue) {
 cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice)
;
}
// cling11.cu
void __cling_Un1Qu310(void* vpClingValue) {
 gKernel1<<<1,1>>>(deviceOutput)
;
}
// cling12.cu
void __cling_Un1Qu311(void* vpClingValue) {
 cudaGetLastError()
;
}
// cling13.cu
void __cling_Un1Qu312(void* vpClingValue) {
 cudaDeviceSynchronize()
;
}
// cling14.cu
void __cling_Un1Qu313(void* vpClingValue) {
 cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
;
}
```

# Thinks to check
- [ ] llvm ir of cling-clang
- [ ] llvm ir of clang
- [ ] Does deviceGlobalPtr and hostDeviceGlobalAddr both live in device variable memory space? (compare with host stack variable)
- [ ] llvm ir of simple cudaGetAddressSymbol *
- [ ] use cudaGetSymbolAdress with C-Compiler
- [ ] check clang cuda compiler headers (cudaGetSymbolAdress)

\*
``` C++
__device__ int deviceGlobal;

int main(){
  int * deviceGlobalPtr;
  cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal);
}
```


Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Bug: CUDA global device memory doesn't work #5

Problem

Test code for clang and nvcc

Test code for cling

Result of 3 runs

CUDA clang input

Thinks to check

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

variable	address of 1st run	address of 2nd run	address of 3rd run
deviceGlobalPtr	0x7f089c400200	0x7f8bc0c00200	0x7f77f4200000
hostDeviceGlobalAddr	0x7f089c400000	0x7f8bc0c00000	0x7f77f4200300

Bug: CUDA global __device__ memory doesn't work #5

Description

Problem

Test code for clang and nvcc

Test code for cling

Result of 3 runs

CUDA clang input

Thinks to check

Metadata

Metadata

Assignees

Labels

Projects

Milestone

Relationships

Development

Issue actions

Bug: CUDA global device memory doesn't work #5