Skip to content

Bug: CUDA global __device__ memory doesn't work #5

@SimeonEhrig

Description

@SimeonEhrig

Problem

Following code is valid, but the result is wrong:

[cling]$ __device__ int deviceGlobal;
[cling]$ 
[cling]$ .rawInput 1
[cling]! __global__ void gKernel1(int * output){
[cling]! ?   *output = deviceGlobal;
[cling]! ?   }
[cling]! .rawInput 0
[cling]$ 
[cling]$ int * deviceGlobalPtr = nullptr;
[cling]$ cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ nullptr != deviceGlobalPtr
  (bool) true
[cling]$ 
[cling]$ int hostInput = 42, hostOutput = 1;
[cling]$ int * deviceOutput;
[cling]$ cudaMalloc( (void **) &deviceOutput, sizeof(int))
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ 
[cling]$ cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ gKernel1<<<1,1>>>(deviceOutput)
[cling]$ cudaGetLastError()
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ cudaDeviceSynchronize()
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ 
[cling]$ cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ // should be true
[cling]$ hostInput == hostOutput
  (bool) false
[cling]$ hostInput
  (int) 42
[cling]$ hostOutput
  (int) 0
[cling]$ .q

A second problem, which could be the source of this miss behavior is, that the function cudaGetSymbolAddress(void **devPtr, const void *symbol) isn't C/C++ conform.

The usage of this function is:

__device__ int deviceGlobal;

int main(){
  int * deviceGlobalPtr;
  cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal);
}

The header cuda/include/cuda_runtime_api.h (l. 119) describes, that some function calls have to be handled by the nvcc, because it is no valid C/C++.

Test code for clang and nvcc

#include <iostream>

__device__ int deviceGlobal;

__global__ void gKernel1(int * output){
	*output = deviceGlobal;
} 

__global__ void getDeviceGlobal(int ** address){
	*address = &deviceGlobal;
}

void printError(cudaError_t error, std::string errMsg){
	if(error != cudaSuccess)
		std::cout << errMsg << cudaGetErrorString(error) << std::endl;
}

int main(int argc, char const *argv[])
{	
	int * deviceGlobalPtr = nullptr;
	printError(
		cudaGetSymbolAddress((void **)&deviceGlobalPtr, deviceGlobal), 
		"cudaGetSymbolAddress: ");	 
	
	if(deviceGlobalPtr == nullptr)
		std::cerr << "address of devicePtr is a nullptr" << std::endl;

	int ** deviceGlobalAddr;
	int * hostDeviceGlobalAddr = nullptr;
	printError(
		cudaMalloc( (void **) &deviceGlobalAddr, sizeof(int *)),
		"cudaMalloc deviceGlobalAddr: ");
	
	getDeviceGlobal<<<1,1>>>(deviceGlobalAddr);
	printError(
		cudaGetLastError(),
		"cuda kernel launch: ");
	printError(
		cudaDeviceSynchronize(),
		"cuda kernel error: ");

	printError(
		cudaMemcpy(&hostDeviceGlobalAddr, deviceGlobalAddr, sizeof(int *), cudaMemcpyDeviceToHost),
		"cudaMemcpy device to host: ");

	if(deviceGlobalPtr == hostDeviceGlobalAddr){
		std::cout << "deviceGlobalPtr are deviceGlobalAddr equals" << std::endl
		<< "deviceGlobalPtr: " << deviceGlobalPtr << std::endl
		<< "hostDeviceGlobalAddr: " << hostDeviceGlobalAddr << std::endl;
	}else{
		std::cerr << "error: deviceGlobalPtr are hostDeviceGlobalAddr different" << std::endl
		<< "deviceGlobalPtr: " << deviceGlobalPtr << std::endl
		<< "hostDeviceGlobalAddr: " << hostDeviceGlobalAddr << std::endl;
	}

	int hostInput = 42, hostOutput = 1;
	int * deviceOutput;
	printError(
		cudaMalloc( (void **) &deviceOutput, sizeof(int)),
		"cudaMalloc: ");

	printError(
		cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice),
		"cudaMemcpy host to global device: ");

	gKernel1<<<1,1>>>(deviceOutput);
	printError(
		cudaGetLastError(),
		"cuda kernel launch: ");
	printError(
		cudaDeviceSynchronize(),
		"cuda kernel error: ");
	
	printError(
		cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost),
		"cudaMemcpy device to host: ");

	if(hostInput == hostOutput){
		std::cout << "all is fine hostInput and hostOutput are equal: " << hostInput << std::endl;
	}else {
		std::cerr << "error: hostInput and hostOutput are different:" << std::endl <<
		hostInput << " != " << hostOutput << std::endl;
	}

	return 0;
}

Test code for cling

[cling]$ __device__ int deviceGlobal;
[cling]$ 
[cling]$ .rawInput 1
[cling]! __global__ void getDeviceGlobal(int ** address){
[cling]! ?   *address = &deviceGlobal;
[cling]! ?   }
[cling]! .rawInput 0
[cling]$ 
[cling]$ int * deviceGlobalPtr = nullptr;
[cling]$ cudaGetSymbolAddress((void **)&deviceGlobalPtr, deviceGlobal)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ nullptr != deviceGlobalPtr
  (bool) true
[cling]$ 
[cling]$ int ** deviceGlobalAddr;
[cling]$ int * hostDeviceGlobalAddr = nullptr;
[cling]$ cudaMalloc( (void **) &deviceGlobalAddr, sizeof(int *))
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ 
[cling]$ getDeviceGlobal<<<1,1>>>(deviceGlobalAddr);
[cling]$ cudaGetLastError()
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ cudaDeviceSynchronize()
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ 
[cling]$ cudaMemcpy(&hostDeviceGlobalAddr, deviceGlobalAddr, sizeof(int *), cudaMemcpyDeviceToHost)
  (cudaError_t) (cudaError::cudaSuccess) : (unsigned int) 0
[cling]$ // should be true
[cling]$ deviceGlobalPtr == hostDeviceGlobalAddr

Result of 3 runs

variable address of 1st run address of 2nd run address of 3rd run
deviceGlobalPtr 0x7f089c400200 0x7f8bc0c00200 0x7f77f4200000
hostDeviceGlobalAddr 0x7f089c400000 0x7f8bc0c00000 0x7f77f4200300

CUDA clang input

Source code, which will passed from cling to the clang nvptx

// cling2.cu
int deviceGlobal __attribute__((device));
// cling3.cu
__global__ void gKernel1(int * output){
*output = deviceGlobal;
}
// cling4.cu
int *deviceGlobalPtr = nullptr;
// cling5.cu
void __cling_Un1Qu32(void* vpClingValue) {
 cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal)
;
}
// cling5.cu
#include "cling/Interpreter/RuntimePrintValue.h"
// cling6.cu
#include "cling/Interpreter/RuntimePrintValue.h"
// cling7.cu
int hostInput = 42;
int hostOutput = 1;
// cling8.cu
int *deviceOutput;
// cling9.cu
void __cling_Un1Qu37(void* vpClingValue) {
 cudaMalloc( (void **) &deviceOutput, sizeof(int))
;
}
// cling10.cu
void __cling_Un1Qu38(void* vpClingValue) {
 cudaMemcpy(deviceGlobalPtr, &hostInput, sizeof(int), cudaMemcpyHostToDevice)
;
}
// cling11.cu
void __cling_Un1Qu310(void* vpClingValue) {
 gKernel1<<<1,1>>>(deviceOutput)
;
}
// cling12.cu
void __cling_Un1Qu311(void* vpClingValue) {
 cudaGetLastError()
;
}
// cling13.cu
void __cling_Un1Qu312(void* vpClingValue) {
 cudaDeviceSynchronize()
;
}
// cling14.cu
void __cling_Un1Qu313(void* vpClingValue) {
 cudaMemcpy(&hostOutput, deviceOutput, sizeof(int), cudaMemcpyDeviceToHost)
;
}

Thinks to check

  • llvm ir of cling-clang
  • llvm ir of clang
  • Does deviceGlobalPtr and hostDeviceGlobalAddr both live in device variable memory space? (compare with host stack variable)
  • llvm ir of simple cudaGetAddressSymbol *
  • use cudaGetSymbolAdress with C-Compiler
  • check clang cuda compiler headers (cudaGetSymbolAdress)

*

__device__ int deviceGlobal;

int main(){
  int * deviceGlobalPtr;
  cudaGetSymbolAddress((void**)&deviceGlobalPtr, deviceGlobal);
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    bugSomething isn't working

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions