Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions cuda/abs.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Element-wise absolute value: dst[i] = |a[i]| for 0 <= i < N.
// Indexing assumes 1D thread blocks; the grid may use both x and y.

extern "C" __global__ void
unary_abs(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    dst[idx] = fabsf(a[idx]);
}
16 changes: 16 additions & 0 deletions cuda/acos.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Element-wise arc cosine: dst[i] = acos(a[i]).
// Inputs that are not inside [-1, 1] (including NaN) produce 0.

extern "C" __global__ void
unary_acos(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    // The test is written in positive form so that NaN fails it and maps to 0.
    const bool in_domain = (v >= -1.0f && v <= 1.0f);
    dst[idx] = in_domain ? acosf(v) : 0.0f;
}
15 changes: 15 additions & 0 deletions cuda/acosh.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Element-wise inverse hyperbolic cosine: dst[i] = acosh(a[i]).
// Inputs that are not >= 1 (including NaN) produce 0.

extern "C" __global__ void
unary_acosh(float* __restrict__ dst, float* __restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N) {
        return;
    }

    const float v = a[idx];
    // acosh is only evaluated on its domain [1, inf).
    dst[idx] = (v >= 1.0f) ? acoshf(v) : 0.0f;
}
15 changes: 15 additions & 0 deletions cuda/asin.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Element-wise arc sine: dst[i] = asin(a[i]).
// Inputs that are not inside [-1, 1] (including NaN) produce 0.

extern "C" __global__ void
unary_asin(float* __restrict__ dst, float* __restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N) {
        return;
    }

    const float v = a[idx];
    // The test is written in positive form so that NaN fails it and maps to 0.
    const bool in_domain = (v >= -1.0f && v <= 1.0f);
    dst[idx] = in_domain ? asinf(v) : 0.0f;
}
10 changes: 10 additions & 0 deletions cuda/asinh.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Element-wise inverse hyperbolic sine: dst[i] = asinh(a[i]) for 0 <= i < N.

extern "C" __global__ void
unary_asinh(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    dst[idx] = asinhf(v);
}
10 changes: 10 additions & 0 deletions cuda/atan.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Element-wise arc tangent: dst[i] = atan(a[i]) for 0 <= i < N.

extern "C" __global__ void
unary_atan(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    dst[idx] = atanf(v);
}
10 changes: 10 additions & 0 deletions cuda/atan2.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Point-wise two-argument arc tangent: dst[i] = atan2(a[i], b[i]) for 0 <= i < N.
// a supplies the first (y) argument and b the second (x) argument of atan2f.

extern "C" __global__ void
pw_atan2(float *__restrict__ dst, float *__restrict__ a, float *__restrict__ b, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float y = a[idx];
    const float x = b[idx];
    dst[idx] = atan2f(y, x);
}
16 changes: 16 additions & 0 deletions cuda/atanh.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
// Element-wise inverse hyperbolic tangent: dst[i] = atanh(a[i]).
// Inputs that are not strictly inside (-1, 1) (including +-1 and NaN) produce 0.

extern "C" __global__ void
unary_atanh(float* __restrict__ dst, float* __restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N) {
        return;
    }

    const float v = a[idx];
    // Open interval: the endpoints +-1 are excluded and map to 0.
    const bool in_domain = (v > -1.0f && v < 1.0f);
    dst[idx] = in_domain ? atanhf(v) : 0.0f;
}
9 changes: 9 additions & 0 deletions cuda/cos.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Element-wise cosine: dst[i] = cos(a[i]) for 0 <= i < N.
extern "C" __global__ void
unary_cos(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    dst[idx] = cosf(v);
}
9 changes: 9 additions & 0 deletions cuda/cosh.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Element-wise hyperbolic cosine: dst[i] = cosh(a[i]) for 0 <= i < N.
extern "C" __global__ void
unary_cosh(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    dst[idx] = coshf(v);
}
9 changes: 9 additions & 0 deletions cuda/erf.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Element-wise error function: dst[i] = erf(a[i]) for 0 <= i < N.
extern "C" __global__ void
unary_erf(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    dst[idx] = erff(v);
}
9 changes: 9 additions & 0 deletions cuda/erfc.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Element-wise complementary error function: dst[i] = erfc(a[i]) for 0 <= i < N.
extern "C" __global__ void
unary_erfc(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    dst[idx] = erfcf(v);
}
9 changes: 9 additions & 0 deletions cuda/exp.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Element-wise natural exponential: dst[i] = exp(a[i]) for 0 <= i < N.
extern "C" __global__ void
unary_exp(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    dst[idx] = expf(v);
}
15 changes: 15 additions & 0 deletions cuda/gamma.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
// Element-wise gamma function: dst[i] = gamma(a[i]) for a[i] > 0.
// Every input that is not strictly positive (zero, negative values, NaN)
// produces 0; this sidesteps the poles at the non-positive integers.
extern "C" __global__ void
unary_gamma(float* __restrict__ dst, float* __restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N) {
        return;
    }

    const float v = a[idx];
    // tgammaf is only evaluated on the positive half-line.
    dst[idx] = (v > 0.0f) ? tgammaf(v) : 0.0f;
}
19 changes: 19 additions & 0 deletions cuda/heaviside.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
// Element-wise Heaviside step: dst[i] = heaviside(a[i]).
// Yields 1 for x > 0, 0 for x < 0 and 0.5 otherwise. "Otherwise" is x == 0,
// and also NaN, because both comparisons are false for NaN.
extern "C" __global__ void
unary_heaviside(float* __restrict__ dst, float* __restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N) {
        return;
    }

    const float v = a[idx];

    // Positive test first, then negative, with 0.5 as the fall-through value.
    dst[idx] = (v > 0.0f) ? 1.0f
             : (v < 0.0f) ? 0.0f
             : 0.5f;
}
13 changes: 13 additions & 0 deletions cuda/log.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
// Element-wise natural logarithm: dst[i] = log(a[i]) for a[i] > 0.
// Every input that is not strictly positive (zero, negative values, NaN) produces 0.
extern "C" __global__ void
unary_log(float* __restrict__ dst, float* __restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N) {
        return;
    }

    const float v = a[idx];
    // logf is only evaluated on the positive half-line.
    dst[idx] = (v > 0.0f) ? logf(v) : 0.0f;
}
10 changes: 10 additions & 0 deletions cuda/mod.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Point-wise floating-point remainder: dst[i] = fmod(a[i], b[i]) for 0 <= i < N.
// The result carries the sign of a[i] (fmodf semantics). A zero divisor is not
// guarded here, so the result in that case is whatever fmodf returns (NaN).
extern "C" __global__ void
pw_mod(float *__restrict__ dst, float *__restrict__ a, float *__restrict__ b, int N)
{
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N)
    {
        return;
    }

    const float dividend = a[idx];
    const float divisor = b[idx];
    dst[idx] = fmodf(dividend, divisor);
}
28 changes: 28 additions & 0 deletions cuda/pow.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Point-wise power: dst[i] = pow(a[i], b[i]) for 0 <= i < N, with these conventions:
//   - 0^y with y < 0 returns 0 (instead of infinity).
//   - An exponent within 1e-6 of an integer is treated as exactly that integer.
//   - Negative base with an (effectively) integer exponent: ordinary pow(a, round(b)).
//   - Negative base with a fractional exponent: the sign is moved outside,
//     i.e. the result is -pow(-a, b) rather than NaN.
//   - 0^0 returns 1 (powf semantics).
extern "C" __global__ void
pw_pow(float* __restrict__ dst, float* __restrict__ a, float* __restrict__ b, int N) {
    int i = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    if (i < N) {
        float x = a[i];
        float y = b[i];

        // Guard: 0^negative → return 0
        if (x == 0.0f && y < 0.0f) {
            dst[i] = 0.0f;
            return;
        }

        // Distance to the NEAREST integer, so exponents slightly below an
        // integer (e.g. 1.9999999) are recognised as well as those slightly above.
        float y_round = rintf(y);
        bool y_is_int = fabsf(y - y_round) < 1e-6f;

        if (x < 0.0f) {
            if (y_is_int) {
                // Pass the rounded exponent: powf(negative, non-integer) is NaN,
                // even when the exponent is off by a single ulp.
                dst[i] = powf(x, y_round);
            } else {
                // Fractional exponent → move minus sign outside
                dst[i] = -1.0f * powf(fabsf(x), y);
            }
        } else {
            // Non-negative (or NaN) base: plain powf with the exponent as given.
            dst[i] = powf(x, y);
        }
    }
}
10 changes: 10 additions & 0 deletions cuda/sin.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Element-wise sine: dst[i] = sin(a[i]) for 0 <= i < N.
extern "C" __global__ void
unary_sin(float *__restrict__ dst, float *__restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N)
    {
        return;
    }

    const float v = a[idx];
    dst[idx] = sinf(v);
}
11 changes: 11 additions & 0 deletions cuda/sinc.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
// Element-wise unnormalized sinc: dst[i] = sin(a[i]) / a[i], with sinc(0) = 1.
extern "C" __global__ void
unary_sinc(float *__restrict__ dst, float *__restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N)
    {
        return;
    }

    const float v = a[idx];
    if (v == 0.0f)
    {
        // Limit value at the origin; avoids the 0/0 division.
        dst[idx] = 1.0f;
    }
    else
    {
        dst[idx] = sinf(v) / v;
    }
}
10 changes: 10 additions & 0 deletions cuda/sinh.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
// Element-wise hyperbolic sine: dst[i] = sinh(a[i]) for 0 <= i < N.
extern "C" __global__ void
unary_sinh(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N)
    {
        return;
    }

    const float v = a[idx];
    dst[idx] = sinhf(v);
}
9 changes: 9 additions & 0 deletions cuda/tan.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Element-wise tangent: dst[i] = tan(a[i]) for 0 <= i < N.
// The value returned by tanf is stored as-is; there is no special handling
// of inputs near the poles at π/2 + nπ.
extern "C" __global__ void
unary_tan(float* __restrict__ dst, float* __restrict__ a, int N) {
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N) {
        return;
    }

    const float v = a[idx];
    dst[idx] = tanf(v);
}
9 changes: 9 additions & 0 deletions cuda/tanh.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
// Element-wise hyperbolic tangent: dst[i] = tanh(a[i]) for 0 <= i < N.
extern "C" __global__ void
unary_tanh(float *__restrict__ dst, float *__restrict__ a, int N){
    // Flatten the grid of blocks into a single linear element index.
    const int idx = (blockIdx.y * gridDim.x + blockIdx.x) * blockDim.x + threadIdx.x;

    // Surplus threads in the last block have no element to process.
    if (idx >= N){
        return;
    }

    const float v = a[idx];
    dst[idx] = tanhf(v);
}
Loading