Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
278 changes: 269 additions & 9 deletions README.md

Large diffs are not rendered by default.

Binary file added img/Stream_compaction.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/UpSweep.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/impact_by_array_size.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added img/impact_by_block_size.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
51 changes: 50 additions & 1 deletion src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
#include <stream_compaction/thrust.h>
#include "testing_helpers.hpp"

// Number of elements in the power-of-two test arrays (2^24); feel free to
// change. The previous default was 1 << 8. Only one definition of SIZE may
// exist at file scope, so the old one is recorded here rather than kept.
const int SIZE = 1 << 24;
const int NPOT = SIZE - 3; // Non-Power-Of-Two length used by the NPOT tests
int *a = new int[SIZE];    // scratch array of SIZE ints, heap-allocated at startup
int *b = new int[SIZE];    // second scratch array of SIZE ints
Expand Down Expand Up @@ -147,6 +149,53 @@ int main(int argc, char* argv[]) {
//printArray(count, c, true);
printCmpLenResult(count, expectedNPOT, b, c);

//////////////////////////////////////////////////////////////

// ### Extra Credit Feature Tests ###

printf("\n");
printf("*****************************\n");
printf("** EXTRA CREDIT TESTS **\n");
printf("*****************************\n");

// Radix sort test
srand(time(nullptr)); // Seed random number generator
for (int i = 0; i < SIZE; ++i) {
a[i] = rand() % 100 - 50; // random int in [-50, 49], so negatives are included
}
printArray(SIZE, a, true); // Print input array (abridged) for reference
for (int i = 0; i < SIZE; ++i) {
b[i] = a[i]; // Copy input to b for CPU sorting
}
std::sort(b, b + SIZE); // CPU sort (std::sort) for reference output
zeroArray(SIZE, c); // Zero out output array for GPU
printDesc("radix sort"); // ==== radix sort ====
StreamCompaction::Radix::sort(SIZE, c, a); // GPU radix sort on input array
printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(),
"(CUDA Measured)"); // GPU execution time (ms)
//printArray(SIZE, c, true); // (Optional) Debug: print sorted output array
printCmpResult(SIZE, b, c); // Compare GPU result (c) vs CPU result (b)

// Shared scan test
genArray(SIZE, a, 50); // Generate new random array of length SIZE
printArray(SIZE, a, true); // Print input array (abridged)
zeroArray(SIZE, b);
StreamCompaction::CPU::scan(SIZE, b, a); // CPU exclusive scan on input array
printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(),
"(std::chrono Measured)"); // CPU scan time (ms)
//printArray(SIZE, b, true); // (Optional) Debug: print CPU scan output
zeroArray(SIZE, c);
printDesc("shared scan"); // ==== shared scan ====
StreamCompaction::Shared::scan(SIZE, c, a); // GPU shared-memory exclusive scan
printElapsedTime(StreamCompaction::Shared::timer().getGpuElapsedTimeForPreviousOperation(),
"(CUDA Measured)"); // GPU execution time (ms)
//printArray(SIZE, c, true); // (Optional) Debug: print GPU scan output
printCmpResult(SIZE, b, c); // Compare GPU result (c) vs CPU result (b)

// (End of extra credit tests)

//////////////////////////////////////////////////////////

system("pause"); // stop Win32 console from closing on exit
delete[] a;
delete[] b;
Expand Down
2 changes: 1 addition & 1 deletion stream_compaction/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set(headers
"naive.h"
"efficient.h"
"thrust.h"
)
)

set(sources
"common.cu"
Expand Down
13 changes: 13 additions & 0 deletions stream_compaction/common.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,11 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
    // One thread per element: flatten (block, thread) into a linear index.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads whose index falls at or beyond n have no element to process.
    if (i < n) {
        // Zero input maps to 0; any non-zero input maps to 1.
        bools[i] = (idata[i] == 0) ? 0 : 1;
    }
}

/**
Expand All @@ -33,6 +38,14 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
        const int *idata, const int *bools, const int *indices) {
    // Linear element index handled by this thread.
    const int i = blockIdx.x * blockDim.x + threadIdx.x;

    // Nothing to write for out-of-range threads or for elements whose flag is
    // not exactly 1. The range check comes first so bools[i] is never read
    // out of bounds.
    if (i >= n || bools[i] != 1) {
        return;
    }

    // Flagged element: copy it to the output slot given by indices[i].
    odata[indices[i]] = idata[i];
}

}
Expand Down
76 changes: 73 additions & 3 deletions stream_compaction/cpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,23 @@ namespace StreamCompaction {
void scan(int n, int *odata, const int *idata) {
    // The timer is started once and stopped once, after the scan has run,
    // so the measured interval covers the actual work.
    timer().startCpuTimer();

    // Exclusive prefix sum: odata[i] = idata[0] + ... + idata[i - 1], with
    // odata[0] = 0. For n <= 0 the loop does not execute and nothing is written.
    int sum = 0;   // sum of idata[0 .. i-1]
    int prev = 0;  // idata[i - 1], captured before odata[i - 1] was written
    for (int i = 0; i < n; ++i) {
        sum += prev;
        // Read the input before writing the output so that the result is
        // still correct when odata and idata refer to the same buffer.
        prev = idata[i];
        odata[i] = sum;
    }

    timer().endCpuTimer();
}

/**
Expand All @@ -31,10 +47,26 @@ namespace StreamCompaction {
int compactWithoutScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    // Copy every non-zero input element to the front of odata, preserving
    // order. `count` is both the next free output slot and the running number
    // of elements kept.
    int count = 0;
    for (int i = 0; i < n; ++i) {
        if (idata[i] != 0) {
            odata[count] = idata[i];
            ++count;
        }
    }

    timer().endCpuTimer();

    // Return the number of elements written to odata. A placeholder
    // `return -1;` must not precede this, or the count is never reported.
    return count;
}



/**
* CPU stream compaction using scan and scatter, like the parallel version.
*
Expand All @@ -43,8 +75,46 @@ namespace StreamCompaction {
int compactWithScan(int n, int *odata, const int *idata) {
    timer().startCpuTimer();

    // Empty input: nothing to compact. Stop the timer and report zero elements.
    if (n <= 0) {
        timer().endCpuTimer();
        return 0;
    }

    // Step 1 - map: bools[i] is 1 where idata[i] is non-zero, otherwise 0.
    int *bools = new int[n];
    for (int i = 0; i < n; ++i) {
        bools[i] = (idata[i] != 0) ? 1 : 0;
    }

    // Step 2 - exclusive scan of bools: indices[i] is the number of kept
    // elements before position i, i.e. the output slot for element i if kept.
    int *indices = new int[n];
    indices[0] = 0;
    for (int i = 1; i < n; ++i) {
        indices[i] = indices[i - 1] + bools[i - 1];
    }

    // An exclusive scan omits the last flag, so add it back for the total.
    const int count = indices[n - 1] + bools[n - 1];

    // Step 3 - scatter: write each kept element to its scanned index.
    for (int i = 0; i < n; ++i) {
        if (bools[i] == 1) {
            odata[indices[i]] = idata[i];
        }
    }

    // Release the temporary buffers before stopping the timer, as before.
    delete[] bools;
    delete[] indices;

    timer().endCpuTimer();

    // Return the number of elements written to odata. A placeholder
    // `return -1;` must not precede this, or the count is never reported.
    return count;
}
}
}
Loading