From 2909a4430c2e9fb804ca86053777ba6b9166bf2e Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Tue, 9 Sep 2025 21:17:51 -0400
Subject: [PATCH 01/13] Part 1 Complete

---
 stream_compaction/cpu.cu | 62 ++++++++++++++++++++++++++++++++++++++--
 1 file changed, 60 insertions(+), 2 deletions(-)

diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa115..7db26fed 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -20,6 +20,12 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+            //Note, exclusive scan
+            odata[0] = 0;
+            for (int i = 1; i < n; i++)
+            {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -31,8 +37,22 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+
+            //Points to the next spot for inserting into odata
+            int oDataIndex = 0;
+
+            for (int i = 0; i < n; i++)
+            {
+                if (idata[i] != 0)
+                {
+                    odata[oDataIndex] = idata[i];
+                    oDataIndex++;
+                }
+            }
+
+
             timer().endCpuTimer();
-            return -1;
+            return oDataIndex;
         }
 
         /**
@@ -43,8 +63,46 @@ namespace StreamCompaction {
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
             // TODO
+
+            //Step 1: create bool array
+            int* boolArray = new int[n] {-1};
+            for (int i = 0; i < n; i++)
+            {
+                if (idata[i] == 0)
+                {
+                    boolArray[i] = 0;
+                }
+                else
+                {
+                    boolArray[i] = 1;
+                }
+            }
+            //Step 2: Scan bool array
+            //Note: not using scan since it's being timed with the same timer.
+            int* scanArray = new int[n];
+            scanArray[0] = 0;
+            for (int i = 1; i < n; i++)
+            {
+                scanArray[i] = scanArray[i - 1] + boolArray[i - 1];
+            }
+
+            //Step 3: Scatter
+            int count = 0;
+            for (int i = 0; i < n; i++)
+            {
+                if (boolArray[i] == 1)
+                {
+                    count++;
+                    odata[scanArray[i]] = idata[i];
+                }
+            }
+
+
+           
+            delete[] boolArray;
+            delete[] scanArray;
             timer().endCpuTimer();
-            return -1;
+            return count;
         }
     }
 }

From 17824ff1c5b2a095cef50f593a87422d26e44c1e Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Tue, 9 Sep 2025 22:13:12 -0400
Subject: [PATCH 02/13] part 2 started

---
 stream_compaction/naive.cu | 44 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 43088769..3a5d442f 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,14 +11,58 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+        //Starting Input
+        int* dev_arrA;
+        //Starting Output
+        int* dev_arrB;
+
+
         // TODO: __global__
 
+        __global__ void naiveScan(int n, int* odata, const int* idata, int stride)
+        {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index >= pow(2, stride - 1))
+            {
+                odata[index] = idata[index] + idata[index - (int)pow(2, stride - 1)];
+            }
+            else
+            {
+                odata[index] = idata[index];
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
             // TODO
+            cudaMalloc((void**)dev_arrA, sizeof(int) * n);
+            cudaMalloc((void**)dev_arrB, sizeof(int) * n);
+
+            cudaMemcpy(dev_arrA, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            int threadsPerBlock = 128;
+            dim3 totalBlocks ((n + threadsPerBlock - 1) / threadsPerBlock);
+
+            int log2Ceil = ilog2ceil(n);
+            for (int i = 0; i < log2Ceil; i++)
+            {
+                naiveScan << <totalBlocks, threadsPerBlock >> > (n, dev_arrA, dev_arrB, i);
+                std::swap(dev_arrA, dev_arrB);
+            }
+            
+            if (log2Ceil % 2 == 1)
+            {
+                std::swap(dev_arrA, dev_arrB);
+            }
+
+            cudaMemcpy(odata, dev_arrB, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_arrA);
+            cudaFree(dev_arrB);
+
             timer().endGpuTimer();
         }
     }

From b36934196425565d6535a68d00a5026daf1cc0a0 Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Wed, 10 Sep 2025 08:09:02 -0400
Subject: [PATCH 03/13] Part 2 Complete

---
 stream_compaction/naive.cu | 50 ++++++++++++++++++++++++++------------
 1 file changed, 35 insertions(+), 15 deletions(-)

diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 3a5d442f..bf242a36 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,10 +11,7 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        //Starting Input
-        int* dev_arrA;
-        //Starting Output
-        int* dev_arrB;
+        
 
 
         // TODO: __global__
@@ -22,26 +19,49 @@ namespace StreamCompaction {
         __global__ void naiveScan(int n, int* odata, const int* idata, int stride)
         {
             int index = threadIdx.x + blockDim.x * blockIdx.x;
-            if (index >= pow(2, stride - 1))
+            if (index < n)
             {
-                odata[index] = idata[index] + idata[index - (int)pow(2, stride - 1)];
+                if (index >= pow(2, stride))
+                {
+                    odata[index] = idata[index] + idata[index - (int)pow(2, stride)];
+                }
+                else
+                {
+                    odata[index] = idata[index];
+                }
             }
-            else
+        }
+
+        __global__ void inclusiveToExclusive(int n, int* odata, const int* idata)
+        {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index == 0)
             {
-                odata[index] = idata[index];
+                odata[0] = 0;
+            }
+            else if (index < n)
+            {
+                odata[index] = idata[index - 1];
             }
         }
 
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
             // TODO
-            cudaMalloc((void**)dev_arrA, sizeof(int) * n);
-            cudaMalloc((void**)dev_arrB, sizeof(int) * n);
+            //Starting Input
+            int *dev_arrA;
+            //Starting Output
+            int *dev_arrB;
+
+            cudaMalloc((void**)&dev_arrA, sizeof(int) * n);
+            cudaMalloc((void**)&dev_arrB, sizeof(int) * n);
 
             cudaMemcpy(dev_arrA, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            
 
             int threadsPerBlock = 128;
             dim3 totalBlocks ((n + threadsPerBlock - 1) / threadsPerBlock);
@@ -49,16 +69,16 @@ namespace StreamCompaction {
             int log2Ceil = ilog2ceil(n);
             for (int i = 0; i < log2Ceil; i++)
             {
-                naiveScan << <totalBlocks, threadsPerBlock >> > (n, dev_arrA, dev_arrB, i);
+                naiveScan << <totalBlocks, threadsPerBlock >> > (n, dev_arrB, dev_arrA, i);
                 std::swap(dev_arrA, dev_arrB);
             }
             
-            if (log2Ceil % 2 == 1)
-            {
-                std::swap(dev_arrA, dev_arrB);
-            }
+            inclusiveToExclusive <<<totalBlocks, threadsPerBlock >>> (n, dev_arrB, dev_arrA);
+            
 
             cudaMemcpy(odata, dev_arrB, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            
+
 
             cudaFree(dev_arrA);
             cudaFree(dev_arrB);

From 88f881db96e78addb6630da94b5401cd18a73a34 Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Wed, 10 Sep 2025 09:06:26 -0400
Subject: [PATCH 04/13] Progress towards part 3

---
 stream_compaction/efficient.cu | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346ee..0eedc858 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -11,6 +11,22 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+        
+        __global__ void UpSweep(int n, int* data, int stride)
+        {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index < n)
+            {
+                //Pavel TODO: devise method so that I don't have to create threads equal to the number of elements
+                if (index + 1 % (int)pow(2, stride + 2) == 0)
+                {
+                    data[index] += data[index - (stride + 1)];
+                }
+            }
+        }
+        __global__ void DownScan(int n, int* data, int stride)
+        {
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.

From f14d4edb2c4c254a039c7a11b784e8b9a9d35dba Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Mon, 15 Sep 2025 11:11:37 -0400
Subject: [PATCH 05/13] More work towards part 3

---
 stream_compaction/efficient.cu | 36 +++++++++++++++++++++++++++-------
 stream_compaction/naive.cu     |  4 ++--
 2 files changed, 31 insertions(+), 9 deletions(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 0eedc858..8ff7ef80 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,20 +12,42 @@ namespace StreamCompaction {
             return timer;
         }
         
-        __global__ void UpSweep(int n, int* data, int stride)
+        __device__ void UpSweep(int index, int* data, int iLogCeil)
         {
-            int index = threadIdx.x + blockDim.x * blockIdx.x;
-            if (index < n)
+            
+            for (int d = 0; d < iLogCeil; d++)
             {
-                //Pavel TODO: devise method so that I don't have to create threads equal to the number of elements
-                if (index + 1 % (int)pow(2, stride + 2) == 0)
+                if ((index + 1) % (int)powf(2, d + 1))
                 {
-                    data[index] += data[index - (stride + 1)];
+                    data[index] += data[index - (int)powf(2, d)];
                 }
+                cudaDeviceSynchronize();
             }
+            
         }
-        __global__ void DownScan(int n, int* data, int stride)
+        __device__ void DownSweep(int n, int index, int* data, int iLogCeil)
         {
+            if (index == n - 1)
+            {
+                data[index] = 0;
+            }
+            
+            for (int d = iLogCeil; d >= 0; d--)
+            {
+                if ((index + 1) % (int)powf(2, d + 1))
+                {
+                    int t = data[index + (int)powf(2, d) - 1];
+                    data[index + (int)powf(2, d) - 1] = data[index + (int)powf(2, d + 1)];
+                    data[index + (int)powf(2, d + 1)] += t;
+                }
+                cudaDeviceSynchronize();
+            }
+
+        }
+
+        __global__ void kernEfficientSwap(int n, int* odata, const int* idata)
+        {
+
         }
 
         /**
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index bf242a36..a3bb5fe5 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -21,9 +21,9 @@ namespace StreamCompaction {
             int index = threadIdx.x + blockDim.x * blockIdx.x;
             if (index < n)
             {
-                if (index >= pow(2, stride))
+                if (index >= powf(2, stride))
                 {
-                    odata[index] = idata[index] + idata[index - (int)pow(2, stride)];
+                    odata[index] = idata[index] + idata[index - (int)powf(2, stride)];
                 }
                 else
                 {

From 2d331be7557925290ca0a55bd2142dbec14dcaf9 Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Mon, 15 Sep 2025 12:02:11 -0400
Subject: [PATCH 06/13] Part 3 Progress

---
 stream_compaction/efficient.cu | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 8ff7ef80..5b930530 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -45,7 +45,7 @@ namespace StreamCompaction {
 
         }
 
-        __global__ void kernEfficientSwap(int n, int* odata, const int* idata)
+        __global__ void kernEfficientSwap(int n, int* data)
         {
 
         }
@@ -56,6 +56,12 @@ namespace StreamCompaction {
         void scan(int n, int *odata, const int *idata) {
             timer().startGpuTimer();
             // TODO
+            int* dev_arr;
+
+            int log2Ceil = ilog2ceil(n);
+            int arraySize = powf(2, n);
+
+            cudaMalloc((void**)&dev_arr, sizeof(int) * n);
             timer().endGpuTimer();
         }
 

From 88fa25d5551f6f24bac8c690017ca4a3e94e516d Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Mon, 15 Sep 2025 16:54:25 -0400
Subject: [PATCH 07/13] Part 3 progress

---
 stream_compaction/efficient.cu | 53 ++++++++++++++++++++++++----------
 stream_compaction/naive.cu     | 16 +++++-----
 2 files changed, 47 insertions(+), 22 deletions(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 5b930530..b7df71ec 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,27 +12,24 @@ namespace StreamCompaction {
             return timer;
         }
         
-        __device__ void UpSweep(int index, int* data, int iLogCeil)
+        __device__ void upSweep(int index, int* data, int log2Ceil)
         {
             
-            for (int d = 0; d < iLogCeil; d++)
+            for (int d = 0; d < log2Ceil; d++)
             {
                 if ((index + 1) % (int)powf(2, d + 1))
                 {
                     data[index] += data[index - (int)powf(2, d)];
                 }
-                cudaDeviceSynchronize();
+                __syncthreads();
             }
             
         }
-        __device__ void DownSweep(int n, int index, int* data, int iLogCeil)
+        __device__ void downSweep(int index, int* data, int log2Ceil)
         {
-            if (index == n - 1)
-            {
-                data[index] = 0;
-            }
             
-            for (int d = iLogCeil; d >= 0; d--)
+            
+            for (int d = log2Ceil; d >= 0; d--)
             {
                 if ((index + 1) % (int)powf(2, d + 1))
                 {
@@ -40,29 +37,55 @@ namespace StreamCompaction {
                     data[index + (int)powf(2, d) - 1] = data[index + (int)powf(2, d + 1)];
                     data[index + (int)powf(2, d + 1)] += t;
                 }
-                cudaDeviceSynchronize();
+                __syncthreads();
             }
 
         }
 
-        __global__ void kernEfficientSwap(int n, int* data)
+        __global__ void kernEfficientScan(int n, int* data, int log2Ceil)
         {
-
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index < n)
+            {
+                
+                upSweep(index, data, log2Ceil);
+                if (index == n - 1)
+                {
+                    data[index] = 0;
+                }
+                downSweep(index, data, log2Ceil);
+            }
         }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+           
             // TODO
             int* dev_arr;
 
             int log2Ceil = ilog2ceil(n);
-            int arraySize = powf(2, n);
+            int arraySize = powf(2, log2Ceil);
+
+            cudaMalloc((void**)&dev_arr, sizeof(int) * arraySize);
+            //Make sure that the array is filled with 0s to start with
+            cudaMemset(dev_arr, 0, sizeof(int) * arraySize);
+            cudaMemcpy(dev_arr, idata, sizeof(int) * arraySize, cudaMemcpyHostToDevice);
+
+            int threadsPerBlock = 128;
+            dim3 totalBlocks((arraySize + threadsPerBlock - 1) / threadsPerBlock);
+
+            timer().startGpuTimer();
+
+            kernEfficientScan << <totalBlocks, threadsPerBlock >> > (arraySize, dev_arr, log2Ceil);
 
-            cudaMalloc((void**)&dev_arr, sizeof(int) * n);
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            cudaFree(dev_arr);
+
+            
         }
 
         /**
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index a3bb5fe5..7f76d2ee 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -16,7 +16,7 @@ namespace StreamCompaction {
 
         // TODO: __global__
 
-        __global__ void naiveScan(int n, int* odata, const int* idata, int stride)
+        __global__ void kernNaiveScan(int n, int* odata, const int* idata, int stride)
         {
             int index = threadIdx.x + blockDim.x * blockIdx.x;
             if (index < n)
@@ -32,7 +32,7 @@ namespace StreamCompaction {
             }
         }
 
-        __global__ void inclusiveToExclusive(int n, int* odata, const int* idata)
+        __global__ void kernInclusiveToExclusive(int n, int* odata, const int* idata)
         {
             int index = threadIdx.x + blockDim.x * blockIdx.x;
             if (index == 0)
@@ -50,7 +50,7 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
+            
             // TODO
             //Starting Input
             int *dev_arrA;
@@ -67,14 +67,16 @@ namespace StreamCompaction {
             dim3 totalBlocks ((n + threadsPerBlock - 1) / threadsPerBlock);
 
             int log2Ceil = ilog2ceil(n);
+
+            timer().startGpuTimer();
             for (int i = 0; i < log2Ceil; i++)
             {
-                naiveScan << <totalBlocks, threadsPerBlock >> > (n, dev_arrB, dev_arrA, i);
+                kernNaiveScan << <totalBlocks, threadsPerBlock >> > (n, dev_arrB, dev_arrA, i);
                 std::swap(dev_arrA, dev_arrB);
             }
             
-            inclusiveToExclusive <<<totalBlocks, threadsPerBlock >>> (n, dev_arrB, dev_arrA);
-            
+            kernInclusiveToExclusive <<<totalBlocks, threadsPerBlock >>> (n, dev_arrB, dev_arrA);
+            timer().endGpuTimer();
 
             cudaMemcpy(odata, dev_arrB, sizeof(int) * n, cudaMemcpyDeviceToHost);
             
@@ -83,7 +85,7 @@ namespace StreamCompaction {
             cudaFree(dev_arrA);
             cudaFree(dev_arrB);
 
-            timer().endGpuTimer();
+            
         }
     }
 }

From 79ec5b82ad9de5048e27fbeb112d154b0beb9d54 Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Mon, 15 Sep 2025 22:46:10 -0400
Subject: [PATCH 08/13] Part 3 scan finished

---
 src/main.cpp                   |  2 +-
 stream_compaction/efficient.cu | 87 +++++++++++++++++++++-------------
 stream_compaction/naive.cu     |  7 +--
 3 files changed, 59 insertions(+), 37 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 3d5c8820..ae898a69 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 28; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index b7df71ec..611c1c5b 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,79 +12,100 @@ namespace StreamCompaction {
             return timer;
         }
         
-        __device__ void upSweep(int index, int* data, int log2Ceil)
+        __global__ void kernUpSweep(int n, int* data, int d)
         {
-            
-            for (int d = 0; d < log2Ceil; d++)
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index < n)
             {
-                if ((index + 1) % (int)powf(2, d + 1))
+                //Back when d was 0, 1, 2, 3...
+                /*
+                if (index % (int)powf(2, d + 1) == 0)
                 {
-                    data[index] += data[index - (int)powf(2, d)];
+                    data[index + (int)powf(2, d + 1) - 1] += data[index + (int)powf(2, d) - 1];
+
                 }
-                __syncthreads();
-            }
-            
-        }
-        __device__ void downSweep(int index, int* data, int log2Ceil)
-        {
-            
-            
-            for (int d = log2Ceil; d >= 0; d--)
-            {
-                if ((index + 1) % (int)powf(2, d + 1))
+                */
+                if (index % (d * 2) == 0)
                 {
-                    int t = data[index + (int)powf(2, d) - 1];
-                    data[index + (int)powf(2, d) - 1] = data[index + (int)powf(2, d + 1)];
-                    data[index + (int)powf(2, d + 1)] += t;
+                    data[index + d * 2 - 1] += data[index + d - 1];
+
                 }
-                __syncthreads();
+                
             }
-
         }
-
-        __global__ void kernEfficientScan(int n, int* data, int log2Ceil)
+        __global__ void kernDownSweep(int n, int* data, int d)
         {
             int index = threadIdx.x + blockDim.x * blockIdx.x;
             if (index < n)
             {
-                
-                upSweep(index, data, log2Ceil);
-                if (index == n - 1)
+                //Back when d was 0, 1, 2, 3...
+                /*
+                if (index  % (int)powf(2, d + 1) == 0)
                 {
-                    data[index] = 0;
+                    int t = data[index + (int)powf(2, d) - 1];
+                    data[index + (int)powf(2, d) - 1] = data[index + (int)powf(2, d + 1) - 1];
+                    data[index + (int)powf(2, d + 1) - 1] += t;
+                }
+                */
+                if (index % (d * 2) == 0)
+                {
+                    int t = data[index + d - 1];
+                    data[index + d - 1] = data[index + d * 2 - 1];
+                    data[index + d * 2 - 1] += t;
                 }
-                downSweep(index, data, log2Ceil);
             }
+
         }
 
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
            
             // TODO
+            
             int* dev_arr;
 
             int log2Ceil = ilog2ceil(n);
-            int arraySize = powf(2, log2Ceil);
+            int arraySize = 1 << log2Ceil;
+            printf("%d\n", arraySize);
 
             cudaMalloc((void**)&dev_arr, sizeof(int) * arraySize);
+            checkCUDAError("Bad Malloc");
+           
             //Make sure that the array is filled with 0s to start with
             cudaMemset(dev_arr, 0, sizeof(int) * arraySize);
+            checkCUDAError("Bad memset");
             cudaMemcpy(dev_arr, idata, sizeof(int) * arraySize, cudaMemcpyHostToDevice);
-
+            checkCUDAError("Bad copy of initial data");
+            
             int threadsPerBlock = 128;
             dim3 totalBlocks((arraySize + threadsPerBlock - 1) / threadsPerBlock);
 
             timer().startGpuTimer();
+            for (int d = 1; d < arraySize; d *= 2)
+            {
+                kernUpSweep << <totalBlocks, threadsPerBlock >> > (arraySize, dev_arr, d);
+                checkCUDAError("up sweep failure");
+            }
+            cudaMemset(dev_arr + (arraySize - 1), 0, sizeof(int));
+            checkCUDAError("Bad zeroing of last element");
 
-            kernEfficientScan << <totalBlocks, threadsPerBlock >> > (arraySize, dev_arr, log2Ceil);
+            for (int d = arraySize / 2; d > 0; d /= 2)
+            {
+                kernDownSweep << <totalBlocks, threadsPerBlock >> > (arraySize, dev_arr, d);
+                checkCUDAError("down sweep failure");
+            }
 
+            cudaDeviceSynchronize();
             timer().endGpuTimer();
-
+            
             cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            checkCUDAError("memcpy output failure");
+            
             cudaFree(dev_arr);
-
+            cudaDeviceSynchronize();
             
         }
 
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 7f76d2ee..f902739e 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -21,9 +21,10 @@ namespace StreamCompaction {
             int index = threadIdx.x + blockDim.x * blockIdx.x;
             if (index < n)
             {
-                if (index >= powf(2, stride))
+
+                if (index >= (1 << stride))
                 {
-                    odata[index] = idata[index] + idata[index - (int)powf(2, stride)];
+                    odata[index] = idata[index] + idata[index - (1 << stride)];
                 }
                 else
                 {
@@ -63,7 +64,7 @@ namespace StreamCompaction {
             cudaMemcpy(dev_arrA, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
             
 
-            int threadsPerBlock = 128;
+            int threadsPerBlock = 256;
             dim3 totalBlocks ((n + threadsPerBlock - 1) / threadsPerBlock);
 
             int log2Ceil = ilog2ceil(n);

From 6dcd59540c6179de535d4339e71f413e6ed05266 Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Tue, 16 Sep 2025 00:05:20 -0400
Subject: [PATCH 09/13] thrust implemented

---
 src/main.cpp                   |  2 +-
 stream_compaction/efficient.cu | 24 ++++++++++++++++++++++++
 stream_compaction/naive.cu     |  2 +-
 stream_compaction/thrust.cu    | 10 ++++++++++
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index ae898a69..8d642d89 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 28; // feel free to change the size of array
+const int SIZE = 1 << 26; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 611c1c5b..dfd7aa09 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -57,6 +57,30 @@ namespace StreamCompaction {
 
         }
 
+        __global__ void kernMapToBoolean(int n, int* oData, const int* iData)
+        {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index < n)
+            {
+                int val = iData[index];
+                if (val == 0)
+                {
+                    oData[index] = 0;
+                }
+            }
+        }
+        __global__ void kernScatter(int n, int* oData, const int* iData, const int* boolArray, const int* scannedArray)
+        {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+            if (index < n)
+            {
+                if (boolArray[index])
+                {
+                    oData[scannedArray[index]]
+                }
+            }
+        }
+
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index f902739e..ec41974d 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -64,7 +64,7 @@ namespace StreamCompaction {
             cudaMemcpy(dev_arrA, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
             
 
-            int threadsPerBlock = 256;
+            int threadsPerBlock = 128;
             dim3 totalBlocks ((n + threadsPerBlock - 1) / threadsPerBlock);
 
             int log2Ceil = ilog2ceil(n);
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e7..63eb32ae 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,21 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+
+            //thrust::device_vector<int> dev_in(idata, n);
+            thrust::device_vector<int> dev_in(idata, idata + n);
+            thrust::device_vector<int> dev_out(n);
+
             timer().startGpuTimer();
+    
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
+
             timer().endGpuTimer();
+
+            thrust::copy(dev_out.begin(), dev_out.end(), odata);
         }
     }
 }

From 40d2b513af06adfcb1de96fa95b310ce85aaf1bf Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Tue, 16 Sep 2025 00:33:41 -0400
Subject: [PATCH 10/13] Part 3 compact progress

---
 stream_compaction/efficient.cu | 36 ++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index dfd7aa09..ed3286b8 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -76,7 +76,7 @@ namespace StreamCompaction {
             {
                 if (boolArray[index])
                 {
-                    oData[scannedArray[index]]
+                    oData[scannedArray[index]] = iData[index];
                 }
             }
         }
@@ -143,9 +143,41 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+
+
+            int* dev_in;
+            int* dev_out;
+            int* dev_boolArray;
+            int* dev_scannedArray;
+
+            cudaMalloc((void**)&dev_in, sizeof(int) * n);
+            cudaMalloc((void**)&dev_boolArray, sizeof(int) * n);
+            cudaMalloc((void**)&dev_scannedArray, sizeof(int) * n);
+            cudaMalloc((void**)&dev_out, sizeof(int) * n);
+
+            cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
+            int threadsPerBlock = 128;
+            dim3 totalBlocks((n + threadsPerBlock - 1) / threadsPerBlock);
+
             timer().startGpuTimer();
-            // TODO
+            kernMapToBoolean << <totalBlocks, threadsPerBlock >> > (n, dev_boolArray, dev_in);
+            scan(n, dev_scannedArray, dev_boolArray);
+
+            
+
+            //int* count = 0;
+            //cudaMemcpy(count, dev_scannedArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+
+            kernScatter << <totalBlocks, threadsPerBlock >> > (n, dev_out, dev_in, dev_boolArray, dev_scannedArray);
             timer().endGpuTimer();
+            cudaMemcpy(odata, dev_out, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+            cudaFree(dev_boolArray);
+            cudaFree(dev_scannedArray);
+            
             return -1;
         }
     }

From 39e04ec123e71dd11a076c425bb1fa12b85f0c6c Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Tue, 16 Sep 2025 18:52:34 -0400
Subject: [PATCH 11/13] Part 3 stream compaction complete

---
 src/main.cpp                   |  2 +-
 stream_compaction/efficient.cu | 57 +++++++++++++++++++++++++---------
 stream_compaction/efficient.h  |  2 +-
 3 files changed, 44 insertions(+), 17 deletions(-)

diff --git a/src/main.cpp b/src/main.cpp
index 8d642d89..ae898a69 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 26; // feel free to change the size of array
+const int SIZE = 1 << 28; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index ed3286b8..ea4bb18a 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -63,9 +63,9 @@ namespace StreamCompaction {
             if (index < n)
             {
                 int val = iData[index];
-                if (val == 0)
+                if (val != 0)
                 {
-                    oData[index] = 0;
+                    oData[index] = 1;
                 }
             }
         }
@@ -85,7 +85,7 @@ namespace StreamCompaction {
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
+        void scan(int n, int *odata, const int *idata, bool forCompact) {
            
             // TODO
             
@@ -93,7 +93,6 @@ namespace StreamCompaction {
 
             int log2Ceil = ilog2ceil(n);
             int arraySize = 1 << log2Ceil;
-            printf("%d\n", arraySize);
 
             cudaMalloc((void**)&dev_arr, sizeof(int) * arraySize);
             checkCUDAError("Bad Malloc");
@@ -101,13 +100,22 @@ namespace StreamCompaction {
             //Make sure that the array is filled with 0s to start with
             cudaMemset(dev_arr, 0, sizeof(int) * arraySize);
             checkCUDAError("Bad memset");
-            cudaMemcpy(dev_arr, idata, sizeof(int) * arraySize, cudaMemcpyHostToDevice);
+            if (!forCompact)
+            {
+                cudaMemcpy(dev_arr, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            }
+            else
+            {
+                cudaMemcpy(dev_arr, idata, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+            }
             checkCUDAError("Bad copy of initial data");
             
             int threadsPerBlock = 128;
             dim3 totalBlocks((arraySize + threadsPerBlock - 1) / threadsPerBlock);
-
-            timer().startGpuTimer();
+            if (!forCompact)
+            {
+                timer().startGpuTimer();
+            }
             for (int d = 1; d < arraySize; d *= 2)
             {
                 kernUpSweep << <totalBlocks, threadsPerBlock >> > (arraySize, dev_arr, d);
@@ -123,11 +131,19 @@ namespace StreamCompaction {
             }
 
             cudaDeviceSynchronize();
-            timer().endGpuTimer();
-            
-            cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost);
+            if (!forCompact)
+            {
+                timer().endGpuTimer();
+                cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToHost);
+                
+            }
+            else
+            {
+                cudaMemcpy(odata, dev_arr, sizeof(int) * n, cudaMemcpyDeviceToDevice);
+            }
             checkCUDAError("memcpy output failure");
             
+            
             cudaFree(dev_arr);
             cudaDeviceSynchronize();
             
@@ -144,7 +160,7 @@ namespace StreamCompaction {
          */
         int compact(int n, int *odata, const int *idata) {
 
-
+            
             int* dev_in;
             int* dev_out;
             int* dev_boolArray;
@@ -154,20 +170,30 @@ namespace StreamCompaction {
             cudaMalloc((void**)&dev_boolArray, sizeof(int) * n);
             cudaMalloc((void**)&dev_scannedArray, sizeof(int) * n);
             cudaMalloc((void**)&dev_out, sizeof(int) * n);
+            checkCUDAError("Bad Malloc");
+
 
+            cudaMemset(dev_boolArray, 0, sizeof(int) * n);
+            checkCUDAError("Bad Memset");
             cudaMemcpy(dev_in, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+            checkCUDAError("Bad Memcpy");
 
             int threadsPerBlock = 128;
             dim3 totalBlocks((n + threadsPerBlock - 1) / threadsPerBlock);
 
             timer().startGpuTimer();
             kernMapToBoolean << <totalBlocks, threadsPerBlock >> > (n, dev_boolArray, dev_in);
-            scan(n, dev_scannedArray, dev_boolArray);
+
+
+            scan(n, dev_scannedArray, dev_boolArray, true);
 
             
 
-            //int* count = 0;
-            //cudaMemcpy(count, dev_scannedArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            int count;
+            int lastElement;
+            cudaMemcpy(&count, dev_scannedArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastElement, dev_boolArray + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            count += lastElement;
 
             kernScatter << <totalBlocks, threadsPerBlock >> > (n, dev_out, dev_in, dev_boolArray, dev_scannedArray);
             timer().endGpuTimer();
@@ -178,7 +204,8 @@ namespace StreamCompaction {
             cudaFree(dev_boolArray);
             cudaFree(dev_scannedArray);
             
-            return -1;
+            
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4fe..8b77b78e 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,7 +6,7 @@ namespace StreamCompaction {
     namespace Efficient {
         StreamCompaction::Common::PerformanceTimer& timer();
 
-        void scan(int n, int *odata, const int *idata);
+        void scan(int n, int *odata, const int *idata, bool forCompact = false);
 
         int compact(int n, int *odata, const int *idata);
     }

From 4cb6dbb1d562b0281bc3657dcc66bec1101d2d5b Mon Sep 17 00:00:00 2001
From: Peev <ppeev@kite.upenn.edu>
Date: Tue, 16 Sep 2025 22:54:24 -0400
Subject: [PATCH 12/13] Commented out error checking for slightly faster
 runtime

---
 stream_compaction/efficient.cu | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index ea4bb18a..720646fa 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -119,15 +119,15 @@ namespace StreamCompaction {
             for (int d = 1; d < arraySize; d *= 2)
             {
                 kernUpSweep << <totalBlocks, threadsPerBlock >> > (arraySize, dev_arr, d);
-                checkCUDAError("up sweep failure");
+                //checkCUDAError("up sweep failure");
             }
             cudaMemset(dev_arr + (arraySize - 1), 0, sizeof(int));
-            checkCUDAError("Bad zeroing of last element");
+            //checkCUDAError("Bad zeroing of last element");
 
             for (int d = arraySize / 2; d > 0; d /= 2)
             {
                 kernDownSweep << <totalBlocks, threadsPerBlock >> > (arraySize, dev_arr, d);
-                checkCUDAError("down sweep failure");
+                //checkCUDAError("down sweep failure");
             }
 
             cudaDeviceSynchronize();

From 1dbc099173415f6166bbaff010c49b5fb16c1884 Mon Sep 17 00:00:00 2001
From: Pavel Peev <76268971+thesquashedman@users.noreply.github.com>
Date: Tue, 16 Sep 2025 23:19:31 -0400
Subject: [PATCH 13/13] Update README.md

---
 README.md | 88 ++++++++++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 81 insertions(+), 7 deletions(-)

diff --git a/README.md b/README.md
index 0e38ddb1..735e2ed2 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,88 @@
 CUDA Stream Compaction
 ======================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+**University of Pennsylvania, CIS 5650: GPU Programming and Architecture,
+Project 2 - CUDA Stream Compaction**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Pavel Peev
+  * [LinkedIn](https://www.linkedin.com/in/pavel-peev-5568561b9/), [personal website](www.cartaphil.com)
+* Tested on: Windows 11, i7-1270, NVIDIA T1000
 
-### (TODO: Your README)
+### Description
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+Comparison of implementations of exclusive scans (and implementations of compact for the CPU and the work effient scan), comparing a basic CPU implementation, a naive CUDA implementation which adds an element stride spaces to the right over log2(n) kernal calls, the work efficient implmentation using an upsweep and downsweep stage to create a balanced binary tree, and thrust's own implementation of an exclusive scan.
+
+### Sample Output
+Below is an example of the output from running the test with array size 2^28
+```
+****************
+** SCAN TESTS **
+****************
+    [  45  15  18  24  10  15  38  38  47  12   8  39  45 ...  11   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 421.69ms    (std::chrono Measured)
+    [   0  45  60  78 102 112 127 165 203 250 262 270 309 ... -2015394307 -2015394296 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 419.546ms    (std::chrono Measured)
+    [   0  45  60  78 102 112 127 165 203 250 262 270 309 ... -2015394410 -2015394369 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 520.167ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 519.373ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 392.432ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 392.508ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 16.1508ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 17.0412ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   3   2   1   3   0   2   1   0   0   0   3   3   2 ...   3   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 570.409ms    (std::chrono Measured)
+    [   3   2   1   3   2   1   3   3   2   2   2   3   2 ...   3   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 576.547ms    (std::chrono Measured)
+    [   3   2   1   3   2   1   3   3   2   2   2   3   2 ...   1   3 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 1481.78ms    (std::chrono Measured)
+    [   3   2   1   3   2   1   3   3   2   2   2   3   2 ...   3   3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 2605.5ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 2604.63ms    (CUDA Measured)
+    passed
+```
+### Performance
+
+<img width="1152" height="768" alt="Line Chart" src="https://github.com/user-attachments/assets/05998878-880b-41b3-86be-ba80e3f76e53" />
+
+
+
+### Performance Analysis
+Blocks of size 128 were used, as that seemed to perform the best across all the algorithms.
+
+The efficient implementation performs marginally better than it's naive and cpu counterparts.The naive implementation performed the worst, which makes sense do to it's nlog2(n) complexity and the high cost for invoking kernal calls, making it slower than a regular CPU implemtation (and potentially the use of the older NVIDIA T1000 GPU). The thrust implementation performs significantly better than the others, showing just how much the exclusive scan algorithm can be optimized.
+
+
+Analyzing the kernal invocations within NSight Compute (shown below), it becomes clear that the efficient implementation can be improved upon. Both the upsweep and downsweep have low memory throughput. Using shared memory, we can load the memory coherently for each of the blocks, which should reduce the amount of memory loads needed with the current algorithm.
+
+<img width="1530" height="264" alt="image" src="https://github.com/user-attachments/assets/b4360172-0e8a-4a1a-b1df-0f3cabface03" />
+
+I also use a modulo operator within the kernal, which is a well known expensive operation which could be replaced. Also, it is very likely that the warps diverge, with some warps having only 1 thread doing any work. With some clever indexing, this could be optimized so that all the working threads fall into the same warps, allowing for the warps with non working threads to retire early.