diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..cad7657
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,3 @@
+{
+    "cmake.configureOnOpen": false
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 0e38ddb..ec5925e 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,345 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Yinuo (Travis) Xie
+  * [LinkedIn](https://www.linkedin.com/in/yinuotxie/)
+* Tested on: Windows 10, 12th Gen Intel(R) Core(TM) i7-12700 @ 2.10 GHz, 16GB, NVIDIA GeForce RTX 3060 Ti (Personal Computer)
 
-### (TODO: Your README)
+# CUDA Scan  & Stream Compaction
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+## Table of Contents
+- [CUDA Stream Compaction](#cuda-stream-compaction)
+- [CUDA Scan  \& Stream Compaction](#cuda-scan---stream-compaction)
+  - [Table of Contents](#table-of-contents)
+  - [Overview](#overview)
+  - [Prefix Sum Scan](#prefix-sum-scan)
+    - [What is Prefix Sum Scan?](#what-is-prefix-sum-scan)
+    - [Implementation](#implementation)
+      - [CPU Scan](#cpu-scan)
+      - [Naive GPU Scan](#naive-gpu-scan)
+      - [Work-Efficient GPU Scan](#work-efficient-gpu-scan)
+        - [Up-Sweep](#up-sweep)
+        - [Down-Sweep](#down-sweep)
+        - [Dynamic Grid Sizing in Work-Efficient Scan](#dynamic-grid-sizing-in-work-efficient-scan)
+      - [Work-Efficient GPU Scan with Shared Memory](#work-efficient-gpu-scan-with-shared-memory)
+      - [Thrust Scan](#thrust-scan)
+  - [Stream Compaction](#stream-compaction)
+    - [What is Stream Compaction?](#what-is-stream-compaction)
+    - [Implementation](#implementation-1)
+      - [CPU Compact without Scan](#cpu-compact-without-scan)
+      - [CPU Compact with Scan](#cpu-compact-with-scan)
+      - [Work-Efficient Compact](#work-efficient-compact)
+  - [Performance Analysis](#performance-analysis)
+    - [Performance Analysis on Different Scan Algorithms](#performance-analysis-on-different-scan-algorithms)
+      - [Observations](#observations)
+    - [Performance Analysis on Different Block Sizes](#performance-analysis-on-different-block-sizes)
+      - [Observations](#observations-1)
+    - [Performance Analysis on Different Compact Algorithms](#performance-analysis-on-different-compact-algorithms)
+      - [Observations](#observations-2)
+    - [Nsight Timeline for Thrust Scan](#nsight-timeline-for-thrust-scan)
+  - [Blooper](#blooper)
+  - [Output](#output)
 
+## Overview
+
+This project presents various algorithms tailored to handle two essential operations using CUDA: prefix sum scan and stream compaction.
+
+## Prefix Sum Scan
+
+### What is Prefix Sum Scan?
+
+Imagine you're working with a digital painting program. Each brush stroke you make on the canvas is recorded in a list, and each stroke has a certain amount of memory it occupies. The prefix sum scan is like calculating the total memory used up to each brush stroke. This is helpful because if you want to 'undo' or 'replay' your strokes up to a certain point, the program knows exactly how much memory it needs to access. This technique isn't only about recording brush strokes; it's a cornerstone in computer graphics, aiding in tasks such as determining how much memory to allocate for rendering scenes.
+
+To illustrate the prefix sum scan, consider the following array:
+
+```
+[3, 1, 7, 0, 4, 1, 6, 3]
+```
+
+The inclusive prefix sum scan of this array is:
+
+```
+[0, 3, 4, 11, 11, 15, 16, 22]
+```
+
+The exclusive prefix sum scan of this array is:
+```
+[3, 4, 11, 11, 15, 16, 22, 25]
+```
+
+### Implementation
+
+The prefix sum scan implemented in this project is an exclusive scan. This means that the result at each position in the output array does not include the input value at that position. The project encompasses five different approaches:
+
+* CPU Scan
+* Naive GPU Scan
+* Work-Efficient GPU Scan
+* Work-Efficient GPU Scan with Shared Memory
+* Thrust Scan
+
+**Note**: For all the scans, the input array size is assumed to be a power of 2. If not, the array is padded with zeros to the nearest power of 2.
+
+#### CPU Scan
+
+The CPU scan is realized within the `StreamCompaction::CPU::scan` function. It is based on a straightforward for-loop that iterates through the array, adding the value of the preceding element to the current one. The initial element is preset to 0.
+
+#### Naive GPU Scan
+
+In the Naive GPU scan `StreamCompaction::Naive::scan`, parallelism is harnessed to speed up the prefix sum calculation. Instead of processing elements one by one, the algorithm leverages the GPU's capability to process multiple elements simultaneously, reducing the time required to compute the scan. The images below illustrate the process of the naive GPU scan.
+![Naive GPU Scan](img/figure-39-2.jpg)
+
+#### Work-Efficient GPU Scan
+
+The Work-Efficient GPU Scan `StreamCompaction:Efficient::scan` is optimized to use fewer operations than the naive approach. By organizing the computations in a specific structure and eliminating redundant calculations, this method can achieve faster computation times on the GPU. The work-efficient GPU scan involves two pivotal steps:
+
+* Up-Sweep (Parallel Reduction)
+* Down-Sweep 
+
+##### Up-Sweep
+
+The Up-Sweep phase, also known as Parallel Reduction, is the first step of the Work-Efficient scan. In this phase, the algorithm works from the bottom of the data structure upwards, combining pairs of elements and propagating their sums up the tree. The outcome of this step is a balanced binary tree with partial sums, with the total sum of all elements at the root. The image below illustrates the process of the up-sweep.
+![Up-Sweep](img/up_sweep.png)
+
+##### Down-Sweep
+
+The Down-Sweep phase begins from the top of the constructed tree and works its way downwards. Starting with the root set to zero, the algorithm traverses the tree, swapping and accumulating values to generate the prefix sum for each element. The result of this phase is the desired prefix sum array, constructed in a more efficient manner than the naive approach. The image below illustrates the process of the down-sweep.
+![Down-Sweep](img/down_sweep.png)
+
+##### Dynamic Grid Sizing in Work-Efficient Scan
+
+To optimize resource usage and enhance performance, the work-efficient scan incorporates dynamic grid sizing. This adaptive approach ensures that the number of blocks needed is determined at each level of computation.
+
+As the algorithm ascends the tree, there's a noticeable reduction in the number of elements participating in calculations. This decrement in element count implies that fewer blocks are needed to handle the processing. Consequently, by dynamically adjusting the grid size based on the current computation level, the algorithm ensures efficient utilization of GPU resources, thereby maximizing the occupancy ratio. This strategy not only improves computational speed but also ensures optimal resource usage throughout the scanning process.
+
+#### Work-Efficient GPU Scan with Shared Memory
+
+Building upon the Work-Efficient GPU Scan, the `StreamCompaction::Efficient::scanShared` approach taps into shared memory to yield enhanced performance. Shared memory in GPUs offers a speed advantage over global memory, making data fetches quicker when the required data resides locally.
+
+The process to exploit this memory advantage is multi-faceted:
+
+1. **Block Scan**: Initially, the algorithm executes a block scan on each block within the array. Here, the prefix sum is computed for elements within individual blocks using shared memory, ensuring faster data access and computations.
+
+2. **Storing Block Sums**: After computing the prefix sums for individual blocks, the algorithm saves the total sum of each block into an array called `blockSums`.
+
+3. **Scanning Block Sums**: A prefix sum scan is then performed on the `blockSums` array. Interestingly, this step can recursively apply `scanShared` if the `blockSums` array is large. This recursive strategy takes advantage of shared memory at multiple levels, optimizing the computation further. However, due to simplicity, this project did not implement the recursive approach.
+
+4. **Adding Block Sums to Elements**: Finally, the corresponding block sum from the `blockSums` array is added to every element within a block. This step integrates the computed offsets, resulting in the final prefix sum array for the entire dataset.
+
+**Example**:
+Imagine an array: `[3, 1, 7, 0, 4, 1, 6, 3]`. When broken into blocks of size 4:
+* Block 1: `[3, 1, 7, 0]`
+* Block 2: `[4, 1, 6, 3]`
+
+After block scan, we may get:
+* Block 1: `[0, 3, 4, 11]`
+* Block 2: `[0, 4, 5, 11]`
+
+The `blockSums` array will be: `[11, 11]`. Applying `scanShared` on `blockSums` gives `[0, 11]`.
+
+Finally, adding block sums to elements:
+* Block 1: remains `[0, 3, 4, 11]`
+* Block 2: `[11, 15, 16, 22]`
+
+The final output array becomes: `[0, 3, 4, 11, 11, 15, 16, 22]`.
+
+By dividing the task into blocks and using shared memory to handle intra-block computations, `StreamCompaction::Efficient::scanShared` manages to achieve faster computation times and more efficient memory usage, especially for large array size.
+
+
+#### Thrust Scan
+
+[Thrust](https://docs.nvidia.com/cuda/thrust/index.html) is a CUDA library that offers a collection of efficient parallel primitives. In this project, the Thrust library's built-in scan function is used. It provides a highly optimized and robust implementation of the prefix sum scan, ensuring both speed and accuracy.
+
+## Stream Compaction
+
+### What is Stream Compaction?
+
+Consider a 3D scene in a video game with various objects: trees, buildings, characters, and some 'invisible' objects that the player shouldn't see. Stream compaction is akin to a smart camera that only focuses on the visible objects and ignores the invisible ones, thus only rendering what's essential. In our project, we likened this to removing zeros from a list, similar to how the camera filters out unimportant details. Such an approach is crucial in computer graphics to ensure scenes are rendered efficiently, providing smoother gameplay or more detailed visual effects. It is also useful in applications like path tracing, collision detection, and sparse matrix compression.
+
+**Example**:
+Consider an array: `[3, 0, 1, 0, 0, 2, 0, 0]`. The stream compacted array is `[3, 1, 2]`.
+
+### Implementation
+
+The stream compaction implemented in this project encompasses three approaches:
+
+* CPU Compact without Scan
+* CPU Compact with Scan
+* Work-Efficient Compact
+
+#### CPU Compact without Scan
+
+The CPU compact without scan, represented by the function `StreamCompaction::CPU::compactWithoutScan`, adopts a direct method. It marches through the input array, cherry-picking non-zero elements to append to the output array. An internal counter keeps tabs on the number of non-zero elements, which ultimately dictates the dimensions of the output array. Below is a visual representation of the CPU compact without scan process.
+![CPU Compact without Scan](img/cpu_compact.png)
+
+#### CPU Compact with Scan
+
+For `StreamCompaction::CPU::compactWithScan`, the approach is more intricate yet optimized. The methodology unfolds in three primary stages:
+
+1. **Boolean Mask Creation**: A boolean array is first generated. Each slot in this array is tagged with a '1' if the corresponding element in the input array isn't zero. Otherwise, it's marked with a '0'.
+2. **Applying CPU Scan**: This boolean array then undergoes a CPU scan, which churns out the indices for the non-zero elements.
+3. **Population of Output Array**: Using the indices harvested from the scan, non-zero elements are transposed to the output array in their appropriate order.
+
+The accompanying image furnishes a clearer picture of the CPU compact with scan modus operandi.
+![CPU Compact with Scan](img/compact_scan.png)
+
+#### Work-Efficient Compact
+
+The `StreamCompaction::Efficient::compact` function champions a work-efficient approach. This method is designed to harness the power of GPU's parallel processing. The most important steps in the work-efficient compact is the scan, which is the same as the work-efficient scan described above. 
+
+## Performance Analysis
+
+The project conducts a performance analysis on various scan and compact algorithms. For each algorithm, tests are run 10 times, and the average time is computed using `std::chrono` for CPU high-precision timing and CUDA events for measuring GPU performance. The influence of different block sizes on the most efficient scan algorithm is also explored. To glean insights into the inner workings of the thrust scan, the Nvidia Nsight timeline was consulted.
+
+### Performance Analysis on Different Scan Algorithms
+
+Performance evaluations were executed on varying array sizes with a block size of 128. The graph below depicts the performance metrics for each scan algorithm against different array sizes.
+
+![Scan Performance](img/scan_analysis.png)
+
+#### Observations
+
+* **CPU Scan**: As array size augments, there's an evident exponential surge in execution duration. The principal bottleneck here, especially for large size, is memory I/O.
+
+* **Naive Scan**: Stable performance is observed for small sizes, but there's a pronounced increase post $2^{16}$. The GPU's inherent parallelism masks some inefficiencies for smaller datasets, but they become beneficial as data scales.
+
+* **Work-Efficient Scan**: Performance deterioration is evident post $2^{18}$. Despite its design focus on minimizing operations, the degradation indicates other factors (potentially memory I/O) coming into play for large size.
+
+* **Work-Efficient Scan with Shared Memory**: This method exhibits unstable performance across varying sizes, indicating a sensitivity to data layout and memory access patterns. As data size increases, the shared memory approach might be encountering bank conflicts, a typical GPU challenge.
+
+* **Thrust Scan**: Performance remains relatively steadfast until around $2^{18}$, post which it ascends. This trend, even in an optimized library like Thrust, underscores the challenges tied to extensive data sizes.
+
+A pervasive observation is the sharp escalation in execution time for all scan methodologies post $2^{18}$. This is a strong indicator that memory I/O is emerging as a universal bottleneck for all scan strategies.
+
+### Performance Analysis on Different Block Sizes
+
+We tested how block size affects performance using an array size of $2^{20}$. The graph below shows the results for each scan method with different block sizes.
+
+![Block Size Performance](img/blocksize.png)
+
+#### Observations
+
+* **Work-Efficient Scan**: Performance drops as block size goes above 128. This suggests that there might be more overhead or less optimal memory access patterns with larger block sizes.
+  
+* **Work-Efficient Shared Memory Scan**: The best results are seen around block size 128, with performance dropping for smaller and larger block sizes. This suggests that block size 128 is the sweet spot for making the best use of shared memory on the GPU without causing too much contention or overhead.
+  
+* **Thrust Scan**: The performance doesn't change much with different block sizes, but there's a slight increase in time for larger blocks. Thrust, being a well-optimized library, seems to handle different block sizes well. However, it's not completely immune to the effects of block size changes.
+  
+### Performance Analysis on Different Compact Algorithms
+
+We tested how different compact algorithms perform with various array sizes, using a block size of 128. The graph below showcases the results for each compact method with different array sizes.
+
+![Compact Performance](img/compact_analysis.png)
+
+#### Observations
+
+* **CPU Compact Without/With Scan**: Both these methods take more time as the array size gets bigger, showing a pattern similar to the CPU Scan. This points towards memory access becoming a major slowdown for CPU-based methods as the data size increases.
+
+* **Work-Efficient Compact**: For smaller data sizes up to $2^{18}$, this method performs consistently. However, beyond that size, it starts to slow down, likely due to the same memory access issues as its work-efficient scan.
+
+### Nsight Timeline for Thrust Scan
+
+To dive deeper into the workings of the thrust scan, we examined its behavior using the Nsight timeline. The image below presents the timeline for thrust scan when processing an array of size $2^{20}$.
+
+![Nsight Timeline](img/thrust.jpg)
+
+A few key takeaways from the graph:
+
+* **Memory Allocation Dominates**: A significant chunk of time, roughly 90%, is consumed by `cudaMemcpyAsync`. This indicates that memory transfers, possibly between the CPU and GPU, are the primary time consumers.
+* **Kernel Execution is Swift**: The actual computation, `DeviceScanKernel`, is remarkably quick, clocking in at just about 0.05ms.
+
+These observations underscore the fact that memory access, rather than computation, is the limiting factor for the thrust scan's performance. This aligns with our earlier performance analysis, reinforcing the understanding that efficient memory management is crucial for optimizing scan operations, especially on GPUs.
+
+## Blooper
+
+While working on this project, I faced an unexpected issue that's worth highlighting. Both the work-efficient scan and its variant using shared memory started acting up for array sizes beyond $2^{24}$.
+
+For array sizes that are powers of 2, the work-efficient scan would yield an array filled entirely with zeros. On the other hand, for non-power-of-2 sizes, an "illegal memory access" error would be thrown during the `kernUpSweep` kernel execution.
+
+Upon seeking guidance on ED, a suggested remedy was to transition the index type from `int` to `size_t` within the kernel. This modification rectified the issue.
+
+**Wrong code**:
+
+```cpp
+int index = threadIdx.x + (blockIdx.x * blockDim.x);
+``` 
+
+**Correct code**:
+```cpp
+size_t index = threadIdx.x + (blockIdx.x * blockDim.x);
+```
+
+Nevertheless, the root cause remains somewhat elusive. Intuitively, an int should comfortably support values up to $2^{32}$. Considering my implementation operates seamlessly up to an array size of $2^{24}$, I'm inclined to believe the kernel isn't the problem. Yet, I can't say that with complete certainty. If anyone has faced a similar issue or has insights into this, I'd greatly appreciate any feedback or solutions. It's crucial to understand such nuances, especially when working with GPU architectures and CUDA programming.
+
+## Output
+
+Below is the output of the program when run with an array size of $2^{20}$ and a block size of 128.
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  34   1  31  46  32  26  13  22  47  36  32  28   3 ...  18   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 1.6893ms    (std::chrono Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ... 25686188 25686206 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 1.7212ms    (std::chrono Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ... 25686115 25686116 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 1.1121ms    (CUDA Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ... 25686188 25686206 ]
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 0.505792ms    (CUDA Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ...   0   0 ]
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 0.423616ms    (CUDA Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ... 25686188 25686206 ]
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 0.429856ms    (CUDA Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ... 25686115 25686116 ]
+    passed
+==== work-efficient scan with shared memory, power-of-two ====
+   elapsed time: 0.16224ms    (CUDA Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ... 25686188 25686206 ]
+    passed
+==== work-efficient scan with shared memory, non-power-of-two ====
+   elapsed time: 0.402304ms    (CUDA Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ... 25686115 25686116 ]
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.483872ms    (CUDA Measured)
+    [   0  34  35  66 112 144 170 183 205 252 288 320 348 ... 25686188 25686206 ]
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.332ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   1   1   0   0   1   1   0   1   0   0   0   2   0 ...   3   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 2.179ms    (std::chrono Measured)
+    [   1   1   1   1   1   2   2   2   1   3   3   2   2 ...   2   3 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 2.102ms    (std::chrono Measured)
+    [   1   1   1   1   1   2   2   2   1   3   3   2   2 ...   3   2 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 5.1676ms    (std::chrono Measured)
+    [   1   1   1   1   1   2   2   2   1   3   3   2   2 ...   2   3 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 0.564288ms    (CUDA Measured)
+    [   1   1   1   1   1   2   2   2   1   3   3   2   2 ...   2   3 ]
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 0.512416ms    (CUDA Measured)
+    [   1   1   1   1   1   2   2   2   1   3   3   2   2 ...   3   2 ]
+    passed
+```
diff --git a/img/blocksize.png b/img/blocksize.png
new file mode 100644
index 0000000..7d64206
Binary files /dev/null and b/img/blocksize.png differ
diff --git a/img/compact_analysis.png b/img/compact_analysis.png
new file mode 100644
index 0000000..3f78740
Binary files /dev/null and b/img/compact_analysis.png differ
diff --git a/img/compact_scan.png b/img/compact_scan.png
new file mode 100644
index 0000000..b4bd5e0
Binary files /dev/null and b/img/compact_scan.png differ
diff --git a/img/cpu_compact.png b/img/cpu_compact.png
new file mode 100644
index 0000000..b483bed
Binary files /dev/null and b/img/cpu_compact.png differ
diff --git a/img/down_sweep.png b/img/down_sweep.png
new file mode 100644
index 0000000..7ed46d5
Binary files /dev/null and b/img/down_sweep.png differ
diff --git a/img/scan_analysis.png b/img/scan_analysis.png
new file mode 100644
index 0000000..9129a88
Binary files /dev/null and b/img/scan_analysis.png differ
diff --git a/img/thrust.jpg b/img/thrust.jpg
new file mode 100644
index 0000000..55d3067
Binary files /dev/null and b/img/thrust.jpg differ
diff --git a/img/up_sweep.png b/img/up_sweep.png
new file mode 100644
index 0000000..cebc554
Binary files /dev/null and b/img/up_sweep.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..3866a6a 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -7,18 +7,23 @@
  */
 
 #include <cstdio>
+#include <vector>
+#include <numeric>
 #include <stream_compaction/cpu.h>
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
+int* a = new int[SIZE];
+int* b = new int[SIZE];
+int* c = new int[SIZE];
 
+#define TEST 1 // 1: running test; 0: perform anaylsis
+
+#if TEST
 int main(int argc, char* argv[]) {
     // Scan tests
 
@@ -30,6 +35,7 @@ int main(int argc, char* argv[]) {
     genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
+    
 
     // initialize b using StreamCompaction::CPU::scan you implement
     // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
@@ -51,41 +57,55 @@ int main(int argc, char* argv[]) {
     printDesc("naive scan, power-of-two");
     StreamCompaction::Naive::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
-    /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
-    onesArray(SIZE, c);
+    // For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
+    /*onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
     StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    printArray(SIZE, c, true);*/
 
     zeroArray(SIZE, c);
     printDesc("naive scan, non-power-of-two");
     StreamCompaction::Naive::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, power-of-two");
     StreamCompaction::Efficient::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient scan, non-power-of-two");
     StreamCompaction::Efficient::scan(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
+    printArray(NPOT, c, true);
+    printCmpResult(NPOT, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("work-efficient scan with shared memory, power-of-two");
+    StreamCompaction::Efficient::scanShared(SIZE, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, c);
+    printDesc("work-efficient scan with shared memory, non-power-of-two");
+    StreamCompaction::Efficient::scanShared(NPOT, c, a);
+    printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    printArray(NPOT, c, true);
     printCmpResult(NPOT, b, c);
 
     zeroArray(SIZE, c);
     printDesc("thrust scan, power-of-two");
     StreamCompaction::Thrust::scan(SIZE, c, a);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
+    printArray(SIZE, c, true);
     printCmpResult(SIZE, b, c);
 
     zeroArray(SIZE, c);
@@ -137,14 +157,14 @@ int main(int argc, char* argv[]) {
     printDesc("work-efficient compact, power-of-two");
     count = StreamCompaction::Efficient::compact(SIZE, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedCount, b, c);
 
     zeroArray(SIZE, c);
     printDesc("work-efficient compact, non-power-of-two");
     count = StreamCompaction::Efficient::compact(NPOT, c, a);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
+    printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
     system("pause"); // stop Win32 console from closing on exit
@@ -152,3 +172,193 @@ int main(int argc, char* argv[]) {
     delete[] b;
     delete[] c;
 }
+
+#else
+const int NUM_TESTS = 10;
+
+double computeAverage(double* arr, int size) {
+    double sum = 0.0;
+    for (int i = 0; i < size; ++i) {
+        sum += arr[i];
+    }
+    return sum / size;
+}
+
+void testScan() {
+    // Scan tests
+
+    printf("\n");
+    printf("****************\n");
+    printf("** SCAN TESTS **\n");
+    printf("****************\n");
+
+    double* cpuScanTimes = new double[NUM_TESTS];
+    double* naiveScanTimes = new double[NUM_TESTS];
+    double* workEfficientScanTimes = new double[NUM_TESTS];
+    double* workEfficientScansharedTimes = new double[NUM_TESTS];
+    double* thrustScanTimes = new double[NUM_TESTS];
+
+    for (int i = 0; i < NUM_TESTS; ++i) {
+        genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+        a[SIZE - 1] = 0;
+
+        zeroArray(SIZE, b);
+        // printDesc("cpu scan, power-of-two");
+        StreamCompaction::CPU::scan(SIZE, b, a);
+        cpuScanTimes[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+        
+        zeroArray(SIZE, c);
+        // printDesc("cpu scan, not power-of-two");
+        StreamCompaction::CPU::scan(NPOT, c, a);
+        evalCmpResult(NPOT, b, c);
+
+        zeroArray(SIZE, c);
+        StreamCompaction::Naive::scan(SIZE, c, a);
+        naiveScanTimes[i] = StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation();
+        evalCmpResult(SIZE, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("naive scan, not power-of-two");
+        StreamCompaction::Naive::scan(NPOT, c, a);
+        evalCmpResult(NPOT, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("work-efficient scan, power-of-two");
+        StreamCompaction::Efficient::scan(SIZE, c, a);
+        workEfficientScanTimes[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+        evalCmpResult(SIZE, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("work-efficient scan, not power-of-two");
+        StreamCompaction::Efficient::scan(NPOT, c, a);
+        evalCmpResult(NPOT, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("work-efficient scan with shared memory, power-of-two");
+        StreamCompaction::Efficient::scanShared(SIZE, c, a);
+        workEfficientScansharedTimes[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+        evalCmpResult(SIZE, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("work-efficient scan with shared memory, not power-of-two");
+        StreamCompaction::Efficient::scanShared(NPOT, c, a);
+        evalCmpResult(NPOT, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("thrust scan, power-of-two");
+        StreamCompaction::Thrust::scan(SIZE, c, a);
+        thrustScanTimes[i] = StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation();
+        evalCmpResult(SIZE, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("thrust scan, not power-of-two");
+        StreamCompaction::Thrust::scan(NPOT, c, a);
+        evalCmpResult(NPOT, b, c);
+    }
+
+    printDesc("cpu scan, power-of-two");
+    printDoubleArray(NUM_TESTS, cpuScanTimes, true);
+    printf("%5f \n", computeAverage(cpuScanTimes, NUM_TESTS));
+
+    printDesc("naive scan,power-of-two");
+    printDoubleArray(NUM_TESTS, naiveScanTimes, true);
+    printf("%5f \n", computeAverage(naiveScanTimes, NUM_TESTS));
+
+    printDesc("work-efficient scan, power-of-two");
+    printDoubleArray(NUM_TESTS, workEfficientScanTimes, true);
+    printf("%5f \n", computeAverage(workEfficientScanTimes, NUM_TESTS));
+
+    printDesc("work-efficient scan with shared memory, power-of-two");
+    printDoubleArray(NUM_TESTS, workEfficientScansharedTimes, true);
+    printf("%5f \n", computeAverage(workEfficientScansharedTimes, NUM_TESTS));
+
+    printDesc("thrust scan, power-of-two");
+    printDoubleArray(NUM_TESTS, thrustScanTimes, true);
+    printf("%5f \n", computeAverage(thrustScanTimes, NUM_TESTS));
+
+   
+    delete[] cpuScanTimes;
+    delete[] naiveScanTimes;
+    delete[] workEfficientScanTimes;
+    delete[] thrustScanTimes;
+}
+
+void testCompact() {
+    printf("\n");
+    printf("*****************************\n");
+    printf("** STREAM COMPACTION TESTS **\n");
+    printf("*****************************\n");
+
+    // Compaction tests
+
+    double* cpuCompact = new double[NUM_TESTS];
+    double* cpuCompactWithScan = new double[NUM_TESTS];
+    double* workEfficientCompact = new double[NUM_TESTS];
+
+    for (int i = 0; i < NUM_TESTS; ++i) {
+        genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+        a[SIZE - 1] = 0;
+        //printArray(SIZE, a, true);
+
+        int count, expectedCount, expectedNPOT;
+
+        // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
+        // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
+        zeroArray(SIZE, b);
+       //  printDesc("cpu compact without scan, power-of-two");
+        count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+        cpuCompact[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+        expectedCount = count;
+        evalCmpLenResult(count, expectedCount, b, b);
+
+        zeroArray(SIZE, c);
+        // printDesc("cpu compact without scan, non-power-of-two");
+        count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+        expectedNPOT = count;
+        evalCmpLenResult(count, expectedNPOT, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("cpu compact with scan");
+        count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+        cpuCompactWithScan[i] = StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation();
+        evalCmpLenResult(count, expectedCount, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("work-efficient compact, power-of-two");
+        count = StreamCompaction::Efficient::compact(SIZE, c, a);
+        workEfficientCompact[i] = StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation();
+        evalCmpLenResult(count, expectedCount, b, c);
+
+        zeroArray(SIZE, c);
+        // printDesc("work-efficient compact, non-power-of-two");
+        count = StreamCompaction::Efficient::compact(NPOT, c, a);
+        evalCmpLenResult(count, expectedNPOT, b, c);
+    }
+
+    printDesc("cpu compact without scan, power-of-two");
+    printDoubleArray(NUM_TESTS, cpuCompact, true);
+    printf("%5f \n", computeAverage(cpuCompact, NUM_TESTS));
+
+    printDesc("cpu compact with scan, power-of-two");
+    printDoubleArray(NUM_TESTS, cpuCompactWithScan, true);
+    printf("%5f \n", computeAverage(cpuCompactWithScan, NUM_TESTS));
+
+    printDesc("work-efficient compact, power-of-two");
+    printDoubleArray(NUM_TESTS, workEfficientCompact, true);
+    printf("%5f \n", computeAverage(workEfficientCompact, NUM_TESTS));
+
+    delete[] cpuCompact;
+    delete[] cpuCompactWithScan;
+    delete[] workEfficientCompact;
+}
+
+int main(int argc, char* argv[]) {
+    testScan();
+    // testCompact();
+
+    system("pause"); // stop Win32 console from closing on exit
+    delete[] a;
+    delete[] b;
+    delete[] c;
+}
+#endif
\ No newline at end of file
diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp
index 025e94a..c28e168 100644
--- a/src/testing_helpers.hpp
+++ b/src/testing_helpers.hpp
@@ -27,6 +27,13 @@ void printCmpResult(int n, T *a, T *b) {
             cmpArrays(n, a, b) ? "FAIL VALUE" : "passed");
 }
 
+template<typename T>
+void evalCmpResult(int n, T *a, T *b) {
+    if (cmpArrays(n, a, b)) {
+        printf(" % s \n", "FAIL VALUE");
+    }
+}
+
 template<typename T>
 void printCmpLenResult(int n, int expN, T *a, T *b) {
     if (n != expN) {
@@ -37,6 +44,25 @@ void printCmpLenResult(int n, int expN, T *a, T *b) {
             cmpArrays(n, a, b) ? "FAIL VALUE" : "passed");
 }
 
+template<typename T>
+void evalCmpLenResult(int n, int expN, T* a, T* b) {
+    if (n != expN) {
+        printf("    expected %d elements, got %d\n", expN, n);
+    }
+
+    if (n == -1 || n != expN) {
+        printf(" % s \n", "FAIL COUNT");
+    }
+    else {
+        if (cmpArrays(n, a, b)) {
+            printf(" % s \n", "FAIL VALUE");
+        }
+    }
+   /* printf("    %s \n",
+        (n == -1 || n != expN) ? "FAIL COUNT" :
+        cmpArrays(n, a, b) ? "FAIL VALUE" : "passed");*/
+}
+
 void zeroArray(int n, int *a) {
     for (int i = 0; i < n; i++) {
         a[i] = 0;
@@ -69,6 +95,18 @@ void printArray(int n, int *a, bool abridged = false) {
     printf("]\n");
 }
 
+void printDoubleArray(int n, double* a, bool abridged = false) {
+    printf("    [ ");
+    for (int i = 0; i < n; i++) {
+        if (abridged && i + 2 == 15 && n > 16) {
+            i = n - 2;
+            printf("... ");
+        }
+        printf("%5f ", a[i]);
+    }
+    printf("]\n");
+}
+
 template<typename T>
 void printElapsedTime(T time, std::string note = "")
 {
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..f0974e9 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,11 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+
+            if (index < n) {
+                bools[index] = (idata[index] != 0) ? 1 : 0;
+            }
         }
 
         /**
@@ -32,8 +36,13 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
-            // TODO
-        }
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
 
+            if (index < n) {
+                if (bools[index] == 1) {
+                    odata[indices[index]] = idata[index];
+                }
+            }
+        }
     }
 }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..c480bd2 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,10 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +33,16 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int index = 0;
+            for (int i = 0; i < n; ++i) {
+                if (idata[i] != 0) {
+                    odata[index] = idata[i];
+                    index++;
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return index;
         }
 
         /**
@@ -42,9 +52,29 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int* bools = new int[n];
+            int* scanResults = new int[n];
+
+            // map the bools array 
+            for (int i = 0; i < n; ++i) {
+                bools[i] = (idata[i] != 0) ? 1 : 0;
+            }
+
+            // run exclusive scan
+            scanResults[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                scanResults[i] = scanResults[i - 1] + bools[i - 1];
+            }
+
+            // scatter
+            for (int i = 0; i < n; ++i) {
+                if (bools[i] != 0) {
+                    odata[scanResults[i]] = idata[i];
+                }
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return scanResults[n - 1] + bools[n - 1];
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..935fd88 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -1,8 +1,21 @@
 #include <cuda.h>
-#include <cuda_runtime.h>
+#include <cuda_runtime.h> 
 #include "common.h"
 #include "efficient.h"
 
+#define BLOCK_SIZE 128
+#define DOUBLE_BLOCK_SIZE 2 * BLOCK_SIZE
+#define NUM_BANKS 16
+#define LOG_NUM_BANKS 4
+
+#define CONFLICT_FREE_OFFSET(n) \
+    ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS))
+
+
+#define TIME_COMPACT 1
+
+#define checkCUDAErrorWithLine(msg) checkCUDAError(msg, __LINE__)
+
 namespace StreamCompaction {
     namespace Efficient {
         using StreamCompaction::Common::PerformanceTimer;
@@ -11,16 +24,195 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
+ 
+        __global__ void kernUpSweep(int n, int d, int* odata) {
+            size_t index = (blockIdx.x * blockDim.x + threadIdx.x) * (2 << d);
+ 
+            if (index >= n) return;
+
+            odata[index + (1 << (d + 1)) - 1] += odata[index + (1 << d) - 1];
+        }
+
+        __global__ void kernDownSweep(int n, int d, int* odata) {
+            size_t index = (blockIdx.x * blockDim.x + threadIdx.x) * (2 << d);
+
+            if (index >= n) return;
+
+            // preserve the left child value
+            int temp = odata[index + (1 << d) - 1];
+            // left child copies the parent value
+            odata[index + (1 << d) - 1] = odata[index + (1 << (d + 1)) - 1];
+            // right child addes the parent value and the preserved left child value
+            odata[index + (1 << (d + 1)) - 1] += temp;
+        }
+
+         /**
+          *  apply shared memory to scan each block
+          */
+        __global__ void kernBlockScan(int n, int* odata, const int* idata, int* blockSums) {
+            extern __shared__ int temp[];
+
+            int thid = threadIdx.x;
+            size_t index = blockIdx.x * blockDim.x + thid;
+
+            // Load input into shared memory with boundary checks
+            temp[2 * thid] = (2 * index < n) ? idata[2 * index] : 0;
+            temp[2 * thid + 1] = (2 * index + 1 < n) ? idata[2 * index + 1] : 0;
+            __syncthreads();
+
+            int offset = 1;
+           
+            // Up-sweep (reduce) phase
+            for (int d = blockDim.x; d > 0; d >>= 1) {
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    temp[bi] += temp[ai];
+                }
+                offset *= 2;
+            }
+
+            // Clear last element
+            if (thid == 0) {
+                blockSums[blockIdx.x] = temp[2 * blockDim.x - 1];
+                temp[2 * blockDim.x - 1] = 0;
+            }
+
+            // Down-sweep phase
+            for (int d = 1; d < 2 * blockDim.x; d *= 2) {
+                offset >>= 1;
+                __syncthreads();
+                if (thid < d) {
+                    int ai = offset * (2 * thid + 1) - 1;
+                    int bi = offset * (2 * thid + 2) - 1;
+                    int t = temp[ai];
+                    temp[ai] = temp[bi];
+                    temp[bi] += t;
+                }
+            }
+            __syncthreads();
+
+            // Write results to device memory with boundary checks
+            if (2 * index < n) {
+                odata[2 * index] = temp[2 * thid];
+            }
+
+            if (2 * index + 1 < n) {
+                odata[2 * index + 1] = temp[2 * thid + 1];
+            }
+        }
+
+        __global__ void kernAddScannedBlockSums(int n, int* odata, const int* blockSums) {
+            size_t index = blockIdx.x * blockDim.x + threadIdx.x;
+            int blockSum = (blockIdx.x > 0) ? blockSums[blockIdx.x - 1] : 0;
+
+            if (2 * index < n) {
+                odata[2 * index] += blockSum;
+            }
+
+            if (2 * index + 1 < n) {
+                odata[2 * index + 1] += blockSum;
+            }
+        }
+
+        /**
+         * Performs prefix-sum (aks scan) on idata using the shared memory, storing the result into odata
+         */
+        void scanShared(int n, int* odata, const int* idata) {
+            int* dev_in, * dev_out, * dev_blockSums;
+            
+            const int log2ceil = ilog2ceil(n);
+            const int fullSize = 1 << log2ceil;
+
+            int gridSize = (fullSize + DOUBLE_BLOCK_SIZE - 1) / (DOUBLE_BLOCK_SIZE);
+            // printf("gridSize: %d\n", gridSize);
+
+            // allocate gpu memory
+            cudaMalloc((void**)&dev_in, fullSize * sizeof(int));
+            cudaMemset(dev_in, 0, fullSize * sizeof(int));
+            cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+            cudaMalloc((void**)&dev_out, n * sizeof(int));
+
+            cudaMalloc((void**)&dev_blockSums, gridSize * sizeof(int));
+            checkCUDAErrorFn("malloc dev_blockSums failed!");
+
+            timer().startGpuTimer();
+            kernBlockScan << <gridSize, BLOCK_SIZE, DOUBLE_BLOCK_SIZE * sizeof(int) >> > (fullSize, dev_out, dev_in, dev_blockSums);
+            checkCUDAErrorFn("blockScan failed!");
+
+            int* blockSums = new int[gridSize];
+            cudaMemcpy(blockSums, dev_blockSums, gridSize * sizeof(int), cudaMemcpyDeviceToHost);
+   
+            for (int i = 1; i < gridSize; ++i) {
+                blockSums[i] += blockSums[i - 1];
+            }
+
+            cudaMemcpy(dev_blockSums, blockSums, gridSize * sizeof(int), cudaMemcpyHostToDevice);
+            delete[] blockSums;
+
+            kernAddScannedBlockSums << <gridSize, BLOCK_SIZE >> > (fullSize, dev_out, dev_blockSums);
+
+            timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+            cudaFree(dev_blockSums);
+        }
+
+        void scanHelper(int n, int log2ceil, int* dev_out) {
+            // up sweep 
+            for (int d = 0; d <= log2ceil - 1; ++d) {
+                // Adjust the grid size based on the depth of the sweep
+                int gridSize = (n / (2 << d) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+                kernUpSweep << <gridSize, BLOCK_SIZE >> > (n, d, dev_out);
+                checkCUDAErrorFn("up sweep failed!");
+            }
+
+            // set the last value to 0
+            cudaMemset(dev_out + n - 1, 0, sizeof(int));
+            checkCUDAErrorWithLine("set the last value to zero failed!");
+
+            // down sweep
+            for (int d = log2ceil - 1; d >= 0; --d) {
+                // Adjust the grid size based on the depth of the sweep
+                int gridSize = (n / (2 << d) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+                kernDownSweep << <gridSize, BLOCK_SIZE >> > (n, d, dev_out);
+                checkCUDAErrorFn("down sweep failed");
+            }
+
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_out;
+
+            const int log2ceil = ilog2ceil(n);
+            const long int fullSize = 1 << log2ceil;
+
+            cudaMalloc((void**)&dev_out, fullSize * sizeof(int));
+            cudaMemset(dev_out, 0, fullSize * sizeof(int));
+            cudaMemcpy(dev_out, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
-            // TODO
+
+            scanHelper(fullSize, log2ceil, dev_out);
+
             timer().endGpuTimer();
+
+            cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            // free memory
+            cudaFree(dev_out);
         }
 
+
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -30,11 +222,92 @@ namespace StreamCompaction {
          * @param idata  The array of elements to compact.
          * @returns      The number of elements remaining after compaction.
          */
-        int compact(int n, int *odata, const int *idata) {
+        int compact(int n, int* odata, const int* idata) {
+            int* dev_in, * dev_out, * dev_bools, * dev_scan;
+
+            int boolLastVal, scanLastVal;
+
+            int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+            cudaMalloc((void**)&dev_in, n * sizeof(int));
+            checkCUDAErrorFn("malloc dev_in failed!");
+            cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("copy idata to dev_in failed!");
+
+            cudaMalloc((void**)&dev_out, n * sizeof(int));
+            checkCUDAErrorFn("malloc dev_out failed!");
+
+            cudaMalloc((void**)&dev_bools, n * sizeof(int));
+            checkCUDAErrorFn("malloc dev_bools failed!");
+
+#if TIME_COMPACT
+            const int log2ceil = ilog2ceil(n);
+            const long int fullSize = 1 << log2ceil;
+
+            cudaMalloc((void**)&dev_scan, fullSize * sizeof(int));
+            checkCUDAErrorFn("malloc dev_scan failed!");
+            cudaMemset(dev_scan, 0, n * sizeof(int));
+#else
+            cudaMalloc((void**)&dev_scan, n * sizeof(int));
+            checkCUDAErrorFn("malloc dev_scan failed!");
+#endif
+
+#if TIME_COMPACT
             timer().startGpuTimer();
-            // TODO
+#endif
+            // map the bool array
+            StreamCompaction::Common::kernMapToBoolean << <gridSize, BLOCK_SIZE >> > (n, dev_bools, dev_in);
+            checkCUDAErrorFn("map bool array failed!");
+
+            
+#if TIME_COMPACT
+            // scan the bool array
+            cudaMemcpy(dev_scan, dev_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
+
+            // up sweep
+            for (int d = 0; d <= log2ceil - 1; ++d) {
+                int dynamicGridSize = (fullSize / (2 << d) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+                kernUpSweep << <dynamicGridSize, BLOCK_SIZE >> > (fullSize, d, dev_scan);
+                checkCUDAErrorFn("up sweep failed!");
+            }
+
+            // set the last value to 0
+            cudaMemset(dev_scan + fullSize - 1, 0, sizeof(int));
+            
+            // down sweep
+            for (int d = log2ceil - 1; d >= 0; --d) {
+                int dynamicGridSize = (fullSize / (2 << d) + BLOCK_SIZE - 1) / BLOCK_SIZE;
+                kernDownSweep << <dynamicGridSize, BLOCK_SIZE >> > (fullSize, d, dev_scan);
+                checkCUDAErrorFn("down sweep failed");
+            }
+#else 
+            // scan(n, dev_scan, dev_bools);
+            scanShared(n, dev_scan, dev_bools);
+#endif
+            // scatter
+            StreamCompaction::Common::kernScatter << <gridSize, BLOCK_SIZE >> > (n, dev_out, dev_in, dev_bools, dev_scan);
+            checkCUDAErrorFn("scatter failed!");
+#if TIME_COMPACT
             timer().endGpuTimer();
-            return -1;
+#endif
+            // store the last value of the bool array
+            cudaMemcpy(&boolLastVal, dev_bools + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("copy last bool value to host failed!");
+
+            // store the last value of the scan results
+            cudaMemcpy(&scanLastVal, dev_scan + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("copy last bool value to host failed!");
+
+            cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("copy dev_out to odata failed!");
+
+            // free memory
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+            cudaFree(dev_bools);
+            cudaFree(dev_scan);
+
+            return scanLastVal + boolLastVal;
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..ba37d8b 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -6,8 +6,12 @@ namespace StreamCompaction {
     namespace Efficient {
         StreamCompaction::Common::PerformanceTimer& timer();
 
+        void scanHelper(int n, int log2ceil, int* dev_out);
+
         void scan(int n, int *odata, const int *idata);
 
+        void scanShared(int n, int* odata, const int* idata);
+
         int compact(int n, int *odata, const int *idata);
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..cab33d5 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -3,6 +3,8 @@
 #include "common.h"
 #include "naive.h"
 
+#define BLOCK_SIZE 128
+
 namespace StreamCompaction {
     namespace Naive {
         using StreamCompaction::Common::PerformanceTimer;
@@ -11,15 +13,64 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+
+        __global__ void kernNaiveScan(int n, int d, int *odata, const int *idata) {
+            int index = threadIdx.x + blockDim.x * blockIdx.x;
+
+            if (index >= n) {
+                return;
+            }
+
+            if (index >= (1 << (d - 1))) {
+                odata[index] = idata[index - (1 << (d - 1))] + idata[index];
+            }
+            else {
+                odata[index] = idata[index];
+            }
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int* dev_in;
+            int* dev_out;
+
+            int gridSize = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+            // allocate memory
+            cudaMalloc((void**)&dev_in, n * sizeof(int));
+            checkCUDAErrorFn("malloc dev_in failed!");
+            cudaMalloc((void**)&dev_out, n * sizeof(int));
+            checkCUDAErrorFn("malloc dev_out failed!");
+
+            // populate dev_in
+            cudaMemcpy(dev_in, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAErrorFn("copy to dev_in failed!");
+
             timer().startGpuTimer();
-            // TODO
+            for (int d = 1; d <= ilog2ceil(n); ++d) {
+                kernNaiveScan << <gridSize, BLOCK_SIZE >> > (n, d, dev_out, dev_in);
+                checkCUDAErrorFn("kernNaiveScan failed!");
+
+                std::swap(dev_in, dev_out);
+            }
             timer().endGpuTimer();
+
+            // shift to exclusive
+            int zero = 0;
+            cudaMemcpy(&dev_out[0], &zero, 1 * sizeof(int), cudaMemcpyHostToDevice);
+            cudaMemcpy(&dev_out[1], dev_in, (n - 1) * sizeof(int), cudaMemcpyDeviceToDevice);
+            checkCUDAErrorFn("shift failed!");
+
+            cudaMemcpy(odata, dev_out, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAErrorFn("cudaMemcpy to odata failed!");
+
+            // free cuda memory
+            cudaFree(dev_in);
+            cudaFree(dev_out);
+
+            cudaDeviceSynchronize();
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..be47ae8 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,16 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            thrust::host_vector<int> host_in(idata, idata + n);
+
+            thrust::device_vector<int> dev_in = host_in;
+            thrust::device_vector<int> dev_out(n);
+
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dev_in.begin(), dev_in.end(), dev_out.begin());
             timer().endGpuTimer();
+
+            thrust::copy(dev_out.begin(), dev_out.end(), odata);
         }
     }
 }