diff --git a/README.md b/README.md
index 0e38ddb..ec19c21 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,104 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Zhiyu Lei
+  * [LinkedIn](https://www.linkedin.com/in/zhiyu-lei/), [Github](https://github.com/Zhiyu-Lei)
+* Tested on: Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (CETS Virtual Lab)
 
-### (TODO: Your README)
+### Project Description
+* CPU Scan & Stream Compaction & Quick Sort ([stream_compaction/cpu.cu](stream_compaction/cpu.cu))
+* Naive GPU Scan Algorithm ([stream_compaction/naive.cu](stream_compaction/naive.cu))
+* Work-Efficient GPU Scan & Stream Compaction([stream_compaction/efficient.cu](stream_compaction/efficient.cu))
+* Using Thrust's Implementation ([stream_compaction/thrust.cu](stream_compaction/thrust.cu))
+* Radix Sort ([stream_compaction/radix_sort.cu](stream_compaction/radix_sort.cu))
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Performance Analysis
+#### Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU.
+The following table shows a comparison of run time (in milliseconds) between various block sizes for each of the implementations. The run time is measured by scanning an array of size $2^{20}$. The block size does not affect performance very significantly, but a block size of 128 seems to be optimal.
+block size|naive scan|work-efficient scan|thrust scan
+:---:|:---:|:---:|:---:
+64|1.6761|3.0861|0.1686
+128|1.5749|1.9997|0.1480
+256|1.8605|2.1077|0.1639
+512|1.6586|2.5638|0.1679
 
+#### Compare all of these GPU Scan implementations to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis).
+![](img/README/time-size.png)
+With a smaller array size, CPU scan is faster than GPU scan; but with a larger array size, GPU scan, especially Thrust's implementation, tends to be faster, and work-efficient scan also becomes faster than naive scan. Theoretically, GPU scan algorithms' run time increases logarithmically against the array size, but the plot does not show any sublinear trend.
+
+#### Write a brief explanation of the phenomena you see here.
+Since I implemented both naive and work-efficient scan algorithms using global memory, the performance bottlenecks were mainly memory I/O. Accessing to global memory is more costly than accessing to shared memory. As for Thrust's implementation, the Nsight timeline shows the occupancy is full, so it tends to use the computability as much as possible.
+
+#### Test Program Output
+Array size is $2^{20}$, and array values are in range $[0,1000)$. Radix sort tests are added.
+```
+****************
+** SCAN TESTS **
+****************
+    [ 559 897 331 240 911 774 261 359 471 923 455 970 436 ... 674   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 1.7442ms    (std::chrono Measured)
+    [   0 559 1456 1787 2027 2938 3712 3973 4332 4803 5726 6181 7151 ... 521313475 521314149 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 1.7567ms    (std::chrono Measured)
+    [   0 559 1456 1787 2027 2938 3712 3973 4332 4803 5726 6181 7151 ... 521311914 521312911 ]
+    passed
+==== naive scan, power-of-two ====
+   elapsed time: 1.56285ms    (CUDA Measured)
+    passed
+==== naive scan, non-power-of-two ====
+   elapsed time: 1.55731ms    (CUDA Measured)
+    passed
+==== work-efficient scan, power-of-two ====
+   elapsed time: 1.99274ms    (CUDA Measured)
+    passed
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 1.99523ms    (CUDA Measured)
+    passed
+==== thrust scan, power-of-two ====
+   elapsed time: 0.187808ms    (CUDA Measured)
+    passed
+==== thrust scan, non-power-of-two ====
+   elapsed time: 0.166112ms    (CUDA Measured)
+    passed
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [ 559 897 331 240 911 774 261 359 471 923 455 970 436 ... 674   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 0.3507ms    (std::chrono Measured)
+    [ 559 897 331 240 911 774 261 359 471 923 455 970 436 ... 356 674 ]
+    passed
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 0.4523ms    (std::chrono Measured)
+    [ 559 897 331 240 911 774 261 359 471 923 455 970 436 ... 997 208 ]
+    passed
+==== cpu compact with scan ====
+   elapsed time: 3.6566ms    (std::chrono Measured)
+    [ 559 897 331 240 911 774 261 359 471 923 455 970 436 ... 356 674 ]
+    passed
+==== work-efficient compact, power-of-two ====
+   elapsed time: 2.19942ms    (CUDA Measured)
+    passed
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 4.09008ms    (CUDA Measured)
+    passed
+
+**********************
+** RADIX SORT TESTS **
+**********************
+    [ 559 897 331 240 911 774 261 359 471 923 455 970 436 ... 674   0 ]
+==== cpu sort, power-of-two ====
+   elapsed time: 50.9862ms    (std::chrono Measured)
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ... 999 999 ]
+==== radix sort, power-of-two ====
+   elapsed time: 74.2602ms    (CUDA Measured)
+    passed
+==== cpu sort, non-power-of-two ====
+   elapsed time: 53.0439ms    (std::chrono Measured)
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ... 999 999 ]
+==== radix sort, non-power-of-two ====
+   elapsed time: 71.4663ms    (CUDA Measured)
+    passed
+```
\ No newline at end of file
diff --git a/img/README/time-size.png b/img/README/time-size.png
new file mode 100644
index 0000000..e56dbe4
Binary files /dev/null and b/img/README/time-size.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..24f5248 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -11,9 +11,10 @@
 #include <stream_compaction/naive.h>
 #include <stream_compaction/efficient.h>
 #include <stream_compaction/thrust.h>
+#include <stream_compaction/radix_sort.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 20; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
@@ -27,7 +28,7 @@ int main(int argc, char* argv[]) {
     printf("** SCAN TESTS **\n");
     printf("****************\n");
 
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
+    genArray(SIZE - 1, a, 1000);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
@@ -102,7 +103,7 @@ int main(int argc, char* argv[]) {
 
     // Compaction tests
 
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
+    genArray(SIZE - 1, a, 1000);  // Leave a 0 at the end to test that edge case
     a[SIZE - 1] = 0;
     printArray(SIZE, a, true);
 
@@ -147,6 +148,43 @@ int main(int argc, char* argv[]) {
     //printArray(count, c, true);
     printCmpLenResult(count, expectedNPOT, b, c);
 
+    printf("\n");
+    printf("**********************\n");
+    printf("** RADIX SORT TESTS **\n");
+    printf("**********************\n");
+
+    // Radix sort tests
+
+    genArray(SIZE - 1, a, 1000);  // Leave a 0 at the end to test that edge case
+    a[SIZE - 1] = 0;
+    printArray(SIZE, a, true);
+
+    zeroArray(SIZE, b);
+    printDesc("cpu sort, power-of-two");
+    StreamCompaction::CPU::sort(SIZE, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printArray(SIZE, b, true);
+
+    zeroArray(SIZE, c);
+    printDesc("radix sort, power-of-two");
+    StreamCompaction::RadixSort::sort(SIZE, c, a);
+    printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(SIZE, b, c);
+
+    zeroArray(SIZE, b);
+    printDesc("cpu sort, non-power-of-two");
+    StreamCompaction::CPU::sort(NPOT, b, a);
+    printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
+    printArray(NPOT, b, true);
+
+    zeroArray(SIZE, c);
+    printDesc("radix sort, non-power-of-two");
+    StreamCompaction::RadixSort::sort(NPOT, c, a);
+    printElapsedTime(StreamCompaction::RadixSort::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+    //printArray(SIZE, c, true);
+    printCmpResult(NPOT, b, c);
+
     system("pause"); // stop Win32 console from closing on exit
     delete[] a;
     delete[] b;
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index 567795b..7b34ba9 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -4,6 +4,7 @@ set(headers
     "naive.h"
     "efficient.h"
     "thrust.h"
+    "radix_sort.h"
     )
 
 set(sources
@@ -12,6 +13,7 @@ set(sources
     "naive.cu"
     "efficient.cu"
     "thrust.cu"
+    "radix_sort.cu"
     )
 
 list(SORT headers)
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..f1dcf67 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,10 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index < n) {
+                bools[index] = idata[index] ? 1 : 0;
+            }
         }
 
         /**
@@ -32,7 +35,10 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
-            // TODO
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index < n && bools[index]) {
+                odata[indices[index]] = idata[index];
+            }
         }
 
     }
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..eb1daf2 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -12,6 +12,7 @@
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
 #define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 128
 
 /**
  * Check for CUDA errors; print and exit if there was a problem.
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..e349862 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,10 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+            }
             timer().endCpuTimer();
         }
 
@@ -30,9 +33,14 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int idx = 0;
+            for (int i = 0; i < n; i++) {
+                if (idata[i]) {
+                    odata[idx++] = idata[i];
+                }
+            }
             timer().endCpuTimer();
-            return -1;
+            return idx;
         }
 
         /**
@@ -42,9 +50,36 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+            int* bools = new int[n];
+            for (int i = 0; i < n; i++) {
+                bools[i] = idata[i] ? 1 : 0;
+            }
+            int* indices = new int[n];
+            indices[0] = 0;
+            for (int i = 1; i < n; i++) {
+                indices[i] = indices[i - 1] + bools[i - 1];
+            }
+            for (int i = 0; i < n; i++) {
+                if (bools[i]) {
+                    odata[indices[i]] = idata[i];
+                }
+            }
+            int count = bools[n - 1] + indices[n - 1];
+            delete[] bools;
+            delete[] indices;
+            timer().endCpuTimer();
+            return count;
+        }
+        
+        int compare(const void *a, const void *b) {
+            return (*(int*)a - *(int*)b);
+        }
+
+        void sort(int n, int *odata, const int *idata) {
+            timer().startCpuTimer();
+            memcpy(odata, idata, n * sizeof(int));
+            qsort(odata, n, sizeof(int), compare);
             timer().endCpuTimer();
-            return -1;
         }
     }
 }
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h
index 873c047..ffabe81 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.h
@@ -11,5 +11,7 @@ namespace StreamCompaction {
         int compactWithoutScan(int n, int *odata, const int *idata);
 
         int compactWithScan(int n, int *odata, const int *idata);
+
+        void sort(int n, int *odata, const int *idata);
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..c68501c 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,67 @@ namespace StreamCompaction {
             return timer;
         }
 
+        __global__ void kernUpSweep(int n, int *data, int step) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index < n && index % step == 0) {
+                data[index + step - 1] += data[index + (step >> 1) - 1];
+            }
+        }
+
+        __global__ void kernSetLastZero(int n, int *data) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index == n - 1) {
+                data[index] = 0;
+            }
+        }
+
+        __global__ void kernDownSweep(int n, int *data, int step) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index < n && index % step == 0) {
+                int idx1 = index + (step >> 1) - 1, idx2 = index + step - 1;
+                int t = data[idx1];
+                data[idx1] = data[idx2];
+                data[idx2] += t;
+            }
+        }
+
+        __global__ void kernCount(int n, int *count, const int *idata, const int *indices) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index == n - 1) {
+                count[0] = indices[index] + (idata[index] ? 1 : 0);
+            }
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int *dev_data;
+            int rounds = ilog2ceil(n);
+            int size = 1 << rounds;
+            cudaMalloc((void**)&dev_data, size * sizeof(int));
+            checkCUDAError("cudaMalloc dev_data failed!");
+            cudaMemcpy(dev_data, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            dim3 blocks((n + blockSize - 1) / blockSize);
+            
             timer().startGpuTimer();
-            // TODO
+            int step = 2;
+            for (int i = 0; i < rounds; i++) {
+                kernUpSweep<<<blocks, blockSize>>>(size, dev_data, step);
+                checkCUDAError("kernUpSweep failed!");
+                step <<= 1;
+            }
+            kernSetLastZero<<<blocks, blockSize>>>(size, dev_data);
+            checkCUDAError("kernSetLastZero failed!");
+            step = size;
+            for (int i = 0; i < rounds; i++) {
+                kernDownSweep<<<blocks, blockSize>>>(size, dev_data, step);
+                checkCUDAError("kernDownSweep failed!");
+                step >>= 1;
+            }
             timer().endGpuTimer();
+            cudaMemcpy(odata, dev_data, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_data);
         }
 
         /**
@@ -31,10 +85,49 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            int *dev_odata, *dev_idata, *dev_indices, *dev_count;
+            int rounds = ilog2ceil(n);
+            int size = 1 << rounds;
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMalloc((void**)&dev_indices, size * sizeof(int));
+            checkCUDAError("cudaMalloc dev_indices failed!");
+            cudaMalloc((void**)&dev_count, sizeof(int));
+            checkCUDAError("cudaMalloc dev_count failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
             timer().startGpuTimer();
-            // TODO
+            StreamCompaction::Common::kernMapToBoolean<<<blocks, blockSize>>>(n, dev_indices, dev_idata);
+            checkCUDAError("kernMapToBoolean failed!");
+            int step = 2;
+            for (int i = 0; i < rounds; i++) {
+                kernUpSweep<<<blocks, blockSize>>>(size, dev_indices, step);
+                checkCUDAError("kernUpSweep failed!");
+                step <<= 1;
+            }
+            kernSetLastZero<<<blocks, blockSize>>>(size, dev_indices);
+            checkCUDAError("kernSetLastZero failed!");
+            step = size;
+            for (int i = 0; i < rounds; i++) {
+                kernDownSweep<<<blocks, blockSize>>>(size, dev_indices, step);
+                checkCUDAError("kernDownSweep failed!");
+                step >>= 1;
+            }
+            StreamCompaction::Common::kernScatter<<<blocks, blockSize>>>(n, dev_odata, dev_idata, dev_idata, dev_indices);
+            kernCount<<<blocks, blockSize>>>(n, dev_count, dev_idata, dev_indices);
+            checkCUDAError("kernCount failed!");
             timer().endGpuTimer();
-            return -1;
+            int count;
+            cudaMemcpy(&count, dev_count, sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(odata, dev_odata, count * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_odata);
+            cudaFree(dev_idata);
+            cudaFree(dev_indices);
+            cudaFree(dev_count);
+            return count;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..7e1e231 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,48 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+
+        __global__ void kernScan(int n, int *odata, const int *idata, int offset) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index < n) {
+                odata[index] = idata[index] + (index >= offset ? idata[index - offset] : 0);
+            }
+        }
+
+        __global__ void kernShift(int n, int *odata, const int *idata) {
+            int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index < n) {
+                odata[index] = index ? idata[index - 1] : 0;
+            }
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            int *dev_odata, *dev_idata;
+            int rounds = ilog2ceil(n);
+            cudaMalloc((void**)&dev_odata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_odata failed!");
+            cudaMalloc((void**)&dev_idata, n * sizeof(int));
+            checkCUDAError("cudaMalloc dev_idata failed!");
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            dim3 blocks((n + blockSize - 1) / blockSize);
+
             timer().startGpuTimer();
-            // TODO
+            int offset = 1;
+            for (int i = 0; i < rounds; i++) {
+                kernScan<<<blocks, blockSize>>>(n, dev_odata, dev_idata, offset);
+                checkCUDAError("kernScan failed!");
+                std::swap(dev_odata, dev_idata);
+                offset <<= 1;
+            }
+            kernShift<<<blocks, blockSize>>>(n, dev_odata, dev_idata);
+            checkCUDAError("kernShift failed!");
             timer().endGpuTimer();
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            cudaFree(dev_odata);
+            cudaFree(dev_idata);
         }
     }
 }
diff --git a/stream_compaction/radix_sort.cu b/stream_compaction/radix_sort.cu
new file mode 100644
index 0000000..2d1d5d1
--- /dev/null
+++ b/stream_compaction/radix_sort.cu
@@ -0,0 +1,124 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include "common.h"
+#include "efficient.h"
+
+namespace StreamCompaction {
+	namespace RadixSort {
+		using StreamCompaction::Common::PerformanceTimer;
+		PerformanceTimer& timer()
+		{
+			static PerformanceTimer timer;
+			return timer;
+		}
+
+		__global__ void kernInitEF(int n, int *e, int *f, const int *idata, int bit) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index < n) {
+				e[index] = (idata[index] & (1 << bit)) ? 0 : 1;
+				f[index] = e[index];
+			}
+		}
+
+		__global__ void kernUpSweep(int n, int *data, int step) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index < n && index % step == 0) {
+				data[index + step - 1] += data[index + (step >> 1) - 1];
+			}
+		}
+
+		__global__ void kernSetLastZero(int n, int *data) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index == n - 1) {
+				data[index] = 0;
+			}
+		}
+
+		__global__ void kernDownSweep(int n, int *data, int step) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index < n && index % step == 0) {
+				int idx1 = index + (step >> 1) - 1, idx2 = index + step - 1;
+				int t = data[idx1];
+				data[idx1] = data[idx2];
+				data[idx2] += t;
+			}
+		}
+
+		__global__ void kernTotalFalses(int n, int *count, const int *e, const int *f) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index == n - 1) {
+				count[0] = e[index] + f[index];
+			}
+		}
+
+		__global__ void kernComputeT(int n, int *t, const int *f, const int *count) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index < n) {
+				t[index] = index - f[index] + count[0];
+			}
+		}
+
+		__global__ void kernScatterOut(int n, int *odata, const int *idata, const int *e, const int *f, const int *t) {
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+			if (index < n) {
+				int d = e[index] ? f[index] : t[index];
+				odata[d] = idata[index];
+			}
+		}
+
+		void sort(int n, int *odata, const int *idata) {
+			int *dev_odata, *dev_idata, *dev_e, *dev_f, *dev_t, *dev_count;
+			int rounds = ilog2ceil(n);
+			int size = 1 << rounds;
+			cudaMalloc((void**)&dev_odata, n * sizeof(int));
+			checkCUDAError("cudaMalloc dev_odata failed!");
+			cudaMalloc((void**)&dev_idata, n * sizeof(int));
+			checkCUDAError("cudaMalloc dev_idata failed!");
+			cudaMalloc((void**)&dev_e, n * sizeof(int));
+			checkCUDAError("cudaMalloc dev_e failed!");
+			cudaMalloc((void**)&dev_f, size * sizeof(int));
+			checkCUDAError("cudaMalloc dev_f failed!");
+			cudaMalloc((void**)&dev_t, n * sizeof(int));
+			checkCUDAError("cudaMalloc dev_t failed!");
+			cudaMalloc((void**)&dev_count, sizeof(int));
+			checkCUDAError("cudaMalloc dev_count failed!");
+			cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+			dim3 blocks((n + blockSize - 1) / blockSize);
+
+			timer().startGpuTimer();
+			for (int bit = 0; bit < 31; bit++) {
+				kernInitEF<<<blocks, blockSize>>>(n, dev_e, dev_f, dev_idata, bit);
+				checkCUDAError("kernInitEF failed!");
+				int step = 2;
+				for (int i = 0; i < rounds; i++) {
+					kernUpSweep<<<blocks, blockSize>>>(size, dev_f, step);
+					checkCUDAError("kernUpSweep failed!");
+					step <<= 1;
+				}
+				kernSetLastZero<<<blocks, blockSize>>>(size, dev_f);
+				checkCUDAError("kernSetLastZero failed!");
+				step = size;
+				for (int i = 0; i < rounds; i++) {
+					kernDownSweep<<<blocks, blockSize>>>(size, dev_f, step);
+					checkCUDAError("kernDownSweep failed!");
+					step >>= 1;
+				}
+				kernTotalFalses<<<blocks, blockSize>>>(n, dev_count, dev_e, dev_f);
+				checkCUDAError("kernTotalFalses failed!");
+				kernComputeT<<<blocks, blockSize>>>(n, dev_t, dev_f, dev_count);
+				checkCUDAError("kernComputeT failed!");
+				kernScatterOut<<<blocks, blockSize>>>(n, dev_odata, dev_idata, dev_e, dev_f, dev_t);
+				checkCUDAError("kernScatterOut failed!");
+				std::swap(dev_odata, dev_idata);
+			}
+			timer().endGpuTimer();
+			cudaMemcpy(odata, dev_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+			cudaFree(dev_odata);
+			cudaFree(dev_idata);
+			cudaFree(dev_e);
+			cudaFree(dev_f);
+			cudaFree(dev_t);
+			cudaFree(dev_count);
+		}
+	}
+}
diff --git a/stream_compaction/radix_sort.h b/stream_compaction/radix_sort.h
new file mode 100644
index 0000000..27baf3f
--- /dev/null
+++ b/stream_compaction/radix_sort.h
@@ -0,0 +1,11 @@
+#pragma once
+
+#include "common.h"
+
+namespace StreamCompaction {
+	namespace RadixSort {
+		StreamCompaction::Common::PerformanceTimer& timer();
+
+		void sort(int n, int *odata, const int *idata);
+	}
+}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..312a84a 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,12 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            thrust::device_vector<int> dv_in(idata, idata + n);
+            thrust::device_vector<int> dv_out(n);
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
             timer().endGpuTimer();
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
         }
     }
 }