diff --git a/README.md b/README.md
index 0e38ddb1..68c297b2 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,130 @@ CUDA Stream Compaction
 
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Hongyi Ding
+  * [LinkedIn](https://www.linkedin.com/in/hongyi-ding/), [personal website](https://johnnyding.com/)
+* Tested on: Windows 11, i7-12700 @ 2.10GHz 32GB, NVIDIA T1000 4GB (SEAS Virtual Lab)
 
-### (TODO: Your README)
+### Description
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+In this project, we implemented scan & stream compaction algorithms in different ways. We intend to compare the performance of different implementations, including cpu scan & compation, naive gpu scan, and scanning using the thrust library.
 
+### Performance Analysis
+
+This table shows the time needed for scanning over the number of numbers using different ways. The time is recorded in ms.
+
+| N    | CPU     | Naive GPU | Efficient GPU | thrust   |
+| ---- | ------- | --------- | ------------- | -------- |
+| 2^10 | 0.0016  | 1.68691   | 1.38016       | 0.981888 |
+| 2^15 | 0.054   | 1.84899   | 1.37459       | 1.06662  |
+| 2^20 | 1.7093  | 4.20902   | 2.68237       | 1.34141  |
+| 2^25 | 52.4132 | 86.1861   | 27.3095       | 3.1232   |
+| 2^26 | 105.806 | 171.778   | 52.3393       | 5.23088  |
+| 2^27 | 206.849 | 353.931   | 102.907       | 8.9111   |
+| 2^28 | 424.237 | 703.445   | 204.032       | 16.5718  |
+| 2^29 | 816.657 | 3192.93   | 407.458       | 78.364   |
+| 2^30 | 1662.63 | 14334     | 2523.06       | 404.013  |
+
+This table shows the time needed for compaction over the number of numbers using different ways. The time is recorded in ms.
+
+| N    | CPU without scan | CPU with scan | Efficient GPU |
+| ---- | ---------------- | ------------- | ------------- |
+| 2^10 | 0.0025           | 0.0042        | 0.206528      |
+| 2^15 | 0.067            | 0.1381        | 0.471424      |
+| 2^20 | 2.2095           | 4.8973        | 1.34582       |
+| 2^25 | 70.2928          | 156.767       | 33.5974       |
+| 2^26 | 143.941          | 323.207       | 65.5928       |
+| 2^27 | 282.771          | 652.236       | 132.566       |
+| 2^28 | 558.38           | 1376.53       | 281.205       |
+| 2^29 | 1003.12          | 2541.53       | 2543.02       |
+| 2^30 | 2273.97          | 6898.14       | 11972.5       |
+
+### Answer to Questions
+
+* Roughly optimize the block sizes of each of your implementations for minimal
+  run time on your GPU.
+
+  * This is the table of performance of efficient GPU scan over different block sizes
+
+    | Block Size  | 128     | 256     | 512    | 768     | 1024    |
+    | ----------- | ------- | ------- | ------ | ------- | ------- |
+    | Performance | 53.6786 | 52.3393 | 52.817 | 52.2205 | 52.7176 |
+
+    The difference is minor for different block sizes. So we just take the common block size of 256.
+
+* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and
+  Thrust) to the serial CPU version of Scan. Plot a graph of the comparison
+  (with array size on the independent axis).
+
+  * ![scan-performance](pics/scan-performance.png)
+  * ![compaction-performance](pics/compaction-performance.png)
+
+* Write a brief explanation of the phenomena you see here.
+
+  * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is
+    it different for each implementation?
+    * When N is small (`<2^20`), for all 3 GPU implementations, the overhead of initializing memory and transferring data is high. As we can observe in the graph, when `N<2^20`, cpu version has a better performance. So in this situation, the bottleneck is initializing and Host-Device communication.
+    * When N becomes larger (`2^20` to `2^28`), the gpu implementations are much better. The parallelism is making the the algorithm perform better than the single-thread version on gpu.
+    * When N grows very large (`>2^28`), due to limited GPU memory (which is only 4GB on the test machine), the program has to use shared memory, then the memory I/O has a much larger latency. So in this case, all 3 gpu implementations begin to perform worse and worse, even worse than the cpu version.
+
+* Paste the output of the test program into a triple-backtick block in your
+  README.
+
+  * this is the output when `N=1<<26`
+
+    ```
+    ****************
+    ** SCAN TESTS **
+    ****************
+        [   0   9  47  18  42  18  43  43  37  33   6  43  41 ...  15   0 ]
+    ==== cpu scan, power-of-two ====
+       elapsed time: 105.806ms    (std::chrono Measured)
+        [   0   0   9  56  74 116 134 177 220 257 290 296 339 ... 1643626790 1643626805 ]
+    ==== cpu scan, non-power-of-two ====
+       elapsed time: 106.817ms    (std::chrono Measured)
+        [   0   0   9  56  74 116 134 177 220 257 290 296 339 ... 1643626693 1643626732 ]
+        passed
+    ==== naive scan, power-of-two ====
+       elapsed time: 171.778ms    (CUDA Measured)
+        passed
+    ==== naive scan, non-power-of-two ====
+       elapsed time: 160.932ms    (CUDA Measured)
+        passed
+    ==== work-efficient scan, power-of-two ====
+       elapsed time: 52.3393ms    (CUDA Measured)
+        passed
+    ==== work-efficient scan, non-power-of-two ====
+       elapsed time: 50.5345ms    (CUDA Measured)
+        passed
+    ==== thrust scan, power-of-two ====
+       elapsed time: 5.23088ms    (CUDA Measured)
+        passed
+    ==== thrust scan, non-power-of-two ====
+       elapsed time: 4.78698ms    (CUDA Measured)
+        passed
+    
+    *****************************
+    ** STREAM COMPACTION TESTS **
+    *****************************
+        [   2   0   0   1   1   1   2   2   0   1   2   1   2 ...   0   0 ]
+    ==== cpu compact without scan, power-of-two ====
+       elapsed time: 143.941ms    (std::chrono Measured)
+        [   2   1   1   1   2   2   1   2   1   2   1   1   1 ...   3   1 ]
+        passed
+    ==== cpu compact without scan, non-power-of-two ====
+       elapsed time: 139.827ms    (std::chrono Measured)
+        [   2   1   1   1   2   2   1   2   1   2   1   1   1 ...   3   1 ]
+        passed
+    ==== cpu compact with scan ====
+       elapsed time: 323.207ms    (std::chrono Measured)
+        [   2   1   1   1   2   2   1   2   1   2   1   1   1 ...   3   1 ]
+        passed
+    ==== work-efficient compact, power-of-two ====
+       elapsed time: 65.5928ms    (CUDA Measured)
+        passed
+    ==== work-efficient compact, non-power-of-two ====
+       elapsed time: 65.4395ms    (CUDA Measured)
+        passed
+    ```
+
+    
\ No newline at end of file
diff --git a/pics/compaction-echarts.js b/pics/compaction-echarts.js
new file mode 100644
index 00000000..6b24a56e
--- /dev/null
+++ b/pics/compaction-echarts.js
@@ -0,0 +1,25 @@
+const CPU_without_scan = [0.0025, 0.067, 2.2095, 70.2928, 143.941, 282.771, 558.38, 1003.12, 2273.97];
+const CPU_with_scan = [0.0042, 0.1381, 4.8973, 156.767, 323.207, 652.236, 1376.53, 2541.53, 6898.14];
+const Efficient_GPU = [0.206528, 0.471424, 1.34582, 33.5974, 65.5928, 132.566, 281.205, 2543.02, 11972.5];
+
+const N = [10, 15, 20, 25, 26, 27, 28, 29, 30];
+const N_values = N.map(v => v);
+const offset= 3;
+
+function logAndOffset(arr, offset) {
+  return arr.map(v => Math.log10(v) + offset);
+}
+
+option = {
+  title: {
+    text: 'Compaction Performance'
+  },
+  legend: { data: ['CPU_without_scan', 'CPU_with_scan', 'Efficient_GPU'] },
+  xAxis: { type: 'value', name: 'log10(N)', min: 10, max: 30 },
+  yAxis: { type: 'value', name: 'log10(Time)' },
+  series: [
+    { name: 'CPU_without_scan', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(CPU_without_scan, offset)[i]]) },
+    { name: 'CPU_with_scan', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(CPU_with_scan, offset)[i]]) },
+    { name: 'Efficient_GPU', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(Efficient_GPU, offset)[i]]) },
+  ]
+};
\ No newline at end of file
diff --git a/pics/compaction-performance.png b/pics/compaction-performance.png
new file mode 100644
index 00000000..2ffbe177
Binary files /dev/null and b/pics/compaction-performance.png differ
diff --git a/pics/scan-echarts.js b/pics/scan-echarts.js
new file mode 100644
index 00000000..3a3badac
--- /dev/null
+++ b/pics/scan-echarts.js
@@ -0,0 +1,27 @@
+const CPU = [0.0016, 0.054, 1.7093, 52.4132, 105.806, 206.849, 424.237, 816.657, 1662.63];
+const NaiveGPU = [1.68691, 1.84899, 4.20902, 86.1861, 171.778, 353.931, 703.445, 3192.93, 14334];
+const EfficientGPU = [1.38016, 1.37459, 2.68237, 27.3095, 52.3393, 102.907, 204.032, 407.458, 2523.06];
+const Thrust = [0.981888, 1.06662, 1.34141, 3.1232, 5.23088, 8.9111, 16.5718, 78.364, 404.013];
+
+const N = [10, 15, 20, 25, 26, 27, 28, 29, 30];
+const N_values = N.map(v => v);
+const offset= 3;
+
+function logAndOffset(arr, offset) {
+  return arr.map(v => Math.log10(v) + offset);
+}
+
+option = {
+  title: {
+    text: 'Scan Performance'
+  },
+  legend: { data: ['CPU', 'Naive GPU', 'Efficient GPU', 'Thrust'] },
+  xAxis: { type: 'value', name: 'log10(N)', min: 10, max: 30 },
+  yAxis: { type: 'value', name: 'log10(Time)' },
+  series: [
+    { name: 'CPU', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(CPU, offset)[i]]) },
+    { name: 'Naive GPU', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(NaiveGPU, offset)[i]]) },
+    { name: 'Efficient GPU', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(EfficientGPU, offset)[i]]) },
+    { name: 'Thrust', type: 'line', data: N_values.map((x,i) => [x, logAndOffset(Thrust, offset)[i]]) },
+  ]
+};
\ No newline at end of file
diff --git a/pics/scan-performance.png b/pics/scan-performance.png
new file mode 100644
index 00000000..f4ed8dfb
Binary files /dev/null and b/pics/scan-performance.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 3d5c8820..1369212d 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,7 @@
 #include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
 
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 25; // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
 int *a = new int[SIZE];
 int *b = new int[SIZE];
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index 19511caa..c1a8eab6 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -20,9 +20,11 @@ list(SORT sources)
 source_group(Headers FILES ${headers})
 source_group(Sources FILES ${sources})
 
+find_package(CCCL REQUIRED)
 add_library(stream_compaction ${sources} ${headers})
+target_link_libraries(stream_compaction CCCL::Thrust) 
 if(CMAKE_VERSION VERSION_LESS "3.23.0")
-    set_target_properties(stream_compaction} PROPERTIES CUDA_ARCHITECTURES OFF)
+    set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES OFF)
 elseif(CMAKE_VERSION VERSION_LESS "3.24.0")
     set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES all-major)
 else()
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d630..04c7529f 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -23,7 +23,14 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index < n) {
+                if (idata[index] != 0) {
+                    bools[index] = 1;
+                } else {
+                    bools[index] = 0;
+				}
+            }
         }
 
         /**
@@ -32,7 +39,12 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
-            // TODO
+			int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+            if (index < n) {
+                if (bools[index] == 1) {
+                    odata[indices[index]] = idata[index];
+                }
+            }
         }
 
     }
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa115..7cbfaa96 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,7 +19,10 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+			odata[0] = 0;
+            for (int i = 1; i < n; i++) {
+                odata[i] = odata[i - 1] + idata[i - 1];
+			}
             timer().endCpuTimer();
         }
 
@@ -30,9 +33,15 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+			int i = 0, j = 0;
+			while (i < n) {
+				if (idata[i] != 0) {
+					odata[j++] = idata[i];
+				}
+                i++;
+			}
             timer().endCpuTimer();
-            return -1;
+            return j;
         }
 
         /**
@@ -41,10 +50,25 @@ namespace StreamCompaction {
          * @returns the number of elements remaining after compaction.
          */
         int compactWithScan(int n, int *odata, const int *idata) {
+            int* b = new int[n], * sum = new int[n];
             timer().startCpuTimer();
-            // TODO
+			for (int i = 0; i < n; i++) {
+				b[i] = (idata[i] != 0) ? 1 : 0;
+			}
+            sum[0] = 0;
+            for (int i = 1; i < n; i++) {
+                sum[i] = sum[i - 1] + b[i - 1];
+            }
+			int count = (n > 0) ? sum[n - 1] + b[n - 1] : 0;
+			for (int i = 0; i < n; i++) {
+				if (b[i] == 1) {
+					odata[sum[i]] = idata[i];
+				}
+			}
             timer().endCpuTimer();
-            return -1;
+            delete[] b;
+            delete[] sum;
+            return count;
         }
     }
 }
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346ee..be417acb 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -2,6 +2,9 @@
 #include <cuda_runtime.h>
 #include "common.h"
 #include "efficient.h"
+#include <iostream>
+
+#define NUM_PER_BLOCK 1024
 
 namespace StreamCompaction {
     namespace Efficient {
@@ -12,13 +15,116 @@ namespace StreamCompaction {
             return timer;
         }
 
+        int nextPow2(int n) {
+            int ret = 1;
+			while (ret < n) ret <<= 1;
+            return ret;
+        }
+
+        __global__ void scan(int n, int *idata) {
+			__shared__ int temp[NUM_PER_BLOCK];
+            n = NUM_PER_BLOCK;
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+
+            if (index < n) {
+                temp[index] = idata[index];
+            }
+
+            int offset = 1;
+            for (int d = n>>1; d > 0; d >>= 1) {
+                if (index < d) {
+                    int ai = offset * (2 * index + 2) - 1;
+                    int bi = offset * (2 * index + 1) - 1;
+                    if (ai >= n || bi >= n) continue;
+                    temp[ai] += temp[bi];
+                }
+                offset <<= 1;
+                __syncthreads();
+            }
+
+            // clear the last element
+			if (index == 0) {
+                temp[n - 1] = 0;
+            }
+            __syncthreads();
+
+            offset = n >> 1;
+            for (int d = 1; d < n; d <<= 1) {
+                if (index < d) {
+                    int ai = offset * (2 * index + 2) - 1;
+                    int bi = offset * (2 * index + 1) - 1;
+					if (ai >= n || bi >= n) continue;
+                    int t = temp[bi];
+                    temp[bi] = temp[ai];
+                    temp[ai] += t;
+                }
+				offset >>= 1;
+                __syncthreads();
+            }
+
+            if (index < n) {
+                idata[index] = temp[index];
+			}
+        }
+
+        __global__ void upsweep(int n, int* data, int d) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+			int offset = (n >> 1) / d;
+            if (index < d) {
+                int ai = offset * (2 * index + 2) - 1;
+                int bi = offset * (2 * index + 1) - 1;
+                data[ai] += data[bi];
+            }
+        }
+
+        __global__ void downsweep(int n, int* data, int d) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            int offset = (n >> 1) / d;
+            if (index < d) {
+                int ai = offset * (2 * index + 2) - 1;
+                int bi = offset * (2 * index + 1) - 1;
+                int t = data[bi];
+                data[bi] = data[ai];
+                data[ai] += t;
+            }
+        }
+
+        void scanOnDevice(int N, int* d_idata) {
+            int blockSize = 256;
+            int numBlocks = (N + blockSize - 1) / blockSize;
+            for (int d = N >> 1; d > 0; d >>= 1) {
+                upsweep << <(d + blockSize - 1) / blockSize, blockSize >> > (N, d_idata, d);
+            }
+            //cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            //for (int i = 0; i < n; i++) {
+            //	std::cout << odata[i] << " ";
+            //}
+            //std::cout << std::endl;
+            cudaMemset(d_idata + N - 1, 0, sizeof(int));
+            for (int d = 1; d < N; d <<= 1) {
+                downsweep << <(d + blockSize - 1) / blockSize, blockSize >> > (N, d_idata, d);
+            }
+            //cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            //for (int i = 0; i < n; i++) {
+            //    std::cout << odata[i] << " ";
+            //}
+            //std::cout << std::endl;
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+			int *d_idata;
+			int N = nextPow2(n);
+			cudaMalloc((void**)&d_idata, N * sizeof(int));
+			cudaMemset(d_idata, 0, N * sizeof(int));
+			cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
             timer().startGpuTimer();
-            // TODO
+			scanOnDevice(N, d_idata);
             timer().endGpuTimer();
+			cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+			cudaFree(d_idata);
         }
 
         /**
@@ -31,10 +137,32 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            int* d_bools;
+			int* d_indices;
+			int* d_idata;
+			int* d_odata;
+			int N = nextPow2(n);
+			cudaMalloc((void**)&d_bools, n * sizeof(int));
+			cudaMalloc((void**)&d_indices, N * sizeof(int));
+			cudaMalloc((void**)&d_idata, n * sizeof(int));
+			cudaMalloc((void**)&d_odata, n * sizeof(int));
+            cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
             timer().startGpuTimer();
-            // TODO
+			int blockSize = 256;
+			StreamCompaction::Common::kernMapToBoolean << <(n + blockSize - 1) / blockSize, blockSize >> > (n, d_bools, d_idata);
+			cudaMemcpy(d_indices, d_bools, n * sizeof(int), cudaMemcpyDeviceToDevice);
+			scanOnDevice(N, d_indices);
+            StreamCompaction::Common::kernScatter << < (n + blockSize - 1) / blockSize, blockSize >> > (n, d_odata, d_idata, d_bools, d_indices);
             timer().endGpuTimer();
-            return -1;
+            int sum;
+			cudaMemcpy(&sum, d_indices + n - 1, sizeof(int), cudaMemcpyDeviceToHost);
+			if (idata[n - 1] != 0) sum++;
+			cudaMemcpy(odata, d_odata, sum * sizeof(int), cudaMemcpyDeviceToHost);
+			cudaFree(d_bools);
+			cudaFree(d_indices);
+			cudaFree(d_idata);
+			cudaFree(d_odata);
+            return sum;
         }
     }
 }
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 43088769..93ff151f 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,39 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+        __global__ void scan(int n, int *odata, const int *idata, int offset) {
+            int index = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (index >= n) return;
+            if (index >= offset) {
+                odata[index] = idata[index - offset] + idata[index];
+            } else {
+                odata[index] = idata[index];
+            }
+		}
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+			int* d_idata, * d_odata;
+			cudaMalloc((void**)&d_idata, n * sizeof(int));
+			cudaMalloc((void**)&d_odata, n * sizeof(int));
+			cudaMemcpy(d_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
             timer().startGpuTimer();
-            // TODO
+            int blockSize = 256;
+            int numBlocks = (n + blockSize - 1) / blockSize;
+            for (int offset = 1; offset < n; offset *= 2) {
+                scan << <numBlocks, blockSize >> > (n, d_odata, d_idata, offset);
+                std::swap(d_odata, d_idata);
+			}
+            cudaMemcpy(odata, d_idata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            for (int i = n - 1; i > 0; i--) {
+                odata[i] = odata[i - 1];
+            }
+			odata[0] = 0;
             timer().endGpuTimer();
+            cudaFree(d_idata);
+            cudaFree(d_odata);
         }
     }
 }
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e7..d4fa9641 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,16 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+			thrust::host_vector<int> h_idata(idata, idata + n);
+			thrust::device_vector<int> d_idata = h_idata;
+			thrust::device_vector<int> d_odata(n);
             timer().startGpuTimer();
             // TODO use `thrust::exclusive_scan`
             // example: for device_vectors dv_in and dv_out:
             // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            thrust::exclusive_scan(d_idata.begin(), d_idata.end(), d_odata.begin());
             timer().endGpuTimer();
+			thrust::copy(d_odata.begin(), d_odata.end(), odata);
         }
     }
 }