diff --git a/.clang-format b/.clang-format
new file mode 100644
index 00000000..c7a016af
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,6 @@
+---
+Language: Cpp
+IndentWidth: 4
+TabWidth: 4
+UseTab: Never
+ColumnLimit: 100
diff --git a/.clangd b/.clangd
new file mode 100644
index 00000000..8aefc26b
--- /dev/null
+++ b/.clangd
@@ -0,0 +1,55 @@
+# Apply this config conditionally to all C files
+If:
+  PathMatch: .*\.(c|h)$
+CompileFlags:
+  Compiler: /usr/bin/gcc
+
+---
+
+# Apply this config conditionally to all C++ files
+If:
+  PathMatch: .*\.(c|h)pp
+CompileFlags:
+  Compiler: /usr/bin/g++
+
+---
+
+# Apply this config conditionally to all CUDA files
+If:
+  PathMatch: .*\.cuh?
+CompileFlags:
+  Compiler: /usr/local/cuda/bin/nvcc
+
+---
+
+# Tweak the clangd parse settings for all files
+CompileFlags:
+  Add:
+    # report all errors
+    - "-ferror-limit=0"
+    - "-I/usr/local/cuda/include/cccl"
+  Remove:
+    # strip CUDA fatbin args
+    - "-Xfatbin*"
+    # strip CUDA arch flags
+    - "-gencode*"
+    - "--generate-code*"
+    # strip CUDA flags unknown to clang
+    - "-ccbin*"
+    - "--compiler-options*"
+    - "--expt-extended-lambda"
+    - "--expt-relaxed-constexpr"
+    - "-forward-unknown-to-host-compiler"
+    - "-Werror=cross-execution-space-call"
+    - "-arch=native"
+    - "--options-file"
+    - "-G"
+
+Hover:
+  ShowAKA: No
+InlayHints:
+  Enabled: No
+Diagnostics:
+  Suppress:
+    - "variadic_device_fn"
+    - "attributes_not_allowed"
diff --git a/.gitignore b/.gitignore
index a59ec565..2cbb92eb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,7 +25,8 @@ build
 .LSOverride
 
 # Icon must end with two \r
-Icon
+Icon
+
 
 # Thumbnails
 ._*
@@ -256,6 +257,11 @@ bld/
 # Uncomment if you have tasks that create the project's static files in wwwroot
 #wwwroot/
 
+.vscode/*
+!.vscode/launch.json
+!.vscode/extensions.json
+!.vscode/settings.json
+
 # MSTest test Results
 [Tt]est[Rr]esult*/
 [Bb]uild[Ll]og.*
@@ -269,6 +275,9 @@ TestResult.xml
 [Rr]eleasePS/
 dlldata.c
 
+# Clangd cache
+.cache/clangd
+
 # DNX
 project.lock.json
 artifacts/
diff --git a/.vscode/launch.json b/.vscode/launch.json
new file mode 100644
index 00000000..d1ddfabd
--- /dev/null
+++ b/.vscode/launch.json
@@ -0,0 +1,25 @@
+{
+  "$schema": "vscode://schemas/launch",
+  // Use IntelliSense to learn about possible attributes.
+  // Hover to view descriptions of existing attributes.
+  // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
+  "version": "0.2.0",
+  "configurations": [
+    {
+      "name": "CUDA C++: Launch",
+      "type": "cuda-gdb",
+      "request": "launch",
+      "environment": [
+        {"name": "WAYLAND_DISPLAY", "value": ""},
+        {"name": "XDG_SESSION_TYPE", "value": "x11"}
+      ],
+      "program": "${workspaceFolder}/build/bin/cis5650_stream_compaction_test",
+      "cwd": "${workspaceFolder}"
+    },
+    {
+      "name": "CUDA C++: Attach",
+      "type": "cuda-gdb",
+      "request": "attach"
+    }
+  ]
+}
\ No newline at end of file
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 00000000..ce16cbc6
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,15 @@
+{
+  "files.associations": {
+    "*.cu": "cuda-cpp"
+  },
+  "[cpp]": {
+    "editor.defaultFormatter": "llvm-vs-code-extensions.vscode-clangd"
+  },
+  "[cuda-cpp]": {
+    "editor.defaultFormatter": "llvm-vs-code-extensions.vscode-clangd"
+  },
+  "[python]": {
+    "editor.defaultFormatter": "charliermarsh.ruff",
+    "editor.tabSize": 4,
+  },
+}
\ No newline at end of file
diff --git a/README.md b/README.md
index 0e38ddb1..ed343098 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,150 @@
 CUDA Stream Compaction
 ======================
 
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
+![](img/graphs/scan_performance_nonpow2.png)
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+Q: (why is thrust so slow???) A: because I forgot to build binaries in release mode.
 
-### (TODO: Your README)
+**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 2**
 
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+* Thomas Shaw
+  * [LinkedIn](https://www.linkedin.com/in/thomas-shaw-54468b222), [personal website](https://tlshaw.me), [GitHub](https://github.com/printer83mph), etc.
+* Tested on: Fedora 42, Ryzen 7 5700x @ 4.67GHz, 32GB, RTX 2070 8GB
 
+
+## Features
+
+- CUDA exclusive scan and compaction implementations!
+- Work-efficient algorithm speeds up scan even more!
+  - Uses segmented blocks running in-place at the same time without worry of race condition, with arbitrary size input... can be adapted to use shared memory!
+- Faster scan than CPU!!
+- GPU Radix Sort implementation
+  - (tested against CPU version in `main`)
+- Python [analysis module](./analysis/README.md) for spitting out nice CSV files measuring performance
+
+
+## Performance Analysis
+
+Python scripts have been created in `analysis/` for easier stat collection. The [README](./analysis/README.md) within provides info on how to run these.
+
+### Block Size Optimizations
+
+Let's take a look at the resulting performance from block size choices, all normalized:
+
+![](img/graphs/block_sizes_all_normalized.png)
+
+Performance seems to fluctuate differently per-algorithm between possible block sizes from 64 and 1024. 
+
+See below all the different algorithms independently:
+| | | |
+|---|---|---|
+|  ![](img/graphs/block_sizes_scan_naive.png) Naive Scan | ![](img/graphs/block_sizes_scan_work_efficient.png) Work Efficient Scan | ![](img/graphs/block_sizes_scan_thrust.png) Thrust Scan |
+|  | ![](img/graphs/block_sizes_compact.png) Stream Compaction (Work-Efficient Scan) | ![](img/graphs/block_sizes_radix.png) Radix |
+
+It seems that the optimal block size for this machine is somewhere between 128 and 512, but that really depends on the algorithm.
+
+### Scan Implementation Comparisons
+
+See below the performance change over different array sizes, when aligned with powers of 2, and when not.
+
+![](img/graphs/scan_performance_nonpow2.png)
+
+![](img/graphs/scan_performance_pow2.png)
+
+Our Work-Efficient GPU Scan outperforms the Naive one at almost all array sizes. This is to be expected.
+
+The CPU eventually falls behind the work-efficient solution.
+
+It seems that Thrust is generally much faster than our solutions. What black magic are they working?
+
+### NSight analysis
+
+![](img/graphs/nsight_work_efficient.png)
+
+The above is our timeline view in NSight Systems.
+
+![](img/graphs/nsight_thrust.png)
+
+And the above is thrust.
+
+It seems they perform the entire operation inside a single kernel. They likely utilize shared memory access patterns, or somehow improve order-of-operations when it comes to memory access and computations.
+
+### Test output at n = 2^27
+
+```
+****************
+** SCAN TESTS **
+****************
+    [  45   9  27  23   1  22  41  22  38  25  35  19  49 ...  35   0 ]
+==== cpu scan, power-of-two ====
+   elapsed time: 195.421ms    (std::chrono Measured)
+    [   0  45  54  81 104 105 127 168 190 228 253 288 307 ... -1006580612 -1006580577 ]
+==== cpu scan, non-power-of-two ====
+   elapsed time: 195.83ms    (std::chrono Measured)
+    [   0  45  54  81 104 105 127 168 190 228 253 288 307 ... -1006580641 -1006580637 ]
+    passed 
+==== naive scan, power-of-two ====
+   elapsed time: 178.568ms    (CUDA Measured)
+    passed 
+==== naive scan, non-power-of-two ====
+   elapsed time: 160.998ms    (CUDA Measured)
+    passed 
+==== work-efficient scan, power-of-two ====
+   elapsed time: 151.997ms    (CUDA Measured)
+    passed 
+==== work-efficient scan, non-power-of-two ====
+   elapsed time: 155.299ms    (CUDA Measured)
+    passed 
+==== thrust scan, power-of-two ====
+   elapsed time: 1000.16ms    (CUDA Measured)
+    passed 
+==== thrust scan, non-power-of-two ====
+   elapsed time: 999.244ms    (CUDA Measured)
+    passed 
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+    [   1   2   3   3   0   3   3   2   0   2   0   0   0 ...   3   0 ]
+==== cpu compact without scan, power-of-two ====
+   elapsed time: 342.001ms    (std::chrono Measured)
+    [   1   2   3   3   3   3   2   2   3   1   3   2   3 ...   2   3 ]
+    passed 
+==== cpu compact without scan, non-power-of-two ====
+   elapsed time: 342.025ms    (std::chrono Measured)
+    [   1   2   3   3   3   3   2   2   3   1   3   2   3 ...   1   1 ]
+    passed 
+==== cpu compact with scan ====
+   elapsed time: 1084.35ms    (std::chrono Measured)
+    [   1   2   3   3   3   3   2   2   3   1   3   2   3 ...   2   3 ]
+    passed 
+==== work-efficient compact, power-of-two ====
+   elapsed time: 218.355ms    (CUDA Measured)
+    passed 
+==== work-efficient compact, non-power-of-two ====
+   elapsed time: 170.299ms    (CUDA Measured)
+    passed 
+
+*****************************
+** RADIX SORT TESTS **
+*****************************
+    [  26  47  38  48  23  28  48  10   6   9  30  37  21 ...  17   0 ]
+==== cpu sort, power-of-two ====
+   elapsed time: 17055.4ms    (std::chrono Measured)
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...  49  49 ]
+==== cpu sort, non-power-of-two ====
+   elapsed time: 17048.5ms    (std::chrono Measured)
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...  49  49 ]
+==== gpu radix sort, power-of-two ====
+   elapsed time: 5650.03ms    (CUDA Measured)
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...  49  49 ]
+    passed 
+==== gpu radix sort, non-power-of-two ====
+   elapsed time: 5632.99ms    (CUDA Measured)
+    [   0   0   0   0   0   0   0   0   0   0   0   0   0 ...  49  49 ]
+    passed 
+```
+
+### Radix Sort GPU implementation
+
+As seen above in the above output, the GPU Radix sort implementation is significantly faster than the CPU implementation at larger n values. For further research, it should be compared with Quicksort, or some other famous CPU sorting algorithm.
diff --git a/analysis/.gitignore b/analysis/.gitignore
new file mode 100644
index 00000000..e1ebabf6
--- /dev/null
+++ b/analysis/.gitignore
@@ -0,0 +1,217 @@
+
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[codz]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#   Usually these files are written by a python script from a template
+#   before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py.cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+# Pipfile.lock
+
+# UV
+#   Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+# uv.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+# poetry.lock
+# poetry.toml
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#   pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
+#   https://pdm-project.org/en/latest/usage/project/#working-with-version-control
+# pdm.lock
+# pdm.toml
+.pdm-python
+.pdm-build/
+
+# pixi
+#   Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
+# pixi.lock
+#   Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
+#   in the .venv directory. It is recommended not to include this directory in version control.
+.pixi
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# Redis
+*.rdb
+*.aof
+*.pid
+
+# RabbitMQ
+mnesia/
+rabbitmq/
+rabbitmq-data/
+
+# ActiveMQ
+activemq-data/
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.envrc
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#   JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#   be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#   and can be added to the global gitignore or merged into this file.  For a more nuclear
+#   option (not recommended) you can uncomment the following to ignore the entire idea folder.
+# .idea/
+
+# Abstra
+#   Abstra is an AI-powered process automation framework.
+#   Ignore directories containing user credentials, local state, and settings.
+#   Learn more at https://abstra.io/docs
+.abstra/
+
+# Visual Studio Code
+#   Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore 
+#   that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
+#   and can be added to the global gitignore or merged into this file. However, if you prefer, 
+#   you could uncomment the following to ignore the entire vscode folder
+# .vscode/
+
+# Ruff stuff:
+.ruff_cache/
+
+# PyPI configuration file
+.pypirc
+
+# Marimo
+marimo/_static/
+marimo/_lsp/
+__marimo__/
+
+# Streamlit
+.streamlit/secrets.toml
diff --git a/analysis/.python-version b/analysis/.python-version
new file mode 100644
index 00000000..24ee5b1b
--- /dev/null
+++ b/analysis/.python-version
@@ -0,0 +1 @@
+3.13
diff --git a/analysis/.temp/.gitignore b/analysis/.temp/.gitignore
new file mode 100644
index 00000000..c96a04f0
--- /dev/null
+++ b/analysis/.temp/.gitignore
@@ -0,0 +1,2 @@
+*
+!.gitignore
\ No newline at end of file
diff --git a/analysis/README.md b/analysis/README.md
new file mode 100644
index 00000000..7e85b5e9
--- /dev/null
+++ b/analysis/README.md
@@ -0,0 +1,24 @@
+# Performance Analysis
+
+## Usage
+
+To get set up, first install `uv`. Sync dependencies with `uv sync`.
+Ensure that your build is also set up properly in the `build` folder.
+
+Then, to run a block size comparisons, with `analysis` as working directory, run:
+
+```sh
+uv run src/main.py block-sizes
+```
+
+Or, alternatively, for scan algorithm comparisons:
+
+```sh
+uv run src/main.py scan-comparison
+```
+
+## Behavior
+
+This module works by temporarily editing constants in source files, building, and running the executable. It modifies constants like `#define TEST_SCAN 1` to enable or disable testing certain features, and modifies others like `#define BLOCK_SIZE 256` to tweak performance.
+
+It is built to be extensible with more tests.
\ No newline at end of file
diff --git a/analysis/pyproject.toml b/analysis/pyproject.toml
new file mode 100644
index 00000000..3fef754d
--- /dev/null
+++ b/analysis/pyproject.toml
@@ -0,0 +1,12 @@
+[project]
+name = "analysis"
+version = "0.1.0"
+description = "Script module for automated performance analysis"
+readme = "../README.md"
+requires-python = ">=3.13"
+dependencies = []
+
+[dependency-groups]
+dev = [
+    "ruff>=0.13.0",
+]
diff --git a/analysis/reports/block_sizes.csv b/analysis/reports/block_sizes.csv
new file mode 100644
index 00000000..97f6080a
--- /dev/null
+++ b/analysis/reports/block_sizes.csv
@@ -0,0 +1,7 @@
+Block Size,naive scan power-of-two,naive scan non-power-of-two,work-efficient scan power-of-two,work-efficient scan non-power-of-two,thrust scan power-of-two,thrust scan non-power-of-two,work-efficient compact power-of-two,work-efficient compact non-power-of-two,gpu radix sort power-of-two,gpu radix sort non-power-of-two
+32,27.1336,29.3257,8.2609,8.15974,0.495776,0.458752,8.42829,8.41302,289.301,260.37
+64,9.58538,9.53488,5.1136,6.06589,0.508704,0.467456,7.1607,7.08707,274.922,241.055
+128,9.51766,9.53098,5.11418,6.37702,0.516,0.462784,7.1904,7.15181,284.143,241.419
+256,9.58464,9.5049,5.12038,6.18496,0.505568,0.470464,7.3313,7.11728,265.27,243.207
+512,9.5625,9.50042,5.13203,5.0752,0.529312,0.47776,7.17606,7.15402,279.336,242.66
+1024,9.67619,9.66211,5.15446,5.69139,0.520192,0.47696,7.20762,7.07779,261.759,242.782
diff --git a/analysis/reports/scan_comparison.csv b/analysis/reports/scan_comparison.csv
new file mode 100644
index 00000000..2a5ea867
--- /dev/null
+++ b/analysis/reports/scan_comparison.csv
@@ -0,0 +1,13 @@
+Array Size,cpu scan power-of-two,cpu scan non-power-of-two,naive scan power-of-two,naive scan non-power-of-two,work-efficient scan power-of-two,work-efficient scan non-power-of-two,thrust scan power-of-two,thrust scan non-power-of-two
+262144,0.095401,0.072708,0.208576,0.164832,0.168544,0.114688,0.134624,0.098784
+524288,0.252088,0.143222,0.288,0.252032,0.240096,0.19872,0.135648,0.09872
+1048576,0.44262,0.298326,0.618432,0.57776,0.376,0.337472,0.152736,0.12288
+2097152,0.874619,0.703374,1.17187,1.15363,0.802208,0.765952,0.180736,0.141792
+4194304,1.75208,1.65104,2.292,2.23654,1.43482,1.38976,0.223584,0.183936
+8388608,4.0572,3.81021,4.7168,4.67971,3.24781,3.1889,0.313536,0.270432
+16777216,9.00436,7.46798,9.77485,9.78221,6.61827,6.56816,0.507904,0.463072
+33554432,27.2172,14.6883,20.2924,20.2419,13.5155,13.4656,0.83968,0.829344
+67108864,29.4156,29.1289,42.564,41.746,30.3128,26.5994,1.84298,1.52134
+134217728,58.0058,57.9033,88.6921,87.4699,45.1035,44.8515,3.00957,2.92611
+268435456,119.227,116.488,184.853,183.239,92.9893,92.4521,5.71802,5.76416
+536870912,237.653,228.767,381.553,381.138,192.289,190.455,11.3378,11.3172
diff --git a/analysis/src/analyze.py b/analysis/src/analyze.py
new file mode 100644
index 00000000..bc1ef389
--- /dev/null
+++ b/analysis/src/analyze.py
@@ -0,0 +1,97 @@
+from collections import defaultdict
+
+import helpers
+from pathlib import Path
+
+
+def test_optimize_block_sizes():
+    block_size_and_runtime_by_algorithm: defaultdict[str, list[tuple[int, int]]] = (
+        defaultdict(list)
+    )
+
+    for block_size_exp in range(5, 11):
+        block_size = pow(2, block_size_exp)
+
+        results = helpers.test_with_params(
+            filename="temp",
+            parameters=helpers.BuildParameters(
+                enable_scan=True,
+                enable_compact=True,
+                enable_radix=True,
+                block_size=block_size,
+                array_size_pow=24,
+            ),
+        )
+
+        for algorithm, runtime in results:
+            block_size_and_runtime_by_algorithm[algorithm].append((block_size, runtime))
+
+    # Print CSV header
+    algorithms = list(block_size_and_runtime_by_algorithm.keys())
+    algorithms = [alg for alg in algorithms if "cpu" not in alg.lower()]
+
+    # Get all block sizes (assuming they're the same for all algorithms)
+    block_sizes = [
+        pair[0] for pair in block_size_and_runtime_by_algorithm[algorithms[0]]
+    ]
+
+    # Create reports directory if it doesn't exist
+    output_path = Path.cwd() / "reports/block_sizes.csv"
+    output_path.parent.mkdir(exist_ok=True)
+
+    with open(output_path, "w") as f:
+        # Write CSV header
+        f.write("Block Size," + ",".join(algorithms) + "\n")
+
+        # Print each row
+        for i, block_size in enumerate(block_sizes):
+            row = [str(block_size)]
+            for algorithm in algorithms:
+                runtime = block_size_and_runtime_by_algorithm[algorithm][i][1]
+                row.append(str(runtime))
+            f.write(",".join(row) + "\n")
+
+
+def compare_scan_implementations():
+    array_size_and_runtime_by_algorithm: defaultdict[str, list[tuple[int, int]]] = (
+        defaultdict(list)
+    )
+
+    for array_size_exp in range(18, 30):
+        array_size = pow(2, array_size_exp)
+
+        results = helpers.test_with_params(
+            filename="temp",
+            parameters=helpers.BuildParameters(
+                enable_scan=True,
+                enable_compact=False,
+                enable_radix=False,
+                block_size=128,
+                array_size_pow=array_size_exp,
+            ),
+        )
+
+        for algorithm, runtime in results:
+            array_size_and_runtime_by_algorithm[algorithm].append((array_size, runtime))
+
+    # Create reports directory if it doesn't exist
+    output_path = Path.cwd() / "reports/scan_comparison.csv"
+    output_path.parent.mkdir(exist_ok=True)
+
+    # Get algorithms and array sizes
+    algorithms = list(array_size_and_runtime_by_algorithm.keys())
+    array_sizes = [
+        pair[0] for pair in array_size_and_runtime_by_algorithm[algorithms[0]]
+    ]
+
+    with open(output_path, "w") as f:
+        # Write CSV header
+        f.write("Array Size," + ",".join(algorithms) + "\n")
+
+        # Write each row
+        for i, array_size in enumerate(array_sizes):
+            row = [str(array_size)]
+            for algorithm in algorithms:
+                runtime = array_size_and_runtime_by_algorithm[algorithm][i][1]
+                row.append(str(runtime))
+            f.write(",".join(row) + "\n")
diff --git a/analysis/src/helpers.py b/analysis/src/helpers.py
new file mode 100644
index 00000000..aa487fdb
--- /dev/null
+++ b/analysis/src/helpers.py
@@ -0,0 +1,106 @@
+from contextlib import contextmanager
+from dataclasses import dataclass
+from pathlib import Path
+import subprocess
+from typing import Iterator
+
+root_dir = Path.cwd().parent
+
+DEFAULT_BLOCK_SIZE = 256
+
+
+@dataclass
+class BuildParameters:
+    enable_scan: bool = False
+    enable_compact: bool = False
+    enable_radix: bool = False
+    block_size: int = 256
+    array_size_pow: int = 24
+
+
+BASE_PARAMETERS = BuildParameters(
+    enable_scan=True,
+    enable_compact=True,
+    enable_radix=True,
+    block_size=256,
+    array_size_pow=8,
+)
+
+
+def set_build_params(parameters: BuildParameters) -> None:
+    with open(root_dir / "src" / "main.cpp", "r+") as main_file:
+        content = main_file.read()
+        main_file.seek(0)
+        lines = content.split("\n")
+        for i, line in enumerate(lines):
+            if line.strip().startswith("#define TEST_SCAN"):
+                lines[i] = f"#define TEST_SCAN {1 if parameters.enable_scan else 0}"
+            elif line.strip().startswith("#define TEST_COMPACT"):
+                lines[i] = (
+                    f"#define TEST_COMPACT {1 if parameters.enable_compact else 0}"
+                )
+            elif line.strip().startswith("#define TEST_RADIX"):
+                lines[i] = f"#define TEST_RADIX {1 if parameters.enable_radix else 0}"
+            elif line.strip().startswith("const int SIZE ="):
+                lines[i] = f"const int SIZE = 1 << {parameters.array_size_pow};   // feel free to change the size of array"
+        main_file.write("\n".join(lines))
+        main_file.truncate()
+
+    # Update BLOCK_SIZE wherever it's defined
+    for filename in ("naive.cu", "radix.cu"):
+        filepath = root_dir / "stream_compaction" / filename
+        with open(filepath, "r+") as naive_file:
+            content = naive_file.read()
+            naive_file.seek(0)
+            lines = content.split("\n")
+            for i, line in enumerate(lines):
+                if line.strip().startswith("#define BLOCK_SIZE"):
+                    lines[i] = f"#define BLOCK_SIZE {parameters.block_size}"
+            naive_file.write("\n".join(lines))
+            naive_file.truncate()
+
+
+@contextmanager
+def use_build_params(parameters: BuildParameters) -> Iterator[None]:
+    set_build_params(parameters)
+    yield
+    set_build_params(BASE_PARAMETERS)
+
+
+def run_test_pipe_to_file(filename: str):
+    with open(root_dir / "analysis/.temp" / f"{filename}.log", "w") as log_file:
+        subprocess.call(
+            "./bin/cis5650_stream_compaction_test",
+            cwd=root_dir / "build",
+            stdout=log_file,
+            stderr=log_file,
+        )
+
+
+def parse_test_results(filename: str) -> list[tuple[str, float]]:
+    log_path = root_dir / "analysis/.temp" / f"{filename}.log"
+    results = []
+
+    with open(log_path, "r") as log_file:
+        for line in log_file:
+            line = line.strip()
+            if line.startswith("====") and line.endswith("===="):
+                method_name = line[4:-4].strip()
+                method_name = method_name.replace(",", "")
+            elif line.startswith("elapsed time:") and "ms" in line:
+                # Extract time from format "elapsed time: 0.174592ms"
+                time_part = line.split("elapsed time:")[1].strip()
+                time_str = time_part.split("ms")[0].strip()
+                elapsed_time = float(time_str)
+                results.append((method_name, elapsed_time))
+
+    return results
+
+
+def test_with_params(
+    *, filename: str, parameters: BuildParameters
+) -> list[tuple[str, float]]:
+    with use_build_params(parameters):
+        subprocess.call(["cmake", "--build", "."], cwd=root_dir / "build")
+    run_test_pipe_to_file(filename)
+    return parse_test_results(filename)
diff --git a/analysis/src/main.py b/analysis/src/main.py
new file mode 100644
index 00000000..91f8d29b
--- /dev/null
+++ b/analysis/src/main.py
@@ -0,0 +1,19 @@
+import sys
+import analyze
+
+
+def main():
+    match sys.argv[1].lower():
+        case "block-sizes":
+            print("running block size optimization analysis...")
+            analyze.test_optimize_block_sizes()
+            print("done! results stored to reports/block_sizes.csv.")
+
+        case "scan-comparison":
+            print("running scan implementation comparison analysis...")
+            analyze.compare_scan_implementations()
+            print("done! results stored to reports/scan_comparison.csv.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/analysis/uv.lock b/analysis/uv.lock
new file mode 100644
index 00000000..6e36b8d0
--- /dev/null
+++ b/analysis/uv.lock
@@ -0,0 +1,44 @@
+version = 1
+revision = 3
+requires-python = ">=3.13"
+
+[[package]]
+name = "analysis"
+version = "0.1.0"
+source = { virtual = "." }
+
+[package.dev-dependencies]
+dev = [
+    { name = "ruff" },
+]
+
+[package.metadata]
+
+[package.metadata.requires-dev]
+dev = [{ name = "ruff", specifier = ">=0.13.0" }]
+
+[[package]]
+name = "ruff"
+version = "0.13.0"
+source = { registry = "https://pypi.org/simple" }
+sdist = { url = "https://files.pythonhosted.org/packages/6e/1a/1f4b722862840295bcaba8c9e5261572347509548faaa99b2d57ee7bfe6a/ruff-0.13.0.tar.gz", hash = "sha256:5b4b1ee7eb35afae128ab94459b13b2baaed282b1fb0f472a73c82c996c8ae60", size = 5372863, upload-time = "2025-09-10T16:25:37.917Z" }
+wheels = [
+    { url = "https://files.pythonhosted.org/packages/ac/fe/6f87b419dbe166fd30a991390221f14c5b68946f389ea07913e1719741e0/ruff-0.13.0-py3-none-linux_armv6l.whl", hash = "sha256:137f3d65d58ee828ae136a12d1dc33d992773d8f7644bc6b82714570f31b2004", size = 12187826, upload-time = "2025-09-10T16:24:39.5Z" },
+    { url = "https://files.pythonhosted.org/packages/e4/25/c92296b1fc36d2499e12b74a3fdb230f77af7bdf048fad7b0a62e94ed56a/ruff-0.13.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:21ae48151b66e71fd111b7d79f9ad358814ed58c339631450c66a4be33cc28b9", size = 12933428, upload-time = "2025-09-10T16:24:43.866Z" },
+    { url = "https://files.pythonhosted.org/packages/44/cf/40bc7221a949470307d9c35b4ef5810c294e6cfa3caafb57d882731a9f42/ruff-0.13.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:64de45f4ca5441209e41742d527944635a05a6e7c05798904f39c85bafa819e3", size = 12095543, upload-time = "2025-09-10T16:24:46.638Z" },
+    { url = "https://files.pythonhosted.org/packages/f1/03/8b5ff2a211efb68c63a1d03d157e924997ada87d01bebffbd13a0f3fcdeb/ruff-0.13.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b2c653ae9b9d46e0ef62fc6fbf5b979bda20a0b1d2b22f8f7eb0cde9f4963b8", size = 12312489, upload-time = "2025-09-10T16:24:49.556Z" },
+    { url = "https://files.pythonhosted.org/packages/37/fc/2336ef6d5e9c8d8ea8305c5f91e767d795cd4fc171a6d97ef38a5302dadc/ruff-0.13.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4cec632534332062bc9eb5884a267b689085a1afea9801bf94e3ba7498a2d207", size = 11991631, upload-time = "2025-09-10T16:24:53.439Z" },
+    { url = "https://files.pythonhosted.org/packages/39/7f/f6d574d100fca83d32637d7f5541bea2f5e473c40020bbc7fc4a4d5b7294/ruff-0.13.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dcd628101d9f7d122e120ac7c17e0a0f468b19bc925501dbe03c1cb7f5415b24", size = 13720602, upload-time = "2025-09-10T16:24:56.392Z" },
+    { url = "https://files.pythonhosted.org/packages/fd/c8/a8a5b81d8729b5d1f663348d11e2a9d65a7a9bd3c399763b1a51c72be1ce/ruff-0.13.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:afe37db8e1466acb173bb2a39ca92df00570e0fd7c94c72d87b51b21bb63efea", size = 14697751, upload-time = "2025-09-10T16:24:59.89Z" },
+    { url = "https://files.pythonhosted.org/packages/57/f5/183ec292272ce7ec5e882aea74937f7288e88ecb500198b832c24debc6d3/ruff-0.13.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0f96a8d90bb258d7d3358b372905fe7333aaacf6c39e2408b9f8ba181f4b6ef2", size = 14095317, upload-time = "2025-09-10T16:25:03.025Z" },
+    { url = "https://files.pythonhosted.org/packages/9f/8d/7f9771c971724701af7926c14dab31754e7b303d127b0d3f01116faef456/ruff-0.13.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b5e3d883e4f924c5298e3f2ee0f3085819c14f68d1e5b6715597681433f153", size = 13144418, upload-time = "2025-09-10T16:25:06.272Z" },
+    { url = "https://files.pythonhosted.org/packages/a8/a6/7985ad1778e60922d4bef546688cd8a25822c58873e9ff30189cfe5dc4ab/ruff-0.13.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03447f3d18479df3d24917a92d768a89f873a7181a064858ea90a804a7538991", size = 13370843, upload-time = "2025-09-10T16:25:09.965Z" },
+    { url = "https://files.pythonhosted.org/packages/64/1c/bafdd5a7a05a50cc51d9f5711da704942d8dd62df3d8c70c311e98ce9f8a/ruff-0.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:fbc6b1934eb1c0033da427c805e27d164bb713f8e273a024a7e86176d7f462cf", size = 13321891, upload-time = "2025-09-10T16:25:12.969Z" },
+    { url = "https://files.pythonhosted.org/packages/bc/3e/7817f989cb9725ef7e8d2cee74186bf90555279e119de50c750c4b7a72fe/ruff-0.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a8ab6a3e03665d39d4a25ee199d207a488724f022db0e1fe4002968abdb8001b", size = 12119119, upload-time = "2025-09-10T16:25:16.621Z" },
+    { url = "https://files.pythonhosted.org/packages/58/07/9df080742e8d1080e60c426dce6e96a8faf9a371e2ce22eef662e3839c95/ruff-0.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d2a5c62f8ccc6dd2fe259917482de7275cecc86141ee10432727c4816235bc41", size = 11961594, upload-time = "2025-09-10T16:25:19.49Z" },
+    { url = "https://files.pythonhosted.org/packages/6a/f4/ae1185349197d26a2316840cb4d6c3fba61d4ac36ed728bf0228b222d71f/ruff-0.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:b7b85ca27aeeb1ab421bc787009831cffe6048faae08ad80867edab9f2760945", size = 12933377, upload-time = "2025-09-10T16:25:22.371Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/39/e776c10a3b349fc8209a905bfb327831d7516f6058339a613a8d2aaecacd/ruff-0.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:79ea0c44a3032af768cabfd9616e44c24303af49d633b43e3a5096e009ebe823", size = 13418555, upload-time = "2025-09-10T16:25:25.681Z" },
+    { url = "https://files.pythonhosted.org/packages/46/09/dca8df3d48e8b3f4202bf20b1658898e74b6442ac835bfe2c1816d926697/ruff-0.13.0-py3-none-win32.whl", hash = "sha256:4e473e8f0e6a04e4113f2e1de12a5039579892329ecc49958424e5568ef4f768", size = 12141613, upload-time = "2025-09-10T16:25:28.664Z" },
+    { url = "https://files.pythonhosted.org/packages/61/21/0647eb71ed99b888ad50e44d8ec65d7148babc0e242d531a499a0bbcda5f/ruff-0.13.0-py3-none-win_amd64.whl", hash = "sha256:48e5c25c7a3713eea9ce755995767f4dcd1b0b9599b638b12946e892123d1efb", size = 13258250, upload-time = "2025-09-10T16:25:31.773Z" },
+    { url = "https://files.pythonhosted.org/packages/e1/a3/03216a6a86c706df54422612981fb0f9041dbb452c3401501d4a22b942c9/ruff-0.13.0-py3-none-win_arm64.whl", hash = "sha256:ab80525317b1e1d38614addec8ac954f1b3e662de9d59114ecbf771d00cf613e", size = 12312357, upload-time = "2025-09-10T16:25:35.595Z" },
+]
diff --git a/img/graphs/block_sizes_all_normalized.png b/img/graphs/block_sizes_all_normalized.png
new file mode 100644
index 00000000..d4601b4f
Binary files /dev/null and b/img/graphs/block_sizes_all_normalized.png differ
diff --git a/img/graphs/block_sizes_compact.png b/img/graphs/block_sizes_compact.png
new file mode 100644
index 00000000..0c61184c
Binary files /dev/null and b/img/graphs/block_sizes_compact.png differ
diff --git a/img/graphs/block_sizes_radix.png b/img/graphs/block_sizes_radix.png
new file mode 100644
index 00000000..6f072fc6
Binary files /dev/null and b/img/graphs/block_sizes_radix.png differ
diff --git a/img/graphs/block_sizes_scan_naive.png b/img/graphs/block_sizes_scan_naive.png
new file mode 100644
index 00000000..f821b744
Binary files /dev/null and b/img/graphs/block_sizes_scan_naive.png differ
diff --git a/img/graphs/block_sizes_scan_thrust.png b/img/graphs/block_sizes_scan_thrust.png
new file mode 100644
index 00000000..99505e79
Binary files /dev/null and b/img/graphs/block_sizes_scan_thrust.png differ
diff --git a/img/graphs/block_sizes_scan_work_efficient.png b/img/graphs/block_sizes_scan_work_efficient.png
new file mode 100644
index 00000000..d076808c
Binary files /dev/null and b/img/graphs/block_sizes_scan_work_efficient.png differ
diff --git a/img/graphs/nsight_thrust.png b/img/graphs/nsight_thrust.png
new file mode 100644
index 00000000..c033b534
Binary files /dev/null and b/img/graphs/nsight_thrust.png differ
diff --git a/img/graphs/nsight_work_efficient.png b/img/graphs/nsight_work_efficient.png
new file mode 100644
index 00000000..d6337378
Binary files /dev/null and b/img/graphs/nsight_work_efficient.png differ
diff --git a/img/graphs/scan_performance_nonpow2.png b/img/graphs/scan_performance_nonpow2.png
new file mode 100644
index 00000000..2b5e27f6
Binary files /dev/null and b/img/graphs/scan_performance_nonpow2.png differ
diff --git a/img/graphs/scan_performance_pow2.png b/img/graphs/scan_performance_pow2.png
new file mode 100644
index 00000000..327652f2
Binary files /dev/null and b/img/graphs/scan_performance_pow2.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 3d5c8820..cb3e4758 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -6,94 +6,106 @@
  * @copyright University of Pennsylvania
  */
 
-#include <cstdio>
-#include <stream_compaction/cpu.h>
-#include <stream_compaction/naive.h>
-#include <stream_compaction/efficient.h>
-#include <stream_compaction/thrust.h>
 #include "testing_helpers.hpp"
-
-const int SIZE = 1 << 8; // feel free to change the size of array
+#include <cstdio>
+#include <stream_compaction/cpu.cuh>
+#include <stream_compaction/efficient.cuh>
+#include <stream_compaction/naive.cuh>
+#include <stream_compaction/radix.cuh>
+#include <stream_compaction/thrust.cuh>
+
+// Control which algorithms to test!
+#define TEST_SCAN 1
+#define TEST_COMPACT 1
+#define TEST_RADIX 1
+
+const int SIZE = 1 << 8;   // feel free to change the size of array
 const int NPOT = SIZE - 3; // Non-Power-Of-Two
-int *a = new int[SIZE];
-int *b = new int[SIZE];
-int *c = new int[SIZE];
+int *sourceData = new int[SIZE];
+int *referenceResult = new int[SIZE];
+int *referenceResultNPOT = new int[SIZE];
+int *result = new int[SIZE];
 
-int main(int argc, char* argv[]) {
+int main(int argresult, char *argv[]) {
     // Scan tests
 
+#if TEST_SCAN
+
     printf("\n");
     printf("****************\n");
     printf("** SCAN TESTS **\n");
     printf("****************\n");
 
-    genArray(SIZE - 1, a, 50);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
+    genArray(SIZE - 1, sourceData, 50); // Leave a 0 at the end to test that edge case
+    sourceData[SIZE - 1] = 0;
+    printArray(SIZE, sourceData, true);
 
     // initialize b using StreamCompaction::CPU::scan you implement
     // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct.
     // At first all cases passed because b && c are all zeroes.
-    zeroArray(SIZE, b);
+    zeroArray(SIZE, referenceResult);
     printDesc("cpu scan, power-of-two");
-    StreamCompaction::CPU::scan(SIZE, b, a);
+    StreamCompaction::CPU::scan(SIZE, referenceResult, sourceData);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(SIZE, b, true);
+    printArray(SIZE, referenceResult, true);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, referenceResultNPOT);
     printDesc("cpu scan, non-power-of-two");
-    StreamCompaction::CPU::scan(NPOT, c, a);
+    StreamCompaction::CPU::scan(NPOT, referenceResultNPOT, sourceData);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
+    printArray(NPOT, referenceResultNPOT, true);
+    printCmpResult(NPOT, referenceResultNPOT, referenceResultNPOT);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("naive scan, power-of-two");
-    StreamCompaction::Naive::scan(SIZE, c, a);
+    StreamCompaction::Naive::scan(SIZE, result, sourceData);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
+    // printArray(SIZE, result, true);
+    printCmpResult(SIZE, referenceResult, result);
 
     /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan
     onesArray(SIZE, c);
     printDesc("1s array for finding bugs");
-    StreamCompaction::Naive::scan(SIZE, c, a);
-    printArray(SIZE, c, true); */
+    StreamCompaction::Naive::scan(SIZE, result, a);
+    printArray(SIZE, result, true); */
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("naive scan, non-power-of-two");
-    StreamCompaction::Naive::scan(NPOT, c, a);
+    StreamCompaction::Naive::scan(NPOT, result, sourceData);
     printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(NPOT, b, c);
+    // printArray(SIZE, result, true);
+    printCmpResult(NPOT, referenceResultNPOT, result);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("work-efficient scan, power-of-two");
-    StreamCompaction::Efficient::scan(SIZE, c, a);
+    StreamCompaction::Efficient::scan(SIZE, result, sourceData);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
+    // printArray(SIZE, result, true);
+    printCmpResult(SIZE, referenceResult, result);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("work-efficient scan, non-power-of-two");
-    StreamCompaction::Efficient::scan(NPOT, c, a);
+    StreamCompaction::Efficient::scan(NPOT, result, sourceData);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
+    // printArray(NPOT, result, true);
+    printCmpResult(NPOT, referenceResultNPOT, result);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("thrust scan, power-of-two");
-    StreamCompaction::Thrust::scan(SIZE, c, a);
+    StreamCompaction::Thrust::scan(SIZE, result, sourceData);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(SIZE, c, true);
-    printCmpResult(SIZE, b, c);
+    // printArray(SIZE, result, true);
+    printCmpResult(SIZE, referenceResult, result);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("thrust scan, non-power-of-two");
-    StreamCompaction::Thrust::scan(NPOT, c, a);
+    StreamCompaction::Thrust::scan(NPOT, result, sourceData);
     printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(NPOT, c, true);
-    printCmpResult(NPOT, b, c);
+    // printArray(NPOT, result, true);
+    printCmpResult(NPOT, referenceResultNPOT, result);
+
+#endif
+#if TEST_COMPACT
 
     printf("\n");
     printf("*****************************\n");
@@ -102,53 +114,100 @@ int main(int argc, char* argv[]) {
 
     // Compaction tests
 
-    genArray(SIZE - 1, a, 4);  // Leave a 0 at the end to test that edge case
-    a[SIZE - 1] = 0;
-    printArray(SIZE, a, true);
+    genArray(SIZE - 1, sourceData, 4); // Leave a 0 at the end to test that edge case
+    sourceData[SIZE - 1] = 0;
+    printArray(SIZE, sourceData, true);
 
     int count, expectedCount, expectedNPOT;
 
     // initialize b using StreamCompaction::CPU::compactWithoutScan you implement
     // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct.
-    zeroArray(SIZE, b);
+    zeroArray(SIZE, referenceResult);
     printDesc("cpu compact without scan, power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a);
+    count = StreamCompaction::CPU::compactWithoutScan(SIZE, referenceResult, sourceData);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedCount = count;
-    printArray(count, b, true);
-    printCmpLenResult(count, expectedCount, b, b);
+    printArray(count, referenceResult, true);
+    printCmpLenResult(count, expectedCount, referenceResult, referenceResult);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, referenceResultNPOT);
     printDesc("cpu compact without scan, non-power-of-two");
-    count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a);
+    count = StreamCompaction::CPU::compactWithoutScan(NPOT, referenceResultNPOT, sourceData);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
     expectedNPOT = count;
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
+    printArray(count, referenceResultNPOT, true);
+    printCmpLenResult(count, expectedNPOT, referenceResultNPOT, referenceResultNPOT);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("cpu compact with scan");
-    count = StreamCompaction::CPU::compactWithScan(SIZE, c, a);
+    count = StreamCompaction::CPU::compactWithScan(SIZE, result, sourceData);
     printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)");
-    printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
+    printArray(count, result, true);
+    printCmpLenResult(count, expectedCount, referenceResult, result);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("work-efficient compact, power-of-two");
-    count = StreamCompaction::Efficient::compact(SIZE, c, a);
+    count = StreamCompaction::Efficient::compact(SIZE, result, sourceData);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedCount, b, c);
+    // printArray(count, result, true);
+    printCmpLenResult(count, expectedCount, referenceResult, result);
 
-    zeroArray(SIZE, c);
+    zeroArray(SIZE, result);
     printDesc("work-efficient compact, non-power-of-two");
-    count = StreamCompaction::Efficient::compact(NPOT, c, a);
+    count = StreamCompaction::Efficient::compact(NPOT, result, sourceData);
     printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
-    //printArray(count, c, true);
-    printCmpLenResult(count, expectedNPOT, b, c);
+    // printArray(count, result, true);
+    printCmpLenResult(count, expectedNPOT, referenceResultNPOT, result);
+
+#endif
+#if TEST_RADIX
+
+    printf("\n");
+    printf("*****************************\n");
+    printf("** RADIX SORT TESTS **\n");
+    printf("*****************************\n");
 
+    genArray(SIZE - 1, sourceData, 50); // Leave a 0 at the end to test that edge case
+    // a[SIZE - 1] = 0;
+    printArray(SIZE, sourceData, true);
+
+    zeroArray(SIZE, referenceResult);
+    printDesc("cpu sort, power-of-two");
+    StreamCompaction::Radix::cpu_sort(SIZE, referenceResult, sourceData);
+    printElapsedTime(StreamCompaction::Radix::timer().getCpuElapsedTimeForPreviousOperation(),
+                     "(std::chrono Measured)");
+    printArray(SIZE, referenceResult, true);
+
+    zeroArray(NPOT, result);
+    printDesc("cpu sort, non-power-of-two");
+    StreamCompaction::Radix::cpu_sort(NPOT, referenceResultNPOT, sourceData);
+    printElapsedTime(StreamCompaction::Radix::timer().getCpuElapsedTimeForPreviousOperation(),
+                     "(std::chrono Measured)");
+    printArray(NPOT, referenceResultNPOT, true);
+
+    zeroArray(SIZE, result);
+    printDesc("gpu radix sort, power-of-two");
+    StreamCompaction::Radix::sort(SIZE, result, sourceData);
+    printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(),
+                     "(CUDA Measured)");
+    printArray(SIZE, result, true);
+    printCmpResult(SIZE, referenceResult, result);
+
+    zeroArray(NPOT, result);
+    printDesc("gpu radix sort, non-power-of-two");
+    StreamCompaction::Radix::sort(NPOT, result, sourceData);
+    printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(),
+                     "(CUDA Measured)");
+    printArray(NPOT, result, true);
+    printCmpResult(NPOT, referenceResultNPOT, result);
+
+#endif
+
+#ifdef _WIN32
     system("pause"); // stop Win32 console from closing on exit
-    delete[] a;
-    delete[] b;
-    delete[] c;
+#endif
+    delete[] sourceData;
+    delete[] referenceResult;
+    delete[] referenceResultNPOT;
+    delete[] result;
 }
diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt
index 19511caa..3be7f7c8 100644
--- a/stream_compaction/CMakeLists.txt
+++ b/stream_compaction/CMakeLists.txt
@@ -1,9 +1,10 @@
 set(headers
-    "common.h"
-    "cpu.h"
-    "naive.h"
-    "efficient.h"
-    "thrust.h"
+    "common.cuh"
+    "cpu.cuh"
+    "naive.cuh"
+    "efficient.cuh"
+    "radix.cuh"
+    "thrust.cuh"
     )
 
 set(sources
@@ -11,6 +12,7 @@ set(sources
     "cpu.cu"
     "naive.cu"
     "efficient.cu"
+    "radix.cu"
     "thrust.cu"
     )
 
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d630..dca2e92e 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -1,4 +1,4 @@
-#include "common.h"
+#include "common.cuh"
 
 void checkCUDAErrorFn(const char *msg, const char *file, int line) {
     cudaError_t err = cudaGetLastError();
@@ -23,7 +23,12 @@ namespace StreamCompaction {
          * which map to 0 will be removed, and elements which map to 1 will be kept.
          */
         __global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
-            // TODO
+            int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (threadIndex >= n)
+                return;
+
+            // Cast to boolean to check against 0
+            bools[threadIndex] = (bool)idata[threadIndex];
         }
 
         /**
@@ -32,8 +37,37 @@ namespace StreamCompaction {
          */
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices) {
-            // TODO
+            int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (threadIndex >= n)
+                return;
+
+            if (bools[threadIndex]) {
+                odata[indices[threadIndex]] = idata[threadIndex];
+            }
         }
 
+        /**
+         * Similar to kernMapToBoolean, but the result (array of bools) is built from the kth bits
+         * of input array elements. Bits are 0-indexed, i.e. kth power of 2.
+         */
+        __global__ void kernMapToBit(int n, int *bools, const int *idata, int bit) {
+            int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (threadIndex >= n)
+                return;
+
+            int bitValue = (idata[threadIndex] >> bit) & 1;
+            bools[threadIndex] = bitValue;
+        }
+
+        /**
+         * Dead simple: inverts all elements of input array into output array.
+         */
+        __global__ void kernInvert(int n, int *bools) {
+            int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (threadIndex >= n)
+                return;
+
+            bools[threadIndex] = bools[threadIndex] ^ 1;
+        }
     }
 }
diff --git a/stream_compaction/common.h b/stream_compaction/common.cuh
similarity index 96%
rename from stream_compaction/common.h
rename to stream_compaction/common.cuh
index d2c1fed9..0b333c38 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.cuh
@@ -3,11 +3,9 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
 
+#include <chrono>
 #include <cstdio>
 #include <cstring>
-#include <cmath>
-#include <algorithm>
-#include <chrono>
 #include <stdexcept>
 
 #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
@@ -37,6 +35,10 @@ namespace StreamCompaction {
         __global__ void kernScatter(int n, int *odata,
                 const int *idata, const int *bools, const int *indices);
 
+        __global__ void kernMapToBit(int n, int *bools, const int *idata, int bit);
+
+        __global__ void kernInvert(int n, int *bools);
+
         /**
         * This class is used for timing the performance
         * Uncopyable and unmovable
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa115..6514e71e 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -1,7 +1,6 @@
-#include <cstdio>
-#include "cpu.h"
+#include "cpu.cuh"
 
-#include "common.h"
+#include "common.cuh"
 
 namespace StreamCompaction {
     namespace CPU {
@@ -19,7 +18,13 @@ namespace StreamCompaction {
          */
         void scan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+
+            // Ye olde iterative implementation
+            odata[0] = 0;
+            for (int i = 0; i < n; ++i) {
+                odata[i] = idata[i - 1] + odata[i - 1];
+            }
+
             timer().endCpuTimer();
         }
 
@@ -30,9 +35,19 @@ namespace StreamCompaction {
          */
         int compactWithoutScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+
+            // Sequentially add up all nonzero input numbers in output array
+            int matchingCount = 0;
+            for (int j = 0; j < n; ++j) {
+                if (idata[j] == 0)
+                    continue;
+
+                odata[matchingCount] = idata[j];
+                matchingCount++;
+            }
+
             timer().endCpuTimer();
-            return -1;
+            return matchingCount;
         }
 
         /**
@@ -42,9 +57,40 @@ namespace StreamCompaction {
          */
         int compactWithScan(int n, int *odata, const int *idata) {
             timer().startCpuTimer();
-            // TODO
+
+            int totalMatchingCount = 0;
+
+            // Populate temp array with 0 if input is 0, 1 otherwise
+            int *matchingMask = new int[n];
+            int *scannedIndices = new int[n];
+            for (int i = 0; i < n; ++i) {
+                if (idata[i]) {
+                    matchingMask[i] = 1;
+                    totalMatchingCount++;
+                } else {
+                    matchingMask[i] = 0;
+                }
+            }
+
+            // Run ye olde scan (without timer since we're already timing)
+            scannedIndices[0] = 0;
+            for (int i = 1; i < n; ++i) {
+                scannedIndices[i] = matchingMask[i - 1] + scannedIndices[i - 1];
+            }
+
+            // Pull from input using scanned indices
+            for (int i = 0; i < n; ++i) {
+                if (!matchingMask[i])
+                    continue;
+
+                odata[scannedIndices[i]] = idata[i];
+            }
+
+            delete[] matchingMask;
+            delete[] scannedIndices;
+
             timer().endCpuTimer();
-            return -1;
+            return totalMatchingCount;
         }
     }
 }
diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.cuh
similarity index 93%
rename from stream_compaction/cpu.h
rename to stream_compaction/cpu.cuh
index 873c0476..06aecb50 100644
--- a/stream_compaction/cpu.h
+++ b/stream_compaction/cpu.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "common.h"
+#include "common.cuh"
 
 namespace StreamCompaction {
     namespace CPU {
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346ee..f0d88da1 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -1,7 +1,11 @@
+#include "common.cuh"
+#include "efficient.cuh"
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include "common.h"
-#include "efficient.h"
+#include <vector>
+
+// this has gotta be a power of 2!
+#define BLOCK_SIZE 256
 
 namespace StreamCompaction {
     namespace Efficient {
@@ -12,15 +16,210 @@ namespace StreamCompaction {
             return timer;
         }
 
+        /**
+         * Perform segmented addition scan on an array, separated into blocks.
+         *
+         * @param N the number of total elements. Expected to be a power of 2.
+         * @param maxDepth should be ilog2ceil(N). This should just be precomputed.
+         * @param g_arrayToScan the full array, in global memory. This will be modified in-place.
+         *   This is expected to be of size N.
+         * @param g_blockTotalSums where the full sums of each block will be saved.
+         *   It is expected to be of size `N / blockDim.x`.
+         */
+        __global__ void kernExclusiveScanByBlocks(int N, int maxDepth, int *g_arrayToScan,
+                                                  int *g_blockTotalSums) {
+            // TODO: probably use shared memory to make this faster, instead of pulling from evil
+            // global world...
+
+            int blockStartIndex = (blockIdx.x * blockDim.x);
+            int localThreadIndex = threadIdx.x;
+            int globalThreadIndex = blockStartIndex + localThreadIndex;
+
+            // Do awesome upsweep in-place with increasing depth
+            for (int d = 0; d < maxDepth; ++d) {
+                int halfChunk = 1 << d;
+                int fullChunk = halfChunk << 1;
+
+                // Each layer gets blockSize >> (d + 1) operations
+                int numThreads = blockDim.x / fullChunk;
+
+                if (localThreadIndex < numThreads) {
+                    // K is global index of first element of "chunk" we're operating on
+                    int globalK = blockStartIndex + localThreadIndex * fullChunk;
+                    g_arrayToScan[globalK + fullChunk - 1] +=
+                        g_arrayToScan[globalK + halfChunk - 1];
+                }
+                __syncthreads();
+            }
+
+            // Save the last goober of each block for later, reset to 0 for down-sweep
+            if (threadIdx.x == BLOCK_SIZE - 1) {
+                g_blockTotalSums[blockIdx.x] = g_arrayToScan[globalThreadIndex];
+                g_arrayToScan[globalThreadIndex] = 0;
+            }
+            __syncthreads();
+
+            // Do awesome downsweep in-place with decreasing depth
+            for (int d = maxDepth - 1; d >= 0; --d) {
+                int halfChunk = 1 << d;
+                int fullChunk = halfChunk << 1;
+
+                // Each layer gets blockSize >> (d + 1) operations
+                int numThreads = blockDim.x / fullChunk;
+
+                if (localThreadIndex < numThreads) {
+                    // K is global index of first element of "chunk" we're operating on
+                    int globalK = blockStartIndex + localThreadIndex * fullChunk;
+
+                    // Copy right value, add left one in-place, then set left to copied value
+                    int oldRightValue = g_arrayToScan[globalK + fullChunk - 1];
+                    g_arrayToScan[globalK + fullChunk - 1] +=
+                        g_arrayToScan[globalK + halfChunk - 1];
+                    g_arrayToScan[globalK + halfChunk - 1] = oldRightValue;
+                }
+                __syncthreads();
+            }
+        }
+
+        /**
+         * Perform exclusive scan on a single block of data.
+         *
+         * @param N the number of elements in the block. Expected to be a power of 2.
+         * @param maxDepth should be ilog2ceil(N). This should just be precomputed.
+         * @param g_data the array to scan in global memory. This will be modified in-place.
+         */
+        __global__ void kernExclusiveScanOneBlock(int N, int maxDepth, int *g_arrayToScan) {
+            // TODO: probably use shared memory to make this faster, instead of pulling from evil
+            // global world...
+
+            int localThreadIndex = threadIdx.x;
+
+            // Do upsweep in-place with increasing depth
+            for (int d = 0; d < maxDepth; ++d) {
+                int halfChunk = 1 << d;
+                int fullChunk = halfChunk << 1;
+
+                // Each layer gets blockSize >> (d + 1) operations
+                int numThreads = blockDim.x / fullChunk;
+
+                if (localThreadIndex < numThreads) {
+                    // K is index of first element of "chunk" we're operating on
+                    int k = localThreadIndex * fullChunk;
+                    g_arrayToScan[k + fullChunk - 1] += g_arrayToScan[k + halfChunk - 1];
+                }
+                __syncthreads();
+            }
+
+            // Reset the last element to 0 for down-sweep
+            if (threadIdx.x == 0) {
+                g_arrayToScan[N - 1] = 0;
+            }
+            __syncthreads();
+
+            // Do downsweep in-place with decreasing depth
+            for (int d = maxDepth - 1; d >= 0; --d) {
+                int halfChunk = 1 << d;
+                int fullChunk = halfChunk << 1;
+
+                // Each layer gets blockSize >> (d + 1) operations
+                int numThreads = blockDim.x / fullChunk;
+
+                if (localThreadIndex < numThreads) {
+                    // K is index of first element of "chunk" we're operating on
+                    int k = localThreadIndex * fullChunk;
+
+                    // Copy right value, add left one in-place, then set left to copied value
+                    int oldRightValue = g_arrayToScan[k + fullChunk - 1];
+                    g_arrayToScan[k + fullChunk - 1] += g_arrayToScan[k + halfChunk - 1];
+                    g_arrayToScan[k + halfChunk - 1] = oldRightValue;
+                }
+                __syncthreads();
+            }
+        }
+
+        __global__ void kernAddChunkedSums(int N, int *g_chunkScannedArray, int *g_blockTotalSums) {
+            int globalThreadIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+            g_chunkScannedArray[globalThreadIndex] += g_blockTotalSums[blockIdx.x];
+        }
+
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
-        void scan(int n, int *odata, const int *idata) {
-            timer().startGpuTimer();
-            // TODO
-            timer().endGpuTimer();
+        void scan(int n, int *odata, const int *idata, bool startGpuTimer) {
+            int maxDepth = ilog2ceil(n);
+            int N = 1 << maxDepth;
+
+            // Pointers to all the extra scans needed when block size is smaller than N
+            std::vector<int *> dev_arrays;
+
+            // Create device mem pointers with decreasing sizes
+            int currentSize = N;
+            while (currentSize > BLOCK_SIZE) {
+                int *dev_array;
+                cudaMalloc(&dev_array, currentSize * sizeof(int));
+                dev_arrays.push_back(dev_array);
+                currentSize = (currentSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
+            }
+
+            // Allocate one just for the smallest, always occurs
+            int *dev_smallest_array;
+            cudaMalloc(&dev_smallest_array, currentSize * sizeof(int));
+            dev_arrays.push_back(dev_smallest_array);
+
+            // Copy input data to first device array
+            cudaMemcpy(dev_arrays[0], idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            // Fill rest of first array with zeros if n < N
+            if (n < N) {
+                cudaMemset(dev_arrays[0] + n, 0, (N - n) * sizeof(int));
+            }
+
+            if (startGpuTimer)
+                timer().startGpuTimer();
+
+            // Iterate through arrays from largest to smallest, performing scan on each level
+            currentSize = N;
+            for (int i = 0; i < dev_arrays.size() - 1; ++i) {
+                dim3 blocksPerGrid = (currentSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
+                kernExclusiveScanByBlocks<<<blocksPerGrid, BLOCK_SIZE>>>(
+                    currentSize, ilog2(currentSize), dev_arrays[i], dev_arrays[i + 1]);
+                checkCUDAError("kernExclusiveScanByBlocks failed!");
+                cudaDeviceSynchronize();
+
+                currentSize = (currentSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
+            }
+
+            // Run scan on smallest block of all time
+            kernExclusiveScanOneBlock<<<1, BLOCK_SIZE>>>(currentSize, ilog2(currentSize),
+                                                         dev_smallest_array);
+            checkCUDAError("kernExclusiveScanOneBlock failed!");
+            cudaDeviceSynchronize();
+
+            // Iterate back up through arrays from smallest to largest, adding chunk sums
+            for (int i = dev_arrays.size() - 2; i >= 0; --i) {
+                currentSize *= BLOCK_SIZE;
+
+                dim3 blocksPerGrid = (currentSize + BLOCK_SIZE - 1) / BLOCK_SIZE;
+                kernAddChunkedSums<<<blocksPerGrid, BLOCK_SIZE>>>(currentSize, dev_arrays[i],
+                                                                  dev_arrays[i + 1]);
+                checkCUDAError("kernAddChunkedSums failed!");
+                cudaDeviceSynchronize();
+            }
+
+            if (startGpuTimer)
+                timer().endGpuTimer();
+
+            // Copy result back to host
+            cudaMemcpy(odata, dev_arrays[0], n * sizeof(int), cudaMemcpyDeviceToHost);
+
+            // Deallocate all device pointers
+            for (int *devicePtr : dev_arrays) {
+                cudaFree(devicePtr);
+            }
         }
 
+        // by default: run GPU timer
+        void scan(int n, int *odata, const int *idata) { scan(n, odata, idata, true); }
+
         /**
          * Performs stream compaction on idata, storing the result into odata.
          * All zeroes are discarded.
@@ -31,10 +230,54 @@ namespace StreamCompaction {
          * @returns      The number of elements remaining after compaction.
          */
         int compact(int n, int *odata, const int *idata) {
+            dim3 blocksPerGrid = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+            int *dev_idata;
+            int *dev_bools;
+            int *dev_indices;
+            int *dev_odata;
+            cudaMalloc(&dev_idata, n * sizeof(int));
+            cudaMalloc(&dev_bools, n * sizeof(int));
+            cudaMalloc(&dev_indices, n * sizeof(int));
+            cudaMalloc(&dev_odata, n * sizeof(int));
+
+            // Copy stuffs over to GPU world
+            cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+            checkCUDAError("cudaMemcpy from idata to dev_idata failed!");
+
             timer().startGpuTimer();
-            // TODO
+
+            // Map our input numbers to bools
+            Common::kernMapToBoolean<<<blocksPerGrid, BLOCK_SIZE>>>(n, dev_bools, dev_idata);
+            cudaDeviceSynchronize();
+            checkCUDAError("kernMapToBoolean failed!");
+
+            // Scan em into dev_indices!!!
+            Efficient::scan(n, dev_indices, dev_bools, false);
+
+            // Now scatter into the output
+            Common::kernScatter<<<blocksPerGrid, BLOCK_SIZE>>>(n, dev_odata, dev_idata, dev_bools,
+                                                               dev_indices);
+            cudaDeviceSynchronize();
+            checkCUDAError("kernScatter failed!");
+
             timer().endGpuTimer();
-            return -1;
+
+            cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost);
+            checkCUDAError("cudaMemcpy from dev_odata to odata failed!");
+
+            // Get the count of compacted elements
+            int lastBool, lastIndex;
+            cudaMemcpy(&lastBool, &dev_bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+            cudaMemcpy(&lastIndex, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+            int matchingCount = lastIndex + lastBool;
+
+            cudaFree(dev_idata);
+            cudaFree(dev_bools);
+            cudaFree(dev_indices);
+            cudaFree(dev_odata);
+
+            return matchingCount;
         }
     }
 }
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.cuh
similarity index 72%
rename from stream_compaction/efficient.h
rename to stream_compaction/efficient.cuh
index 803cb4fe..dc38c1b7 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.cuh
@@ -1,11 +1,12 @@
 #pragma once
 
-#include "common.h"
+#include "common.cuh"
 
 namespace StreamCompaction {
     namespace Efficient {
         StreamCompaction::Common::PerformanceTimer& timer();
 
+        void scan(int n, int *odata, const int *idata, bool runGpuTimer);
         void scan(int n, int *odata, const int *idata);
 
         int compact(int n, int *odata, const int *idata);
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 43088769..9667e848 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -1,7 +1,9 @@
 #include <cuda.h>
 #include <cuda_runtime.h>
-#include "common.h"
-#include "naive.h"
+#include "common.cuh"
+#include "naive.cuh"
+
+#define BLOCK_SIZE 256
 
 namespace StreamCompaction {
     namespace Naive {
@@ -11,15 +13,77 @@ namespace StreamCompaction {
             static PerformanceTimer timer;
             return timer;
         }
-        // TODO: __global__
+
+        __global__ void kernShiftForExclusiveScan(int N, int *dest, int *src) {
+            int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (threadIndex >= N)
+                return;
+
+            // Set up first element as zero, shift everything else right
+            if (threadIndex == 0) {
+                dest[0] = 0;
+            } else if (threadIndex < N) {
+                dest[threadIndex] = src[threadIndex - 1];
+            }
+        }
+
+        __global__ void kernScanIteration(int N, int offset, int *dest, int *src) {
+            int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+            if (threadIndex >= N)
+                return;
+
+            if (threadIndex >= offset) {
+                dest[threadIndex] = src[threadIndex] + src[threadIndex - offset];
+            } else {
+                dest[threadIndex] = src[threadIndex];
+            }
+        }
 
         /**
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            dim3 blocksPerGrid = ((n + BLOCK_SIZE - 1) / BLOCK_SIZE);
+
+            int *dev_scanA;
+            int *dev_scanB;
+            cudaMalloc(&dev_scanA, sizeof(int) * n);
+            checkCUDAError("cudaMalloc dev_scanA failed!");
+            cudaMalloc(&dev_scanB, sizeof(int) * n);
+            checkCUDAError("cudaMalloc dev_scanB failed!");
+
+            // Send our input info over to the GPU
+            cudaMemcpy(dev_scanA, idata, sizeof(int) * n, cudaMemcpyHostToDevice);
+
             timer().startGpuTimer();
-            // TODO
+
+            // Shift things over by one index
+            kernShiftForExclusiveScan<<<blocksPerGrid, BLOCK_SIZE>>>(n, dev_scanB, dev_scanA);
+            checkCUDAError("kernShiftForExclusiveScan failed!");
+            cudaDeviceSynchronize();
+
+            int *src = dev_scanB;
+            int *dest = dev_scanA;
+
+            // Run scan iteratively
+            for (int offset = 1; offset < n; offset *= 2) {
+                kernScanIteration<<<blocksPerGrid, BLOCK_SIZE>>>(n, offset, dest, src);
+                checkCUDAError("kernScanIteration failed!");
+                cudaDeviceSynchronize();
+
+                // Swap src and dest pointers for next iteration
+                int *temp = src;
+                src = dest;
+                dest = temp;
+            }
+
             timer().endGpuTimer();
+
+            // Take our output data back to the CPU
+            cudaMemcpy(odata, src, sizeof(int) * n, cudaMemcpyDeviceToHost);
+
+            cudaFree(dev_scanA);
+            cudaFree(dev_scanB);
         }
     }
 }
diff --git a/stream_compaction/naive.h b/stream_compaction/naive.cuh
similarity index 89%
rename from stream_compaction/naive.h
rename to stream_compaction/naive.cuh
index 37dcb064..fbcd394b 100644
--- a/stream_compaction/naive.h
+++ b/stream_compaction/naive.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "common.h"
+#include "common.cuh"
 
 namespace StreamCompaction {
     namespace Naive {
diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu
new file mode 100644
index 00000000..f1376cb5
--- /dev/null
+++ b/stream_compaction/radix.cu
@@ -0,0 +1,151 @@
+#include "common.cuh"
+#include "efficient.cuh"
+#include "radix.cuh"
+
+#include <algorithm>
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#define BLOCK_SIZE 256
+
+namespace StreamCompaction {
+namespace Radix {
+
+using StreamCompaction::Common::PerformanceTimer;
+PerformanceTimer &timer() {
+    static PerformanceTimer timer;
+    return timer;
+}
+
+void cpu_sort(int n, int *odata, const int *idata) {
+    int *idata_mut = new int[n];
+    int *odata_mut = odata;
+
+    // copy
+    std::copy(idata, idata + n, idata_mut);
+
+    Radix::timer().startCpuTimer();
+
+    for (int bit = 0; bit < 32; ++bit) {
+        // for each bit, ping-pong
+        int currentIndex = 0;
+
+        for (int i = 0; i < n; ++i) {
+            int shouldPartitionRight = (idata_mut[i] >> bit) & 1;
+            if (shouldPartitionRight)
+                continue;
+
+            odata_mut[currentIndex] = idata_mut[i];
+            currentIndex++;
+        }
+
+        for (int i = 0; i < n; ++i) {
+            int shouldPartitionRight = (idata_mut[i] >> bit) & 1;
+            if (!shouldPartitionRight)
+                continue;
+
+            odata_mut[currentIndex] = idata_mut[i];
+            currentIndex++;
+        }
+
+        int *temp = idata_mut;
+        idata_mut = odata_mut;
+        odata_mut = temp;
+    }
+
+    // Copy final result to odata if pinged to the wrong pong
+    if (idata_mut != odata) {
+        std::copy(idata_mut, idata_mut + n, odata);
+    }
+
+    Radix::timer().endCpuTimer();
+
+    delete[] idata_mut;
+}
+
+/**
+ *  Runs custom scatter operation for radix sort partitioning.
+ *
+ * @param falseIndices  An exclusive scan of the `bools` array
+ * @param totalFalses   The total number of 1 bits in `bools`
+ */
+__global__ void kernCustomScatter(int n, int *odata, const int *idata, const int *falseBools,
+                                  const int *falseIndices, int totalFalses) {
+    int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (threadIndex >= n)
+        return;
+
+    int falseIndex = falseIndices[threadIndex];
+    int trueIndex = threadIndex - falseIndex + totalFalses;
+
+    int outputIndex = falseBools[threadIndex] ? falseIndex : trueIndex;
+    odata[outputIndex] = idata[threadIndex];
+}
+
+/**
+ * Performs radix sort on idata, storing the result into odata.
+ */
+void sort(int n, int *odata, const int *idata) {
+    dim3 blocksPerGrid = (n + BLOCK_SIZE - 1) / BLOCK_SIZE;
+
+    int *dev_input;
+    int *dev_output;
+    int *dev_falseBools;
+    int *dev_falseIndices;
+
+    cudaMalloc(&dev_input, n * sizeof(int));
+    cudaMalloc(&dev_output, n * sizeof(int));
+    cudaMalloc(&dev_falseBools, n * sizeof(int));
+    cudaMalloc(&dev_falseIndices, n * sizeof(int));
+
+    // Copy input data to device
+    cudaMemcpy(dev_input, idata, n * sizeof(int), cudaMemcpyHostToDevice);
+
+    timer().startGpuTimer();
+
+    // Process each bit position (32 bits for int)
+    for (int bit = 0; bit < 32; ++bit) {
+        // Map elements to boolean based on current bit
+        Common::kernMapToBit<<<blocksPerGrid, BLOCK_SIZE>>>(n, dev_falseBools, dev_input, bit);
+        cudaDeviceSynchronize();
+        checkCUDAError("kernMapToBit failed!");
+
+        // Flip all of our new booleans (this results in the f array!)
+        Common::kernInvert<<<blocksPerGrid, BLOCK_SIZE>>>(n, dev_falseBools);
+        cudaDeviceSynchronize();
+        checkCUDAError("kernInvert failed!");
+
+        // Scan the f array to get "false" scatter indices
+        Efficient::scan(n, dev_falseIndices, dev_falseBools, false);
+
+        // Grab total falses out of last array elements
+        int lastBool, lastFalseIndex;
+        cudaMemcpy(&lastBool, &dev_falseBools[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+        cudaMemcpy(&lastFalseIndex, &dev_falseIndices[n - 1], sizeof(int), cudaMemcpyDeviceToHost);
+        int totalFalses = lastBool + lastFalseIndex;
+
+        // Scatter elements based on bit value (0s first, then 1s)
+        Radix::kernCustomScatter<<<blocksPerGrid, BLOCK_SIZE>>>(
+            n, dev_output, dev_input, dev_falseBools, dev_falseIndices, totalFalses);
+        cudaDeviceSynchronize();
+        checkCUDAError("kernScatterRadix failed!");
+
+        // Swap input and output buffers for next iteration
+        int *temp = dev_input;
+        dev_input = dev_output;
+        dev_output = temp;
+    }
+
+    timer().endGpuTimer();
+
+    // Copy result back to host
+    cudaMemcpy(odata, dev_input, n * sizeof(int), cudaMemcpyDeviceToHost);
+
+    cudaFree(dev_input);
+    cudaFree(dev_output);
+    cudaFree(dev_falseBools);
+    cudaFree(dev_falseIndices);
+}
+
+} // namespace Radix
+} // namespace StreamCompaction
\ No newline at end of file
diff --git a/stream_compaction/radix.cuh b/stream_compaction/radix.cuh
new file mode 100644
index 00000000..a6e28b90
--- /dev/null
+++ b/stream_compaction/radix.cuh
@@ -0,0 +1,17 @@
+#pragma once
+
+#include <cuda.h>
+#include <cuda_runtime.h>
+
+#include "common.cuh"
+
+namespace StreamCompaction {
+namespace Radix {
+
+StreamCompaction::Common::PerformanceTimer &timer();
+
+void cpu_sort(int n, int *odata, const int *idata);
+void sort(int n, int *odata, const int *idata);
+
+} // namespace Radix
+} // namespace StreamCompaction
\ No newline at end of file
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e7..d00cc013 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -3,8 +3,8 @@
 #include <thrust/device_vector.h>
 #include <thrust/host_vector.h>
 #include <thrust/scan.h>
-#include "common.h"
-#include "thrust.h"
+#include "common.cuh"
+#include "thrust.cuh"
 
 namespace StreamCompaction {
     namespace Thrust {
@@ -18,11 +18,20 @@ namespace StreamCompaction {
          * Performs prefix-sum (aka scan) on idata, storing the result into odata.
          */
         void scan(int n, int *odata, const int *idata) {
+            // Copy data over to device vectors
+            thrust::device_vector<int> dv_in(idata, idata + n);
+            thrust::device_vector<int> dv_out(n);
+
             timer().startGpuTimer();
-            // TODO use `thrust::exclusive_scan`
-            // example: for device_vectors dv_in and dv_out:
-            // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+
+            // Run epic builtin exclusive scan
+            thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+            cudaDeviceSynchronize();
+
             timer().endGpuTimer();
+
+            // Copy output data back to host
+            thrust::copy(dv_out.begin(), dv_out.end(), odata);
         }
     }
 }
diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.cuh
similarity index 89%
rename from stream_compaction/thrust.h
rename to stream_compaction/thrust.cuh
index fe98206b..27f7bec8 100644
--- a/stream_compaction/thrust.h
+++ b/stream_compaction/thrust.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "common.h"
+#include "common.cuh"
 
 namespace StreamCompaction {
     namespace Thrust {