diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..c7a016af --- /dev/null +++ b/.clang-format @@ -0,0 +1,6 @@ +--- +Language: Cpp +IndentWidth: 4 +TabWidth: 4 +UseTab: Never +ColumnLimit: 100 diff --git a/.clangd b/.clangd new file mode 100644 index 00000000..8aefc26b --- /dev/null +++ b/.clangd @@ -0,0 +1,55 @@ +# Apply this config conditionally to all C files +If: + PathMatch: .*\.(c|h)$ +CompileFlags: + Compiler: /usr/bin/gcc + +--- + +# Apply this config conditionally to all C++ files +If: + PathMatch: .*\.(c|h)pp +CompileFlags: + Compiler: /usr/bin/g++ + +--- + +# Apply this config conditionally to all CUDA files +If: + PathMatch: .*\.cuh? +CompileFlags: + Compiler: /usr/local/cuda/bin/nvcc + +--- + +# Tweak the clangd parse settings for all files +CompileFlags: + Add: + # report all errors + - "-ferror-limit=0" + - "-I/usr/local/cuda/include/cccl" + Remove: + # strip CUDA fatbin args + - "-Xfatbin*" + # strip CUDA arch flags + - "-gencode*" + - "--generate-code*" + # strip CUDA flags unknown to clang + - "-ccbin*" + - "--compiler-options*" + - "--expt-extended-lambda" + - "--expt-relaxed-constexpr" + - "-forward-unknown-to-host-compiler" + - "-Werror=cross-execution-space-call" + - "-arch=native" + - "--options-file" + - "-G" + +Hover: + ShowAKA: No +InlayHints: + Enabled: No +Diagnostics: + Suppress: + - "variadic_device_fn" + - "attributes_not_allowed" diff --git a/.gitignore b/.gitignore index a59ec565..2cbb92eb 100644 --- a/.gitignore +++ b/.gitignore @@ -25,7 +25,8 @@ build .LSOverride # Icon must end with two \r -Icon +Icon + # Thumbnails ._* @@ -256,6 +257,11 @@ bld/ # Uncomment if you have tasks that create the project's static files in wwwroot #wwwroot/ +.vscode/* +!.vscode/launch.json +!.vscode/extensions.json +!.vscode/settings.json + # MSTest test Results [Tt]est[Rr]esult*/ [Bb]uild[Ll]og.* @@ -269,6 +275,9 @@ TestResult.xml [Rr]eleasePS/ dlldata.c +# Clangd cache +.cache/clangd + # DNX project.lock.json artifacts/ diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..d1ddfabd --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,25 @@ +{ + "$schema": "vscode://schemas/launch", + // Use IntelliSense to learn about possible attributes. + // Hover to view descriptions of existing attributes. + // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 + "version": "0.2.0", + "configurations": [ + { + "name": "CUDA C++: Launch", + "type": "cuda-gdb", + "request": "launch", + "environment": [ + {"name": "WAYLAND_DISPLAY", "value": ""}, + {"name": "XDG_SESSION_TYPE", "value": "x11"} + ], + "program": "${workspaceFolder}/build/bin/cis5650_stream_compaction_test", + "cwd": "${workspaceFolder}" + }, + { + "name": "CUDA C++: Attach", + "type": "cuda-gdb", + "request": "attach" + } + ] +} \ No newline at end of file diff --git a/.vscode/settings.json b/.vscode/settings.json new file mode 100644 index 00000000..ce16cbc6 --- /dev/null +++ b/.vscode/settings.json @@ -0,0 +1,15 @@ +{ + "files.associations": { + "*.cu": "cuda-cpp" + }, + "[cpp]": { + "editor.defaultFormatter": "llvm-vs-code-extensions.vscode-clangd" + }, + "[cuda-cpp]": { + "editor.defaultFormatter": "llvm-vs-code-extensions.vscode-clangd" + }, + "[python]": { + "editor.defaultFormatter": "charliermarsh.ruff", + "editor.tabSize": 4, + }, +} \ No newline at end of file diff --git a/README.md b/README.md index 0e38ddb1..ed343098 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,150 @@ CUDA Stream Compaction ====================== -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +![](img/graphs/scan_performance_nonpow2.png) -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +Q: (why is thrust so slow???) A: because I forgot to build binaries in release mode. -### (TODO: Your README) +**University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 2** -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +* Thomas Shaw + * [LinkedIn](https://www.linkedin.com/in/thomas-shaw-54468b222), [personal website](https://tlshaw.me), [GitHub](https://github.com/printer83mph), etc. +* Tested on: Fedora 42, Ryzen 7 5700x @ 4.67GHz, 32GB, RTX 2070 8GB + +## Features + +- CUDA exclusive scan and compaction implementations! +- Work-efficient algorithm speeds up scan even more! + - Uses segmented blocks running in-place at the same time without worry of race condition, with arbitrary size input... can be adapted to use shared memory! +- Faster scan than CPU!! +- GPU Radix Sort implementation + - (tested against CPU version in `main`) +- Python [analysis module](./analysis/README.md) for spitting out nice CSV files measuring performance + + +## Performance Analysis + +Python scripts have been created in `analysis/` for easier stat collection. The [README](./analysis/README.md) within provides info on how to run these. + +### Block Size Optimizations + +Let's take a look at the resulting performance from block size choices, all normalized: + +![](img/graphs/block_sizes_all_normalized.png) + +Performance seems to fluctuate differently per-algorithm between possible block sizes from 64 and 1024. + +See below all the different algorithms independently: +| | | | +|---|---|---| +| ![](img/graphs/block_sizes_scan_naive.png) Naive Scan | ![](img/graphs/block_sizes_scan_work_efficient.png) Work Efficient Scan | ![](img/graphs/block_sizes_scan_thrust.png) Thrust Scan | +| | ![](img/graphs/block_sizes_compact.png) Stream Compaction (Work-Efficient Scan) | ![](img/graphs/block_sizes_radix.png) Radix | + +It seems that the optimal block size for this machine is somewhere between 128 and 512, but that really depends on the algorithm. + +### Scan Implementation Comparisons + +See below the performance change over different array sizes, when aligned with powers of 2, and when not. + +![](img/graphs/scan_performance_nonpow2.png) + +![](img/graphs/scan_performance_pow2.png) + +Our Work-Efficient GPU Scan outperforms the Naive one at almost all array sizes. This is to be expected. + +The CPU eventually falls behind the work-efficient solution. + +It seems that Thrust is generally much faster than our solutions. What black magic are they working? + +### NSight analysis + +![](img/graphs/nsight_work_efficient.png) + +The above is our timeline view in NSight Systems. + +![](img/graphs/nsight_thrust.png) + +And the above is thrust. + +It seems they perform the entire operation inside a single kernel. They likely utilize shared memory access patterns, or somehow improve order-of-operations when it comes to memory access and computations. + +### Test output at n = 2^27 + +``` +**************** +** SCAN TESTS ** +**************** + [ 45 9 27 23 1 22 41 22 38 25 35 19 49 ... 35 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 195.421ms (std::chrono Measured) + [ 0 45 54 81 104 105 127 168 190 228 253 288 307 ... -1006580612 -1006580577 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 195.83ms (std::chrono Measured) + [ 0 45 54 81 104 105 127 168 190 228 253 288 307 ... -1006580641 -1006580637 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 178.568ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 160.998ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 151.997ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 155.299ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1000.16ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 999.244ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 1 2 3 3 0 3 3 2 0 2 0 0 0 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 342.001ms (std::chrono Measured) + [ 1 2 3 3 3 3 2 2 3 1 3 2 3 ... 2 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 342.025ms (std::chrono Measured) + [ 1 2 3 3 3 3 2 2 3 1 3 2 3 ... 1 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 1084.35ms (std::chrono Measured) + [ 1 2 3 3 3 3 2 2 3 1 3 2 3 ... 2 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 218.355ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 170.299ms (CUDA Measured) + passed + +***************************** +** RADIX SORT TESTS ** +***************************** + [ 26 47 38 48 23 28 48 10 6 9 30 37 21 ... 17 0 ] +==== cpu sort, power-of-two ==== + elapsed time: 17055.4ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +==== cpu sort, non-power-of-two ==== + elapsed time: 17048.5ms (std::chrono Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] +==== gpu radix sort, power-of-two ==== + elapsed time: 5650.03ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] + passed +==== gpu radix sort, non-power-of-two ==== + elapsed time: 5632.99ms (CUDA Measured) + [ 0 0 0 0 0 0 0 0 0 0 0 0 0 ... 49 49 ] + passed +``` + +### Radix Sort GPU implementation + +As seen above in the above output, the GPU Radix sort implementation is significantly faster than the CPU implementation at larger n values. For further research, it should be compared with Quicksort, or some other famous CPU sorting algorithm. diff --git a/analysis/.gitignore b/analysis/.gitignore new file mode 100644 index 00000000..e1ebabf6 --- /dev/null +++ b/analysis/.gitignore @@ -0,0 +1,217 @@ + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[codz] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py.cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +# Pipfile.lock + +# UV +# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# uv.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +# poetry.lock +# poetry.toml + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python. +# https://pdm-project.org/en/latest/usage/project/#working-with-version-control +# pdm.lock +# pdm.toml +.pdm-python +.pdm-build/ + +# pixi +# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control. +# pixi.lock +# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one +# in the .venv directory. It is recommended not to include this directory in version control. +.pixi + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# Redis +*.rdb +*.aof +*.pid + +# RabbitMQ +mnesia/ +rabbitmq/ +rabbitmq-data/ + +# ActiveMQ +activemq-data/ + +# SageMath parsed files +*.sage.py + +# Environments +.env +.envrc +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. +# .idea/ + +# Abstra +# Abstra is an AI-powered process automation framework. +# Ignore directories containing user credentials, local state, and settings. +# Learn more at https://abstra.io/docs +.abstra/ + +# Visual Studio Code +# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore +# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore +# and can be added to the global gitignore or merged into this file. However, if you prefer, +# you could uncomment the following to ignore the entire vscode folder +# .vscode/ + +# Ruff stuff: +.ruff_cache/ + +# PyPI configuration file +.pypirc + +# Marimo +marimo/_static/ +marimo/_lsp/ +__marimo__/ + +# Streamlit +.streamlit/secrets.toml diff --git a/analysis/.python-version b/analysis/.python-version new file mode 100644 index 00000000..24ee5b1b --- /dev/null +++ b/analysis/.python-version @@ -0,0 +1 @@ +3.13 diff --git a/analysis/.temp/.gitignore b/analysis/.temp/.gitignore new file mode 100644 index 00000000..c96a04f0 --- /dev/null +++ b/analysis/.temp/.gitignore @@ -0,0 +1,2 @@ +* +!.gitignore \ No newline at end of file diff --git a/analysis/README.md b/analysis/README.md new file mode 100644 index 00000000..7e85b5e9 --- /dev/null +++ b/analysis/README.md @@ -0,0 +1,24 @@ +# Performance Analysis + +## Usage + +To get set up, first install `uv`. Sync dependencies with `uv sync`. +Ensure that your build is also set up properly in the `build` folder. + +Then, to run a block size comparisons, with `analysis` as working directory, run: + +```sh +uv run src/main.py block-sizes +``` + +Or, alternatively, for scan algorithm comparisons: + +```sh +uv run src/main.py scan-comparison +``` + +## Behavior + +This module works by temporarily editing constants in source files, building, and running the executable. It modifies constants like `#define TEST_SCAN 1` to enable or disable testing certain features, and modifies others like `#define BLOCK_SIZE 256` to tweak performance. + +It is built to be extensible with more tests. \ No newline at end of file diff --git a/analysis/pyproject.toml b/analysis/pyproject.toml new file mode 100644 index 00000000..3fef754d --- /dev/null +++ b/analysis/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "analysis" +version = "0.1.0" +description = "Script module for automated performance analysis" +readme = "../README.md" +requires-python = ">=3.13" +dependencies = [] + +[dependency-groups] +dev = [ + "ruff>=0.13.0", +] diff --git a/analysis/reports/block_sizes.csv b/analysis/reports/block_sizes.csv new file mode 100644 index 00000000..97f6080a --- /dev/null +++ b/analysis/reports/block_sizes.csv @@ -0,0 +1,7 @@ +Block Size,naive scan power-of-two,naive scan non-power-of-two,work-efficient scan power-of-two,work-efficient scan non-power-of-two,thrust scan power-of-two,thrust scan non-power-of-two,work-efficient compact power-of-two,work-efficient compact non-power-of-two,gpu radix sort power-of-two,gpu radix sort non-power-of-two +32,27.1336,29.3257,8.2609,8.15974,0.495776,0.458752,8.42829,8.41302,289.301,260.37 +64,9.58538,9.53488,5.1136,6.06589,0.508704,0.467456,7.1607,7.08707,274.922,241.055 +128,9.51766,9.53098,5.11418,6.37702,0.516,0.462784,7.1904,7.15181,284.143,241.419 +256,9.58464,9.5049,5.12038,6.18496,0.505568,0.470464,7.3313,7.11728,265.27,243.207 +512,9.5625,9.50042,5.13203,5.0752,0.529312,0.47776,7.17606,7.15402,279.336,242.66 +1024,9.67619,9.66211,5.15446,5.69139,0.520192,0.47696,7.20762,7.07779,261.759,242.782 diff --git a/analysis/reports/scan_comparison.csv b/analysis/reports/scan_comparison.csv new file mode 100644 index 00000000..2a5ea867 --- /dev/null +++ b/analysis/reports/scan_comparison.csv @@ -0,0 +1,13 @@ +Array Size,cpu scan power-of-two,cpu scan non-power-of-two,naive scan power-of-two,naive scan non-power-of-two,work-efficient scan power-of-two,work-efficient scan non-power-of-two,thrust scan power-of-two,thrust scan non-power-of-two +262144,0.095401,0.072708,0.208576,0.164832,0.168544,0.114688,0.134624,0.098784 +524288,0.252088,0.143222,0.288,0.252032,0.240096,0.19872,0.135648,0.09872 +1048576,0.44262,0.298326,0.618432,0.57776,0.376,0.337472,0.152736,0.12288 +2097152,0.874619,0.703374,1.17187,1.15363,0.802208,0.765952,0.180736,0.141792 +4194304,1.75208,1.65104,2.292,2.23654,1.43482,1.38976,0.223584,0.183936 +8388608,4.0572,3.81021,4.7168,4.67971,3.24781,3.1889,0.313536,0.270432 +16777216,9.00436,7.46798,9.77485,9.78221,6.61827,6.56816,0.507904,0.463072 +33554432,27.2172,14.6883,20.2924,20.2419,13.5155,13.4656,0.83968,0.829344 +67108864,29.4156,29.1289,42.564,41.746,30.3128,26.5994,1.84298,1.52134 +134217728,58.0058,57.9033,88.6921,87.4699,45.1035,44.8515,3.00957,2.92611 +268435456,119.227,116.488,184.853,183.239,92.9893,92.4521,5.71802,5.76416 +536870912,237.653,228.767,381.553,381.138,192.289,190.455,11.3378,11.3172 diff --git a/analysis/src/analyze.py b/analysis/src/analyze.py new file mode 100644 index 00000000..bc1ef389 --- /dev/null +++ b/analysis/src/analyze.py @@ -0,0 +1,97 @@ +from collections import defaultdict + +import helpers +from pathlib import Path + + +def test_optimize_block_sizes(): + block_size_and_runtime_by_algorithm: defaultdict[str, list[tuple[int, int]]] = ( + defaultdict(list) + ) + + for block_size_exp in range(5, 11): + block_size = pow(2, block_size_exp) + + results = helpers.test_with_params( + filename="temp", + parameters=helpers.BuildParameters( + enable_scan=True, + enable_compact=True, + enable_radix=True, + block_size=block_size, + array_size_pow=24, + ), + ) + + for algorithm, runtime in results: + block_size_and_runtime_by_algorithm[algorithm].append((block_size, runtime)) + + # Print CSV header + algorithms = list(block_size_and_runtime_by_algorithm.keys()) + algorithms = [alg for alg in algorithms if "cpu" not in alg.lower()] + + # Get all block sizes (assuming they're the same for all algorithms) + block_sizes = [ + pair[0] for pair in block_size_and_runtime_by_algorithm[algorithms[0]] + ] + + # Create reports directory if it doesn't exist + output_path = Path.cwd() / "reports/block_sizes.csv" + output_path.parent.mkdir(exist_ok=True) + + with open(output_path, "w") as f: + # Write CSV header + f.write("Block Size," + ",".join(algorithms) + "\n") + + # Print each row + for i, block_size in enumerate(block_sizes): + row = [str(block_size)] + for algorithm in algorithms: + runtime = block_size_and_runtime_by_algorithm[algorithm][i][1] + row.append(str(runtime)) + f.write(",".join(row) + "\n") + + +def compare_scan_implementations(): + array_size_and_runtime_by_algorithm: defaultdict[str, list[tuple[int, int]]] = ( + defaultdict(list) + ) + + for array_size_exp in range(18, 30): + array_size = pow(2, array_size_exp) + + results = helpers.test_with_params( + filename="temp", + parameters=helpers.BuildParameters( + enable_scan=True, + enable_compact=False, + enable_radix=False, + block_size=128, + array_size_pow=array_size_exp, + ), + ) + + for algorithm, runtime in results: + array_size_and_runtime_by_algorithm[algorithm].append((array_size, runtime)) + + # Create reports directory if it doesn't exist + output_path = Path.cwd() / "reports/scan_comparison.csv" + output_path.parent.mkdir(exist_ok=True) + + # Get algorithms and array sizes + algorithms = list(array_size_and_runtime_by_algorithm.keys()) + array_sizes = [ + pair[0] for pair in array_size_and_runtime_by_algorithm[algorithms[0]] + ] + + with open(output_path, "w") as f: + # Write CSV header + f.write("Array Size," + ",".join(algorithms) + "\n") + + # Write each row + for i, array_size in enumerate(array_sizes): + row = [str(array_size)] + for algorithm in algorithms: + runtime = array_size_and_runtime_by_algorithm[algorithm][i][1] + row.append(str(runtime)) + f.write(",".join(row) + "\n") diff --git a/analysis/src/helpers.py b/analysis/src/helpers.py new file mode 100644 index 00000000..aa487fdb --- /dev/null +++ b/analysis/src/helpers.py @@ -0,0 +1,106 @@ +from contextlib import contextmanager +from dataclasses import dataclass +from pathlib import Path +import subprocess +from typing import Iterator + +root_dir = Path.cwd().parent + +DEFAULT_BLOCK_SIZE = 256 + + +@dataclass +class BuildParameters: + enable_scan: bool = False + enable_compact: bool = False + enable_radix: bool = False + block_size: int = 256 + array_size_pow: int = 24 + + +BASE_PARAMETERS = BuildParameters( + enable_scan=True, + enable_compact=True, + enable_radix=True, + block_size=256, + array_size_pow=8, +) + + +def set_build_params(parameters: BuildParameters) -> None: + with open(root_dir / "src" / "main.cpp", "r+") as main_file: + content = main_file.read() + main_file.seek(0) + lines = content.split("\n") + for i, line in enumerate(lines): + if line.strip().startswith("#define TEST_SCAN"): + lines[i] = f"#define TEST_SCAN {1 if parameters.enable_scan else 0}" + elif line.strip().startswith("#define TEST_COMPACT"): + lines[i] = ( + f"#define TEST_COMPACT {1 if parameters.enable_compact else 0}" + ) + elif line.strip().startswith("#define TEST_RADIX"): + lines[i] = f"#define TEST_RADIX {1 if parameters.enable_radix else 0}" + elif line.strip().startswith("const int SIZE ="): + lines[i] = f"const int SIZE = 1 << {parameters.array_size_pow}; // feel free to change the size of array" + main_file.write("\n".join(lines)) + main_file.truncate() + + # Update BLOCK_SIZE wherever it's defined + for filename in ("naive.cu", "radix.cu"): + filepath = root_dir / "stream_compaction" / filename + with open(filepath, "r+") as naive_file: + content = naive_file.read() + naive_file.seek(0) + lines = content.split("\n") + for i, line in enumerate(lines): + if line.strip().startswith("#define BLOCK_SIZE"): + lines[i] = f"#define BLOCK_SIZE {parameters.block_size}" + naive_file.write("\n".join(lines)) + naive_file.truncate() + + +@contextmanager +def use_build_params(parameters: BuildParameters) -> Iterator[None]: + set_build_params(parameters) + yield + set_build_params(BASE_PARAMETERS) + + +def run_test_pipe_to_file(filename: str): + with open(root_dir / "analysis/.temp" / f"{filename}.log", "w") as log_file: + subprocess.call( + "./bin/cis5650_stream_compaction_test", + cwd=root_dir / "build", + stdout=log_file, + stderr=log_file, + ) + + +def parse_test_results(filename: str) -> list[tuple[str, float]]: + log_path = root_dir / "analysis/.temp" / f"{filename}.log" + results = [] + + with open(log_path, "r") as log_file: + for line in log_file: + line = line.strip() + if line.startswith("====") and line.endswith("===="): + method_name = line[4:-4].strip() + method_name = method_name.replace(",", "") + elif line.startswith("elapsed time:") and "ms" in line: + # Extract time from format "elapsed time: 0.174592ms" + time_part = line.split("elapsed time:")[1].strip() + time_str = time_part.split("ms")[0].strip() + elapsed_time = float(time_str) + results.append((method_name, elapsed_time)) + + return results + + +def test_with_params( + *, filename: str, parameters: BuildParameters +) -> list[tuple[str, float]]: + with use_build_params(parameters): + subprocess.call(["cmake", "--build", "."], cwd=root_dir / "build") + run_test_pipe_to_file(filename) + return parse_test_results(filename) diff --git a/analysis/src/main.py b/analysis/src/main.py new file mode 100644 index 00000000..91f8d29b --- /dev/null +++ b/analysis/src/main.py @@ -0,0 +1,19 @@ +import sys +import analyze + + +def main(): + match sys.argv[1].lower(): + case "block-sizes": + print("running block size optimization analysis...") + analyze.test_optimize_block_sizes() + print("done! results stored to reports/block_sizes.csv.") + + case "scan-comparison": + print("running scan implementation comparison analysis...") + analyze.compare_scan_implementations() + print("done! results stored to reports/scan_comparison.csv.") + + +if __name__ == "__main__": + main() diff --git a/analysis/uv.lock b/analysis/uv.lock new file mode 100644 index 00000000..6e36b8d0 --- /dev/null +++ b/analysis/uv.lock @@ -0,0 +1,44 @@ +version = 1 +revision = 3 +requires-python = ">=3.13" + +[[package]] +name = "analysis" +version = "0.1.0" +source = { virtual = "." } + +[package.dev-dependencies] +dev = [ + { name = "ruff" }, +] + +[package.metadata] + +[package.metadata.requires-dev] +dev = [{ name = "ruff", specifier = ">=0.13.0" }] + +[[package]] +name = "ruff" +version = "0.13.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/6e/1a/1f4b722862840295bcaba8c9e5261572347509548faaa99b2d57ee7bfe6a/ruff-0.13.0.tar.gz", hash = "sha256:5b4b1ee7eb35afae128ab94459b13b2baaed282b1fb0f472a73c82c996c8ae60", size = 5372863, upload-time = "2025-09-10T16:25:37.917Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/ac/fe/6f87b419dbe166fd30a991390221f14c5b68946f389ea07913e1719741e0/ruff-0.13.0-py3-none-linux_armv6l.whl", hash = "sha256:137f3d65d58ee828ae136a12d1dc33d992773d8f7644bc6b82714570f31b2004", size = 12187826, upload-time = "2025-09-10T16:24:39.5Z" }, + { url = "https://files.pythonhosted.org/packages/e4/25/c92296b1fc36d2499e12b74a3fdb230f77af7bdf048fad7b0a62e94ed56a/ruff-0.13.0-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:21ae48151b66e71fd111b7d79f9ad358814ed58c339631450c66a4be33cc28b9", size = 12933428, upload-time = "2025-09-10T16:24:43.866Z" }, + { url = "https://files.pythonhosted.org/packages/44/cf/40bc7221a949470307d9c35b4ef5810c294e6cfa3caafb57d882731a9f42/ruff-0.13.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:64de45f4ca5441209e41742d527944635a05a6e7c05798904f39c85bafa819e3", size = 12095543, upload-time = "2025-09-10T16:24:46.638Z" }, + { url = "https://files.pythonhosted.org/packages/f1/03/8b5ff2a211efb68c63a1d03d157e924997ada87d01bebffbd13a0f3fcdeb/ruff-0.13.0-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2b2c653ae9b9d46e0ef62fc6fbf5b979bda20a0b1d2b22f8f7eb0cde9f4963b8", size = 12312489, upload-time = "2025-09-10T16:24:49.556Z" }, + { url = "https://files.pythonhosted.org/packages/37/fc/2336ef6d5e9c8d8ea8305c5f91e767d795cd4fc171a6d97ef38a5302dadc/ruff-0.13.0-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:4cec632534332062bc9eb5884a267b689085a1afea9801bf94e3ba7498a2d207", size = 11991631, upload-time = "2025-09-10T16:24:53.439Z" }, + { url = "https://files.pythonhosted.org/packages/39/7f/f6d574d100fca83d32637d7f5541bea2f5e473c40020bbc7fc4a4d5b7294/ruff-0.13.0-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dcd628101d9f7d122e120ac7c17e0a0f468b19bc925501dbe03c1cb7f5415b24", size = 13720602, upload-time = "2025-09-10T16:24:56.392Z" }, + { url = "https://files.pythonhosted.org/packages/fd/c8/a8a5b81d8729b5d1f663348d11e2a9d65a7a9bd3c399763b1a51c72be1ce/ruff-0.13.0-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:afe37db8e1466acb173bb2a39ca92df00570e0fd7c94c72d87b51b21bb63efea", size = 14697751, upload-time = "2025-09-10T16:24:59.89Z" }, + { url = "https://files.pythonhosted.org/packages/57/f5/183ec292272ce7ec5e882aea74937f7288e88ecb500198b832c24debc6d3/ruff-0.13.0-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0f96a8d90bb258d7d3358b372905fe7333aaacf6c39e2408b9f8ba181f4b6ef2", size = 14095317, upload-time = "2025-09-10T16:25:03.025Z" }, + { url = "https://files.pythonhosted.org/packages/9f/8d/7f9771c971724701af7926c14dab31754e7b303d127b0d3f01116faef456/ruff-0.13.0-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:94b5e3d883e4f924c5298e3f2ee0f3085819c14f68d1e5b6715597681433f153", size = 13144418, upload-time = "2025-09-10T16:25:06.272Z" }, + { url = "https://files.pythonhosted.org/packages/a8/a6/7985ad1778e60922d4bef546688cd8a25822c58873e9ff30189cfe5dc4ab/ruff-0.13.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:03447f3d18479df3d24917a92d768a89f873a7181a064858ea90a804a7538991", size = 13370843, upload-time = "2025-09-10T16:25:09.965Z" }, + { url = "https://files.pythonhosted.org/packages/64/1c/bafdd5a7a05a50cc51d9f5711da704942d8dd62df3d8c70c311e98ce9f8a/ruff-0.13.0-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:fbc6b1934eb1c0033da427c805e27d164bb713f8e273a024a7e86176d7f462cf", size = 13321891, upload-time = "2025-09-10T16:25:12.969Z" }, + { url = "https://files.pythonhosted.org/packages/bc/3e/7817f989cb9725ef7e8d2cee74186bf90555279e119de50c750c4b7a72fe/ruff-0.13.0-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:a8ab6a3e03665d39d4a25ee199d207a488724f022db0e1fe4002968abdb8001b", size = 12119119, upload-time = "2025-09-10T16:25:16.621Z" }, + { url = "https://files.pythonhosted.org/packages/58/07/9df080742e8d1080e60c426dce6e96a8faf9a371e2ce22eef662e3839c95/ruff-0.13.0-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d2a5c62f8ccc6dd2fe259917482de7275cecc86141ee10432727c4816235bc41", size = 11961594, upload-time = "2025-09-10T16:25:19.49Z" }, + { url = "https://files.pythonhosted.org/packages/6a/f4/ae1185349197d26a2316840cb4d6c3fba61d4ac36ed728bf0228b222d71f/ruff-0.13.0-py3-none-musllinux_1_2_i686.whl", hash = "sha256:b7b85ca27aeeb1ab421bc787009831cffe6048faae08ad80867edab9f2760945", size = 12933377, upload-time = "2025-09-10T16:25:22.371Z" }, + { url = "https://files.pythonhosted.org/packages/b6/39/e776c10a3b349fc8209a905bfb327831d7516f6058339a613a8d2aaecacd/ruff-0.13.0-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:79ea0c44a3032af768cabfd9616e44c24303af49d633b43e3a5096e009ebe823", size = 13418555, upload-time = "2025-09-10T16:25:25.681Z" }, + { url = "https://files.pythonhosted.org/packages/46/09/dca8df3d48e8b3f4202bf20b1658898e74b6442ac835bfe2c1816d926697/ruff-0.13.0-py3-none-win32.whl", hash = "sha256:4e473e8f0e6a04e4113f2e1de12a5039579892329ecc49958424e5568ef4f768", size = 12141613, upload-time = "2025-09-10T16:25:28.664Z" }, + { url = "https://files.pythonhosted.org/packages/61/21/0647eb71ed99b888ad50e44d8ec65d7148babc0e242d531a499a0bbcda5f/ruff-0.13.0-py3-none-win_amd64.whl", hash = "sha256:48e5c25c7a3713eea9ce755995767f4dcd1b0b9599b638b12946e892123d1efb", size = 13258250, upload-time = "2025-09-10T16:25:31.773Z" }, + { url = "https://files.pythonhosted.org/packages/e1/a3/03216a6a86c706df54422612981fb0f9041dbb452c3401501d4a22b942c9/ruff-0.13.0-py3-none-win_arm64.whl", hash = "sha256:ab80525317b1e1d38614addec8ac954f1b3e662de9d59114ecbf771d00cf613e", size = 12312357, upload-time = "2025-09-10T16:25:35.595Z" }, +] diff --git a/img/graphs/block_sizes_all_normalized.png b/img/graphs/block_sizes_all_normalized.png new file mode 100644 index 00000000..d4601b4f Binary files /dev/null and b/img/graphs/block_sizes_all_normalized.png differ diff --git a/img/graphs/block_sizes_compact.png b/img/graphs/block_sizes_compact.png new file mode 100644 index 00000000..0c61184c Binary files /dev/null and b/img/graphs/block_sizes_compact.png differ diff --git a/img/graphs/block_sizes_radix.png b/img/graphs/block_sizes_radix.png new file mode 100644 index 00000000..6f072fc6 Binary files /dev/null and b/img/graphs/block_sizes_radix.png differ diff --git a/img/graphs/block_sizes_scan_naive.png b/img/graphs/block_sizes_scan_naive.png new file mode 100644 index 00000000..f821b744 Binary files /dev/null and b/img/graphs/block_sizes_scan_naive.png differ diff --git a/img/graphs/block_sizes_scan_thrust.png b/img/graphs/block_sizes_scan_thrust.png new file mode 100644 index 00000000..99505e79 Binary files /dev/null and b/img/graphs/block_sizes_scan_thrust.png differ diff --git a/img/graphs/block_sizes_scan_work_efficient.png b/img/graphs/block_sizes_scan_work_efficient.png new file mode 100644 index 00000000..d076808c Binary files /dev/null and b/img/graphs/block_sizes_scan_work_efficient.png differ diff --git a/img/graphs/nsight_thrust.png b/img/graphs/nsight_thrust.png new file mode 100644 index 00000000..c033b534 Binary files /dev/null and b/img/graphs/nsight_thrust.png differ diff --git a/img/graphs/nsight_work_efficient.png b/img/graphs/nsight_work_efficient.png new file mode 100644 index 00000000..d6337378 Binary files /dev/null and b/img/graphs/nsight_work_efficient.png differ diff --git a/img/graphs/scan_performance_nonpow2.png b/img/graphs/scan_performance_nonpow2.png new file mode 100644 index 00000000..2b5e27f6 Binary files /dev/null and b/img/graphs/scan_performance_nonpow2.png differ diff --git a/img/graphs/scan_performance_pow2.png b/img/graphs/scan_performance_pow2.png new file mode 100644 index 00000000..327652f2 Binary files /dev/null and b/img/graphs/scan_performance_pow2.png differ diff --git a/src/main.cpp b/src/main.cpp index 3d5c8820..cb3e4758 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -6,94 +6,106 @@ * @copyright University of Pennsylvania */ -#include -#include -#include -#include -#include #include "testing_helpers.hpp" - -const int SIZE = 1 << 8; // feel free to change the size of array +#include +#include +#include +#include +#include +#include + +// Control which algorithms to test! +#define TEST_SCAN 1 +#define TEST_COMPACT 1 +#define TEST_RADIX 1 + +const int SIZE = 1 << 8; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; +int *sourceData = new int[SIZE]; +int *referenceResult = new int[SIZE]; +int *referenceResultNPOT = new int[SIZE]; +int *result = new int[SIZE]; -int main(int argc, char* argv[]) { +int main(int argresult, char *argv[]) { // Scan tests +#if TEST_SCAN + printf("\n"); printf("****************\n"); printf("** SCAN TESTS **\n"); printf("****************\n"); - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(SIZE - 1, sourceData, 50); // Leave a 0 at the end to test that edge case + sourceData[SIZE - 1] = 0; + printArray(SIZE, sourceData, true); // initialize b using StreamCompaction::CPU::scan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); + zeroArray(SIZE, referenceResult); printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); + StreamCompaction::CPU::scan(SIZE, referenceResult, sourceData); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); + printArray(SIZE, referenceResult, true); - zeroArray(SIZE, c); + zeroArray(SIZE, referenceResultNPOT); printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); + StreamCompaction::CPU::scan(NPOT, referenceResultNPOT, sourceData); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); + printArray(NPOT, referenceResultNPOT, true); + printCmpResult(NPOT, referenceResultNPOT, referenceResultNPOT); - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); + StreamCompaction::Naive::scan(SIZE, result, sourceData); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + // printArray(SIZE, result, true); + printCmpResult(SIZE, referenceResult, result); /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan onesArray(SIZE, c); printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ + StreamCompaction::Naive::scan(SIZE, result, a); + printArray(SIZE, result, true); */ - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); + StreamCompaction::Naive::scan(NPOT, result, sourceData); printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); + // printArray(SIZE, result, true); + printCmpResult(NPOT, referenceResultNPOT, result); - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); + StreamCompaction::Efficient::scan(SIZE, result, sourceData); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + // printArray(SIZE, result, true); + printCmpResult(SIZE, referenceResult, result); - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); + StreamCompaction::Efficient::scan(NPOT, result, sourceData); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); + // printArray(NPOT, result, true); + printCmpResult(NPOT, referenceResultNPOT, result); - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); + StreamCompaction::Thrust::scan(SIZE, result, sourceData); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); + // printArray(SIZE, result, true); + printCmpResult(SIZE, referenceResult, result); - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); + StreamCompaction::Thrust::scan(NPOT, result, sourceData); printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); + // printArray(NPOT, result, true); + printCmpResult(NPOT, referenceResultNPOT, result); + +#endif +#if TEST_COMPACT printf("\n"); printf("*****************************\n"); @@ -102,53 +114,100 @@ int main(int argc, char* argv[]) { // Compaction tests - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); + genArray(SIZE - 1, sourceData, 4); // Leave a 0 at the end to test that edge case + sourceData[SIZE - 1] = 0; + printArray(SIZE, sourceData, true); int count, expectedCount, expectedNPOT; // initialize b using StreamCompaction::CPU::compactWithoutScan you implement // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); + zeroArray(SIZE, referenceResult); printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); + count = StreamCompaction::CPU::compactWithoutScan(SIZE, referenceResult, sourceData); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); + printArray(count, referenceResult, true); + printCmpLenResult(count, expectedCount, referenceResult, referenceResult); - zeroArray(SIZE, c); + zeroArray(SIZE, referenceResultNPOT); printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); + count = StreamCompaction::CPU::compactWithoutScan(NPOT, referenceResultNPOT, sourceData); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + printArray(count, referenceResultNPOT, true); + printCmpLenResult(count, expectedNPOT, referenceResultNPOT, referenceResultNPOT); - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); + count = StreamCompaction::CPU::compactWithScan(SIZE, result, sourceData); printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + printArray(count, result, true); + printCmpLenResult(count, expectedCount, referenceResult, result); - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); + count = StreamCompaction::Efficient::compact(SIZE, result, sourceData); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); + // printArray(count, result, true); + printCmpLenResult(count, expectedCount, referenceResult, result); - zeroArray(SIZE, c); + zeroArray(SIZE, result); printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); + count = StreamCompaction::Efficient::compact(NPOT, result, sourceData); printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); + // printArray(count, result, true); + printCmpLenResult(count, expectedNPOT, referenceResultNPOT, result); + +#endif +#if TEST_RADIX + + printf("\n"); + printf("*****************************\n"); + printf("** RADIX SORT TESTS **\n"); + printf("*****************************\n"); + genArray(SIZE - 1, sourceData, 50); // Leave a 0 at the end to test that edge case + // a[SIZE - 1] = 0; + printArray(SIZE, sourceData, true); + + zeroArray(SIZE, referenceResult); + printDesc("cpu sort, power-of-two"); + StreamCompaction::Radix::cpu_sort(SIZE, referenceResult, sourceData); + printElapsedTime(StreamCompaction::Radix::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + printArray(SIZE, referenceResult, true); + + zeroArray(NPOT, result); + printDesc("cpu sort, non-power-of-two"); + StreamCompaction::Radix::cpu_sort(NPOT, referenceResultNPOT, sourceData); + printElapsedTime(StreamCompaction::Radix::timer().getCpuElapsedTimeForPreviousOperation(), + "(std::chrono Measured)"); + printArray(NPOT, referenceResultNPOT, true); + + zeroArray(SIZE, result); + printDesc("gpu radix sort, power-of-two"); + StreamCompaction::Radix::sort(SIZE, result, sourceData); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(SIZE, result, true); + printCmpResult(SIZE, referenceResult, result); + + zeroArray(NPOT, result); + printDesc("gpu radix sort, non-power-of-two"); + StreamCompaction::Radix::sort(NPOT, result, sourceData); + printElapsedTime(StreamCompaction::Radix::timer().getGpuElapsedTimeForPreviousOperation(), + "(CUDA Measured)"); + printArray(NPOT, result, true); + printCmpResult(NPOT, referenceResultNPOT, result); + +#endif + +#ifdef _WIN32 system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; +#endif + delete[] sourceData; + delete[] referenceResult; + delete[] referenceResultNPOT; + delete[] result; } diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511caa..3be7f7c8 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -1,9 +1,10 @@ set(headers - "common.h" - "cpu.h" - "naive.h" - "efficient.h" - "thrust.h" + "common.cuh" + "cpu.cuh" + "naive.cuh" + "efficient.cuh" + "radix.cuh" + "thrust.cuh" ) set(sources @@ -11,6 +12,7 @@ set(sources "cpu.cu" "naive.cu" "efficient.cu" + "radix.cu" "thrust.cu" ) diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..dca2e92e 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,4 +1,4 @@ -#include "common.h" +#include "common.cuh" void checkCUDAErrorFn(const char *msg, const char *file, int line) { cudaError_t err = cudaGetLastError(); @@ -23,7 +23,12 @@ namespace StreamCompaction { * which map to 0 will be removed, and elements which map to 1 will be kept. */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO + int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (threadIndex >= n) + return; + + // Cast to boolean to check against 0 + bools[threadIndex] = (bool)idata[threadIndex]; } /** @@ -32,8 +37,37 @@ namespace StreamCompaction { */ __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { - // TODO + int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (threadIndex >= n) + return; + + if (bools[threadIndex]) { + odata[indices[threadIndex]] = idata[threadIndex]; + } } + /** + * Similar to kernMapToBoolean, but the result (array of bools) is built from the kth bits + * of input array elements. Bits are 0-indexed, i.e. kth power of 2. + */ + __global__ void kernMapToBit(int n, int *bools, const int *idata, int bit) { + int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (threadIndex >= n) + return; + + int bitValue = (idata[threadIndex] >> bit) & 1; + bools[threadIndex] = bitValue; + } + + /** + * Dead simple: inverts all elements of input array into output array. + */ + __global__ void kernInvert(int n, int *bools) { + int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (threadIndex >= n) + return; + + bools[threadIndex] = bools[threadIndex] ^ 1; + } } } diff --git a/stream_compaction/common.h b/stream_compaction/common.cuh similarity index 96% rename from stream_compaction/common.h rename to stream_compaction/common.cuh index d2c1fed9..0b333c38 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.cuh @@ -3,11 +3,9 @@ #include #include +#include #include #include -#include -#include -#include #include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) @@ -37,6 +35,10 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices); + __global__ void kernMapToBit(int n, int *bools, const int *idata, int bit); + + __global__ void kernInvert(int n, int *bools); + /** * This class is used for timing the performance * Uncopyable and unmovable diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..6514e71e 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,7 +1,6 @@ -#include -#include "cpu.h" +#include "cpu.cuh" -#include "common.h" +#include "common.cuh" namespace StreamCompaction { namespace CPU { @@ -19,7 +18,13 @@ namespace StreamCompaction { */ void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + // Ye olde iterative implementation + odata[0] = 0; + for (int i = 0; i < n; ++i) { + odata[i] = idata[i - 1] + odata[i - 1]; + } + timer().endCpuTimer(); } @@ -30,9 +35,19 @@ namespace StreamCompaction { */ int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + // Sequentially add up all nonzero input numbers in output array + int matchingCount = 0; + for (int j = 0; j < n; ++j) { + if (idata[j] == 0) + continue; + + odata[matchingCount] = idata[j]; + matchingCount++; + } + timer().endCpuTimer(); - return -1; + return matchingCount; } /** @@ -42,9 +57,40 @@ namespace StreamCompaction { */ int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); - // TODO + + int totalMatchingCount = 0; + + // Populate temp array with 0 if input is 0, 1 otherwise + int *matchingMask = new int[n]; + int *scannedIndices = new int[n]; + for (int i = 0; i < n; ++i) { + if (idata[i]) { + matchingMask[i] = 1; + totalMatchingCount++; + } else { + matchingMask[i] = 0; + } + } + + // Run ye olde scan (without timer since we're already timing) + scannedIndices[0] = 0; + for (int i = 1; i < n; ++i) { + scannedIndices[i] = matchingMask[i - 1] + scannedIndices[i - 1]; + } + + // Pull from input using scanned indices + for (int i = 0; i < n; ++i) { + if (!matchingMask[i]) + continue; + + odata[scannedIndices[i]] = idata[i]; + } + + delete[] matchingMask; + delete[] scannedIndices; + timer().endCpuTimer(); - return -1; + return totalMatchingCount; } } } diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.cuh similarity index 93% rename from stream_compaction/cpu.h rename to stream_compaction/cpu.cuh index 873c0476..06aecb50 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.cuh @@ -1,6 +1,6 @@ #pragma once -#include "common.h" +#include "common.cuh" namespace StreamCompaction { namespace CPU { diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..f0d88da1 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -1,7 +1,11 @@ +#include "common.cuh" +#include "efficient.cuh" #include #include -#include "common.h" -#include "efficient.h" +#include + +// this has gotta be a power of 2! +#define BLOCK_SIZE 256 namespace StreamCompaction { namespace Efficient { @@ -12,15 +16,210 @@ namespace StreamCompaction { return timer; } + /** + * Perform segmented addition scan on an array, separated into blocks. + * + * @param N the number of total elements. Expected to be a power of 2. + * @param maxDepth should be ilog2ceil(N). This should just be precomputed. + * @param g_arrayToScan the full array, in global memory. This will be modified in-place. + * This is expected to be of size N. + * @param g_blockTotalSums where the full sums of each block will be saved. + * It is expected to be of size `N / blockDim.x`. + */ + __global__ void kernExclusiveScanByBlocks(int N, int maxDepth, int *g_arrayToScan, + int *g_blockTotalSums) { + // TODO: probably use shared memory to make this faster, instead of pulling from evil + // global world... + + int blockStartIndex = (blockIdx.x * blockDim.x); + int localThreadIndex = threadIdx.x; + int globalThreadIndex = blockStartIndex + localThreadIndex; + + // Do awesome upsweep in-place with increasing depth + for (int d = 0; d < maxDepth; ++d) { + int halfChunk = 1 << d; + int fullChunk = halfChunk << 1; + + // Each layer gets blockSize >> (d + 1) operations + int numThreads = blockDim.x / fullChunk; + + if (localThreadIndex < numThreads) { + // K is global index of first element of "chunk" we're operating on + int globalK = blockStartIndex + localThreadIndex * fullChunk; + g_arrayToScan[globalK + fullChunk - 1] += + g_arrayToScan[globalK + halfChunk - 1]; + } + __syncthreads(); + } + + // Save the last goober of each block for later, reset to 0 for down-sweep + if (threadIdx.x == BLOCK_SIZE - 1) { + g_blockTotalSums[blockIdx.x] = g_arrayToScan[globalThreadIndex]; + g_arrayToScan[globalThreadIndex] = 0; + } + __syncthreads(); + + // Do awesome downsweep in-place with decreasing depth + for (int d = maxDepth - 1; d >= 0; --d) { + int halfChunk = 1 << d; + int fullChunk = halfChunk << 1; + + // Each layer gets blockSize >> (d + 1) operations + int numThreads = blockDim.x / fullChunk; + + if (localThreadIndex < numThreads) { + // K is global index of first element of "chunk" we're operating on + int globalK = blockStartIndex + localThreadIndex * fullChunk; + + // Copy right value, add left one in-place, then set left to copied value + int oldRightValue = g_arrayToScan[globalK + fullChunk - 1]; + g_arrayToScan[globalK + fullChunk - 1] += + g_arrayToScan[globalK + halfChunk - 1]; + g_arrayToScan[globalK + halfChunk - 1] = oldRightValue; + } + __syncthreads(); + } + } + + /** + * Perform exclusive scan on a single block of data. + * + * @param N the number of elements in the block. Expected to be a power of 2. + * @param maxDepth should be ilog2ceil(N). This should just be precomputed. + * @param g_data the array to scan in global memory. This will be modified in-place. + */ + __global__ void kernExclusiveScanOneBlock(int N, int maxDepth, int *g_arrayToScan) { + // TODO: probably use shared memory to make this faster, instead of pulling from evil + // global world... + + int localThreadIndex = threadIdx.x; + + // Do upsweep in-place with increasing depth + for (int d = 0; d < maxDepth; ++d) { + int halfChunk = 1 << d; + int fullChunk = halfChunk << 1; + + // Each layer gets blockSize >> (d + 1) operations + int numThreads = blockDim.x / fullChunk; + + if (localThreadIndex < numThreads) { + // K is index of first element of "chunk" we're operating on + int k = localThreadIndex * fullChunk; + g_arrayToScan[k + fullChunk - 1] += g_arrayToScan[k + halfChunk - 1]; + } + __syncthreads(); + } + + // Reset the last element to 0 for down-sweep + if (threadIdx.x == 0) { + g_arrayToScan[N - 1] = 0; + } + __syncthreads(); + + // Do downsweep in-place with decreasing depth + for (int d = maxDepth - 1; d >= 0; --d) { + int halfChunk = 1 << d; + int fullChunk = halfChunk << 1; + + // Each layer gets blockSize >> (d + 1) operations + int numThreads = blockDim.x / fullChunk; + + if (localThreadIndex < numThreads) { + // K is index of first element of "chunk" we're operating on + int k = localThreadIndex * fullChunk; + + // Copy right value, add left one in-place, then set left to copied value + int oldRightValue = g_arrayToScan[k + fullChunk - 1]; + g_arrayToScan[k + fullChunk - 1] += g_arrayToScan[k + halfChunk - 1]; + g_arrayToScan[k + halfChunk - 1] = oldRightValue; + } + __syncthreads(); + } + } + + __global__ void kernAddChunkedSums(int N, int *g_chunkScannedArray, int *g_blockTotalSums) { + int globalThreadIndex = threadIdx.x + (blockIdx.x * blockDim.x); + g_chunkScannedArray[globalThreadIndex] += g_blockTotalSums[blockIdx.x]; + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); + void scan(int n, int *odata, const int *idata, bool startGpuTimer) { + int maxDepth = ilog2ceil(n); + int N = 1 << maxDepth; + + // Pointers to all the extra scans needed when block size is smaller than N + std::vector dev_arrays; + + // Create device mem pointers with decreasing sizes + int currentSize = N; + while (currentSize > BLOCK_SIZE) { + int *dev_array; + cudaMalloc(&dev_array, currentSize * sizeof(int)); + dev_arrays.push_back(dev_array); + currentSize = (currentSize + BLOCK_SIZE - 1) / BLOCK_SIZE; + } + + // Allocate one just for the smallest, always occurs + int *dev_smallest_array; + cudaMalloc(&dev_smallest_array, currentSize * sizeof(int)); + dev_arrays.push_back(dev_smallest_array); + + // Copy input data to first device array + cudaMemcpy(dev_arrays[0], idata, n * sizeof(int), cudaMemcpyHostToDevice); + // Fill rest of first array with zeros if n < N + if (n < N) { + cudaMemset(dev_arrays[0] + n, 0, (N - n) * sizeof(int)); + } + + if (startGpuTimer) + timer().startGpuTimer(); + + // Iterate through arrays from largest to smallest, performing scan on each level + currentSize = N; + for (int i = 0; i < dev_arrays.size() - 1; ++i) { + dim3 blocksPerGrid = (currentSize + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernExclusiveScanByBlocks<<>>( + currentSize, ilog2(currentSize), dev_arrays[i], dev_arrays[i + 1]); + checkCUDAError("kernExclusiveScanByBlocks failed!"); + cudaDeviceSynchronize(); + + currentSize = (currentSize + BLOCK_SIZE - 1) / BLOCK_SIZE; + } + + // Run scan on smallest block of all time + kernExclusiveScanOneBlock<<<1, BLOCK_SIZE>>>(currentSize, ilog2(currentSize), + dev_smallest_array); + checkCUDAError("kernExclusiveScanOneBlock failed!"); + cudaDeviceSynchronize(); + + // Iterate back up through arrays from smallest to largest, adding chunk sums + for (int i = dev_arrays.size() - 2; i >= 0; --i) { + currentSize *= BLOCK_SIZE; + + dim3 blocksPerGrid = (currentSize + BLOCK_SIZE - 1) / BLOCK_SIZE; + kernAddChunkedSums<<>>(currentSize, dev_arrays[i], + dev_arrays[i + 1]); + checkCUDAError("kernAddChunkedSums failed!"); + cudaDeviceSynchronize(); + } + + if (startGpuTimer) + timer().endGpuTimer(); + + // Copy result back to host + cudaMemcpy(odata, dev_arrays[0], n * sizeof(int), cudaMemcpyDeviceToHost); + + // Deallocate all device pointers + for (int *devicePtr : dev_arrays) { + cudaFree(devicePtr); + } } + // by default: run GPU timer + void scan(int n, int *odata, const int *idata) { scan(n, odata, idata, true); } + /** * Performs stream compaction on idata, storing the result into odata. * All zeroes are discarded. @@ -31,10 +230,54 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { + dim3 blocksPerGrid = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + + int *dev_idata; + int *dev_bools; + int *dev_indices; + int *dev_odata; + cudaMalloc(&dev_idata, n * sizeof(int)); + cudaMalloc(&dev_bools, n * sizeof(int)); + cudaMalloc(&dev_indices, n * sizeof(int)); + cudaMalloc(&dev_odata, n * sizeof(int)); + + // Copy stuffs over to GPU world + cudaMemcpy(dev_idata, idata, n * sizeof(int), cudaMemcpyHostToDevice); + checkCUDAError("cudaMemcpy from idata to dev_idata failed!"); + timer().startGpuTimer(); - // TODO + + // Map our input numbers to bools + Common::kernMapToBoolean<<>>(n, dev_bools, dev_idata); + cudaDeviceSynchronize(); + checkCUDAError("kernMapToBoolean failed!"); + + // Scan em into dev_indices!!! + Efficient::scan(n, dev_indices, dev_bools, false); + + // Now scatter into the output + Common::kernScatter<<>>(n, dev_odata, dev_idata, dev_bools, + dev_indices); + cudaDeviceSynchronize(); + checkCUDAError("kernScatter failed!"); + timer().endGpuTimer(); - return -1; + + cudaMemcpy(odata, dev_odata, n * sizeof(int), cudaMemcpyDeviceToHost); + checkCUDAError("cudaMemcpy from dev_odata to odata failed!"); + + // Get the count of compacted elements + int lastBool, lastIndex; + cudaMemcpy(&lastBool, &dev_bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastIndex, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + int matchingCount = lastIndex + lastBool; + + cudaFree(dev_idata); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_odata); + + return matchingCount; } } } diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.cuh similarity index 72% rename from stream_compaction/efficient.h rename to stream_compaction/efficient.cuh index 803cb4fe..dc38c1b7 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.cuh @@ -1,11 +1,12 @@ #pragma once -#include "common.h" +#include "common.cuh" namespace StreamCompaction { namespace Efficient { StreamCompaction::Common::PerformanceTimer& timer(); + void scan(int n, int *odata, const int *idata, bool runGpuTimer); void scan(int n, int *odata, const int *idata); int compact(int n, int *odata, const int *idata); diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..9667e848 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -1,7 +1,9 @@ #include #include -#include "common.h" -#include "naive.h" +#include "common.cuh" +#include "naive.cuh" + +#define BLOCK_SIZE 256 namespace StreamCompaction { namespace Naive { @@ -11,15 +13,77 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } - // TODO: __global__ + + __global__ void kernShiftForExclusiveScan(int N, int *dest, int *src) { + int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (threadIndex >= N) + return; + + // Set up first element as zero, shift everything else right + if (threadIndex == 0) { + dest[0] = 0; + } else if (threadIndex < N) { + dest[threadIndex] = src[threadIndex - 1]; + } + } + + __global__ void kernScanIteration(int N, int offset, int *dest, int *src) { + int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (threadIndex >= N) + return; + + if (threadIndex >= offset) { + dest[threadIndex] = src[threadIndex] + src[threadIndex - offset]; + } else { + dest[threadIndex] = src[threadIndex]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + dim3 blocksPerGrid = ((n + BLOCK_SIZE - 1) / BLOCK_SIZE); + + int *dev_scanA; + int *dev_scanB; + cudaMalloc(&dev_scanA, sizeof(int) * n); + checkCUDAError("cudaMalloc dev_scanA failed!"); + cudaMalloc(&dev_scanB, sizeof(int) * n); + checkCUDAError("cudaMalloc dev_scanB failed!"); + + // Send our input info over to the GPU + cudaMemcpy(dev_scanA, idata, sizeof(int) * n, cudaMemcpyHostToDevice); + timer().startGpuTimer(); - // TODO + + // Shift things over by one index + kernShiftForExclusiveScan<<>>(n, dev_scanB, dev_scanA); + checkCUDAError("kernShiftForExclusiveScan failed!"); + cudaDeviceSynchronize(); + + int *src = dev_scanB; + int *dest = dev_scanA; + + // Run scan iteratively + for (int offset = 1; offset < n; offset *= 2) { + kernScanIteration<<>>(n, offset, dest, src); + checkCUDAError("kernScanIteration failed!"); + cudaDeviceSynchronize(); + + // Swap src and dest pointers for next iteration + int *temp = src; + src = dest; + dest = temp; + } + timer().endGpuTimer(); + + // Take our output data back to the CPU + cudaMemcpy(odata, src, sizeof(int) * n, cudaMemcpyDeviceToHost); + + cudaFree(dev_scanA); + cudaFree(dev_scanB); } } } diff --git a/stream_compaction/naive.h b/stream_compaction/naive.cuh similarity index 89% rename from stream_compaction/naive.h rename to stream_compaction/naive.cuh index 37dcb064..fbcd394b 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.cuh @@ -1,6 +1,6 @@ #pragma once -#include "common.h" +#include "common.cuh" namespace StreamCompaction { namespace Naive { diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 00000000..f1376cb5 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,151 @@ +#include "common.cuh" +#include "efficient.cuh" +#include "radix.cuh" + +#include +#include +#include + +#define BLOCK_SIZE 256 + +namespace StreamCompaction { +namespace Radix { + +using StreamCompaction::Common::PerformanceTimer; +PerformanceTimer &timer() { + static PerformanceTimer timer; + return timer; +} + +void cpu_sort(int n, int *odata, const int *idata) { + int *idata_mut = new int[n]; + int *odata_mut = odata; + + // copy + std::copy(idata, idata + n, idata_mut); + + Radix::timer().startCpuTimer(); + + for (int bit = 0; bit < 32; ++bit) { + // for each bit, ping-pong + int currentIndex = 0; + + for (int i = 0; i < n; ++i) { + int shouldPartitionRight = (idata_mut[i] >> bit) & 1; + if (shouldPartitionRight) + continue; + + odata_mut[currentIndex] = idata_mut[i]; + currentIndex++; + } + + for (int i = 0; i < n; ++i) { + int shouldPartitionRight = (idata_mut[i] >> bit) & 1; + if (!shouldPartitionRight) + continue; + + odata_mut[currentIndex] = idata_mut[i]; + currentIndex++; + } + + int *temp = idata_mut; + idata_mut = odata_mut; + odata_mut = temp; + } + + // Copy final result to odata if pinged to the wrong pong + if (idata_mut != odata) { + std::copy(idata_mut, idata_mut + n, odata); + } + + Radix::timer().endCpuTimer(); + + delete[] idata_mut; +} + +/** + * Runs custom scatter operation for radix sort partitioning. + * + * @param falseIndices An exclusive scan of the `bools` array + * @param totalFalses The total number of 1 bits in `bools` + */ +__global__ void kernCustomScatter(int n, int *odata, const int *idata, const int *falseBools, + const int *falseIndices, int totalFalses) { + int threadIndex = threadIdx.x + (blockIdx.x * blockDim.x); + if (threadIndex >= n) + return; + + int falseIndex = falseIndices[threadIndex]; + int trueIndex = threadIndex - falseIndex + totalFalses; + + int outputIndex = falseBools[threadIndex] ? falseIndex : trueIndex; + odata[outputIndex] = idata[threadIndex]; +} + +/** + * Performs radix sort on idata, storing the result into odata. + */ +void sort(int n, int *odata, const int *idata) { + dim3 blocksPerGrid = (n + BLOCK_SIZE - 1) / BLOCK_SIZE; + + int *dev_input; + int *dev_output; + int *dev_falseBools; + int *dev_falseIndices; + + cudaMalloc(&dev_input, n * sizeof(int)); + cudaMalloc(&dev_output, n * sizeof(int)); + cudaMalloc(&dev_falseBools, n * sizeof(int)); + cudaMalloc(&dev_falseIndices, n * sizeof(int)); + + // Copy input data to device + cudaMemcpy(dev_input, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + // Process each bit position (32 bits for int) + for (int bit = 0; bit < 32; ++bit) { + // Map elements to boolean based on current bit + Common::kernMapToBit<<>>(n, dev_falseBools, dev_input, bit); + cudaDeviceSynchronize(); + checkCUDAError("kernMapToBit failed!"); + + // Flip all of our new booleans (this results in the f array!) + Common::kernInvert<<>>(n, dev_falseBools); + cudaDeviceSynchronize(); + checkCUDAError("kernInvert failed!"); + + // Scan the f array to get "false" scatter indices + Efficient::scan(n, dev_falseIndices, dev_falseBools, false); + + // Grab total falses out of last array elements + int lastBool, lastFalseIndex; + cudaMemcpy(&lastBool, &dev_falseBools[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&lastFalseIndex, &dev_falseIndices[n - 1], sizeof(int), cudaMemcpyDeviceToHost); + int totalFalses = lastBool + lastFalseIndex; + + // Scatter elements based on bit value (0s first, then 1s) + Radix::kernCustomScatter<<>>( + n, dev_output, dev_input, dev_falseBools, dev_falseIndices, totalFalses); + cudaDeviceSynchronize(); + checkCUDAError("kernScatterRadix failed!"); + + // Swap input and output buffers for next iteration + int *temp = dev_input; + dev_input = dev_output; + dev_output = temp; + } + + timer().endGpuTimer(); + + // Copy result back to host + cudaMemcpy(odata, dev_input, n * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(dev_input); + cudaFree(dev_output); + cudaFree(dev_falseBools); + cudaFree(dev_falseIndices); +} + +} // namespace Radix +} // namespace StreamCompaction \ No newline at end of file diff --git a/stream_compaction/radix.cuh b/stream_compaction/radix.cuh new file mode 100644 index 00000000..a6e28b90 --- /dev/null +++ b/stream_compaction/radix.cuh @@ -0,0 +1,17 @@ +#pragma once + +#include +#include + +#include "common.cuh" + +namespace StreamCompaction { +namespace Radix { + +StreamCompaction::Common::PerformanceTimer &timer(); + +void cpu_sort(int n, int *odata, const int *idata); +void sort(int n, int *odata, const int *idata); + +} // namespace Radix +} // namespace StreamCompaction \ No newline at end of file diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e7..d00cc013 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -3,8 +3,8 @@ #include #include #include -#include "common.h" -#include "thrust.h" +#include "common.cuh" +#include "thrust.cuh" namespace StreamCompaction { namespace Thrust { @@ -18,11 +18,20 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { + // Copy data over to device vectors + thrust::device_vector dv_in(idata, idata + n); + thrust::device_vector dv_out(n); + timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + // Run epic builtin exclusive scan + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + cudaDeviceSynchronize(); + timer().endGpuTimer(); + + // Copy output data back to host + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } } diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.cuh similarity index 89% rename from stream_compaction/thrust.h rename to stream_compaction/thrust.cuh index fe98206b..27f7bec8 100644 --- a/stream_compaction/thrust.h +++ b/stream_compaction/thrust.cuh @@ -1,6 +1,6 @@ #pragma once -#include "common.h" +#include "common.cuh" namespace StreamCompaction { namespace Thrust {