diff --git a/.clang-format b/.clang-format new file mode 100644 index 00000000..01cfae2d --- /dev/null +++ b/.clang-format @@ -0,0 +1,236 @@ +# yaml-language-server: $schema=https://json.schemastore.org/clang-format.json +--- +Language: Cpp +AccessModifierOffset: -4 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveAssignments: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveBitFields: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveDeclarations: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveMacros: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveShortCaseStatements: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCaseArrows: false + AlignCaseColons: false +AlignConsecutiveTableGenBreakingDAGArgColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveTableGenCondOperatorColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignConsecutiveTableGenDefinitionColons: + Enabled: false + AcrossEmptyLines: false + AcrossComments: false + AlignCompound: false + AlignFunctionPointers: false + PadOperators: false +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: + Kind: Never + OverEmptyLines: 1 +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowBreakBeforeNoexceptSpecifier: Never +AllowShortBlocksOnASingleLine: Always +AllowShortCaseExpressionOnASingleLine: true +AllowShortCaseLabelsOnASingleLine: true +AllowShortCompoundRequirementOnASingleLine: true +AllowShortEnumsOnASingleLine: true +AllowShortFunctionsOnASingleLine: Empty +AllowShortIfStatementsOnASingleLine: AllIfsAndElse +AllowShortLambdasOnASingleLine: All +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BitFieldColonSpacing: Both +BracedInitializerIndentWidth: 4 +BreakBeforeConceptDeclarations: Always +BreakAdjacentStringLiterals: true +BreakAfterAttributes: Leave +BreakAfterJavaFieldAnnotations: false +BreakAfterReturnType: ExceptShortType +BreakArrays: true +BreakBeforeBinaryOperators: All +BreakBeforeBraces: Allman +BreakBeforeInlineASMColon: OnlyMultiline +BreakBeforeTernaryOperators: true +BreakConstructorInitializers: BeforeComma +BreakFunctionDefinitionParameters: false +BreakInheritanceList: BeforeColon +BreakStringLiterals: true +BreakTemplateDeclarations: Yes +ColumnLimit: 100 +CommentPragmas: "^ IWYU pragma:" +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: false +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: Always +FixNamespaceComments: true +ForEachMacros: + - forever + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: "^ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/.gitignore b/.gitignore index a59ec565..d1638636 100644 --- a/.gitignore +++ b/.gitignore @@ -1,562 +1 @@ -cis565_getting_started -cis565_getting_started_generated_kernel* -*.orig -*.filters -*.sln -*.vcxproj -*.xcodeproj -build - -# Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode - -### Linux ### -*~ - -# KDE directory preferences -.directory - -# Linux trash folder which might appear on any partition or disk -.Trash-* - - -### OSX ### -.DS_Store -.AppleDouble -.LSOverride - -# Icon must end with two \r -Icon - -# Thumbnails -._* - -# Files that might appear in the root of a volume -.DocumentRevisions-V100 -.fseventsd -.Spotlight-V100 -.TemporaryItems -.Trashes -.VolumeIcon.icns - -# Directories potentially created on remote AFP share -.AppleDB -.AppleDesktop -Network Trash Folder -Temporary Items -.apdisk - - -### SublimeText ### -# cache files for sublime text -*.tmlanguage.cache -*.tmPreferences.cache -*.stTheme.cache - -# workspace files are user-specific -*.sublime-workspace - -# project files should be checked into the repository, unless a significant -# proportion of contributors will probably not be using SublimeText -# *.sublime-project - -# sftp configuration file -sftp-config.json - - -### Windows ### -# Windows image file caches -Thumbs.db -ehthumbs.db - -# Folder config file -Desktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Windows Installer files -*.cab -*.msi -*.msm -*.msp - -# Windows shortcuts -*.lnk - - -### JetBrains ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio - -*.iml - -## Directory-based project format: -#.idea/ -# if you remove the above rule, at least ignore the following: - -# User-specific stuff: -.idea/workspace.xml -.idea/tasks.xml -.idea/dictionaries - -# Sensitive or high-churn files: -.idea/dataSources.ids -.idea/dataSources.xml -.idea/sqlDataSources.xml -.idea/dynamic.xml -.idea/uiDesigner.xml - -# Gradle: -.idea/gradle.xml -.idea/libraries - -# Mongo Explorer plugin: -.idea/mongoSettings.xml - -## File-based project format: -*.ipr -*.iws - -## Plugin-specific files: - -# IntelliJ -/out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties - - -### Vim ### -[._]*.s[a-w][a-z] -[._]s[a-w][a-z] -*.un~ -Session.vim -.netrwhist -*~ - - -### Emacs ### -# -*- mode: gitignore; -*- -*~ -\#*\# -/.emacs.desktop -/.emacs.desktop.lock -*.elc -auto-save-list -tramp -.\#* - -# Org-mode -.org-id-locations -*_archive - -# flymake-mode -*_flymake.* - -# eshell files -/eshell/history -/eshell/lastdir - -# elpa packages -/elpa/ - -# reftex files -*.rel - -# AUCTeX auto folder -/auto/ - -# cask packages -.cask/ - - -### CMake ### -CMakeCache.txt -CMakeFiles -CMakeScripts -Makefile -cmake_install.cmake -install_manifest.txt - - -### C++ ### -# Compiled Object files -*.slo -*.lo -*.o -*.obj - -# Precompiled Headers -*.gch -*.pch - -# Compiled Dynamic libraries -*.so -*.dylib -*.dll - -# Fortran module files -*.mod - -# Compiled Static libraries -*.lai -*.la -*.a -*.lib - -# Executables -*.exe -*.out -*.app - - -### CUDA ### -*.i -*.ii -*.gpu -*.ptx -*.cubin -*.fatbin - - -### VisualStudio ### -## Ignore Visual Studio temporary files, build results, and -## files generated by popular Visual Studio add-ons. - -# User-specific files -*.suo -*.user -*.userosscache -*.sln.docstates - -# User-specific files (MonoDevelop/Xamarin Studio) -*.userprefs - -# Build results -[Dd]ebug/ -[Dd]ebugPublic/ -[Rr]elease/ -[Rr]eleases/ -x64/ -x86/ -build/ -bld/ -[Bb]in/ -[Oo]bj/ - -# Visual Studio 2015 cache/options directory -.vs/ -# Uncomment if you have tasks that create the project's static files in wwwroot -#wwwroot/ - -# MSTest test Results -[Tt]est[Rr]esult*/ -[Bb]uild[Ll]og.* - -# NUNIT -*.VisualState.xml -TestResult.xml - -# Build Results of an ATL Project -[Dd]ebugPS/ -[Rr]eleasePS/ -dlldata.c - -# DNX -project.lock.json -artifacts/ - -*_i.c -*_p.c -*_i.h -*.ilk -*.meta -*.obj -*.pch -*.pdb -*.pgc -*.pgd -*.rsp -*.sbr -*.tlb -*.tli -*.tlh -*.tmp -*.tmp_proj -*.log -*.vspscc -*.vssscc -.builds -*.pidb -*.svclog -*.scc - -# Chutzpah Test files -_Chutzpah* - -# Visual C++ cache files -ipch/ -*.aps -*.ncb -*.opensdf -*.sdf -*.cachefile - -# Visual Studio profiler -*.psess -*.vsp -*.vspx - -# TFS 2012 Local Workspace -$tf/ - -# Guidance Automation Toolkit -*.gpState - -# ReSharper is a .NET coding add-in -_ReSharper*/ -*.[Rr]e[Ss]harper -*.DotSettings.user - -# JustCode is a .NET coding add-in -.JustCode - -# TeamCity is a build add-in -_TeamCity* - -# DotCover is a Code Coverage Tool -*.dotCover - -# NCrunch -_NCrunch_* -.*crunch*.local.xml -nCrunchTemp_* - -# MightyMoose -*.mm.* -AutoTest.Net/ - -# Web workbench (sass) -.sass-cache/ - -# Installshield output folder -[Ee]xpress/ - -# DocProject is a documentation generator add-in -DocProject/buildhelp/ -DocProject/Help/*.HxT -DocProject/Help/*.HxC -DocProject/Help/*.hhc -DocProject/Help/*.hhk -DocProject/Help/*.hhp -DocProject/Help/Html2 -DocProject/Help/html - -# Click-Once directory -publish/ - -# Publish Web Output -*.[Pp]ublish.xml -*.azurePubxml -# TODO: Comment the next line if you want to checkin your web deploy settings -# but database connection strings (with potential passwords) will be unencrypted -*.pubxml -*.publishproj - -# NuGet Packages -*.nupkg -# The packages folder can be ignored because of Package Restore -**/packages/* -# except build/, which is used as an MSBuild target. -!**/packages/build/ -# Uncomment if necessary however generally it will be regenerated when needed -#!**/packages/repositories.config - -# Windows Azure Build Output -csx/ -*.build.csdef - -# Windows Store app package directory -AppPackages/ - -# Visual Studio cache files -# files ending in .cache can be ignored -*.[Cc]ache -# but keep track of directories ending in .cache -!*.[Cc]ache/ - -# Others -ClientBin/ -[Ss]tyle[Cc]op.* -~$* -*~ -*.dbmdl -*.dbproj.schemaview -*.pfx -*.publishsettings -node_modules/ -orleans.codegen.cs - -# RIA/Silverlight projects -Generated_Code/ - -# Backup & report files from converting an old project file -# to a newer Visual Studio version. Backup files are not needed, -# because we have git ;-) -_UpgradeReport_Files/ -Backup*/ -UpgradeLog*.XML -UpgradeLog*.htm - -# SQL Server files -*.mdf -*.ldf - -# Business Intelligence projects -*.rdl.data -*.bim.layout -*.bim_*.settings - -# Microsoft Fakes -FakesAssemblies/ - -# Node.js Tools for Visual Studio -.ntvs_analysis.dat - -# Visual Studio 6 build log -*.plg - -# Visual Studio 6 workspace options file -*.opt - -# Visual Studio LightSwitch build output -**/*.HTMLClient/GeneratedArtifacts -**/*.DesktopClient/GeneratedArtifacts -**/*.DesktopClient/ModelManifest.xml -**/*.Server/GeneratedArtifacts -**/*.Server/ModelManifest.xml -_Pvt_Extensions - - -### WebStorm ### -# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio - -*.iml - -## Directory-based project format: -.idea/ -# if you remove the above rule, at least ignore the following: - -# User-specific stuff: -# .idea/workspace.xml -# .idea/tasks.xml -# .idea/dictionaries - -# Sensitive or high-churn files: -# .idea/dataSources.ids -# .idea/dataSources.xml -# .idea/sqlDataSources.xml -# .idea/dynamic.xml -# .idea/uiDesigner.xml - -# Gradle: -# .idea/gradle.xml -# .idea/libraries - -# Mongo Explorer plugin: -# .idea/mongoSettings.xml - -## File-based project format: -*.ipr -*.iws - -## Plugin-specific files: - -# IntelliJ -/out/ - -# mpeltonen/sbt-idea plugin -.idea_modules/ - -# JIRA plugin -atlassian-ide-plugin.xml - -# Crashlytics plugin (for Android Studio and IntelliJ) -com_crashlytics_export_strings.xml -crashlytics.properties -crashlytics-build.properties - - -### Eclipse ### -*.pydevproject -.metadata -.gradle -bin/ -tmp/ -*.tmp -*.bak -*.swp -*~.nib -local.properties -.settings/ -.loadpath - -# Eclipse Core -.project - -# External tool builders -.externalToolBuilders/ - -# Locally stored "Eclipse launch configurations" -*.launch - -# CDT-specific -.cproject - -# JDT-specific (Eclipse Java Development Tools) -.classpath - -# Java annotation processor (APT) -.factorypath - -# PDT-specific -.buildpath - -# sbteclipse plugin -.target - -# TeXlipse plugin -.texlipse - - -### Xcode ### -# Xcode -# -# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore - -## Build generated -build/ -DerivedData - -## Various settings -*.pbxuser -!default.pbxuser -*.mode1v3 -!default.mode1v3 -*.mode2v3 -!default.mode2v3 -*.perspectivev3 -!default.perspectivev3 -xcuserdata - -## Other -*.xccheckout -*.moved-aside -*.xcuserstate +build/ \ No newline at end of file diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 00000000..c435160a --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,10 @@ +# Default ignored files +/shelf/ +/workspace.xml +# Editor-based HTTP Client requests +/httpRequests/ +# Ignored default folder with query files +/queries/ +# Datasource local storage ignored files +/dataSources/ +/dataSources.local.xml diff --git a/.idea/.name b/.idea/.name new file mode 100644 index 00000000..50b67a1a --- /dev/null +++ b/.idea/.name @@ -0,0 +1 @@ +stream_compaction \ No newline at end of file diff --git a/.idea/codeStyles/Project.xml b/.idea/codeStyles/Project.xml new file mode 100644 index 00000000..ad88c0bc --- /dev/null +++ b/.idea/codeStyles/Project.xml @@ -0,0 +1,112 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/codeStyles/codeStyleConfig.xml b/.idea/codeStyles/codeStyleConfig.xml new file mode 100644 index 00000000..307554b7 --- /dev/null +++ b/.idea/codeStyles/codeStyleConfig.xml @@ -0,0 +1,5 @@ + + + + \ No newline at end of file diff --git a/.idea/cuda-stream-compaction-library.iml b/.idea/cuda-stream-compaction-library.iml new file mode 100644 index 00000000..962e49fd --- /dev/null +++ b/.idea/cuda-stream-compaction-library.iml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/.idea/editor.xml b/.idea/editor.xml new file mode 100644 index 00000000..39b2b25f --- /dev/null +++ b/.idea/editor.xml @@ -0,0 +1,353 @@ + + + + + \ No newline at end of file diff --git a/.idea/encodings.xml b/.idea/encodings.xml new file mode 100644 index 00000000..df87cf95 --- /dev/null +++ b/.idea/encodings.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/Project_Default.xml b/.idea/inspectionProfiles/Project_Default.xml new file mode 100644 index 00000000..acfa6e5e --- /dev/null +++ b/.idea/inspectionProfiles/Project_Default.xml @@ -0,0 +1,12 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 00000000..443605b5 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,7 @@ + + + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 00000000..7b01d58d --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 00000000..abd8cd2f --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,9 @@ + + + + + + + + \ No newline at end of file diff --git a/.project b/.project deleted file mode 100644 index d95a4e38..00000000 --- a/.project +++ /dev/null @@ -1,27 +0,0 @@ - - - Project2-Stream-Compaction - - - - - - org.eclipse.cdt.managedbuilder.core.genmakebuilder - clean,full,incremental, - - - - - org.eclipse.cdt.managedbuilder.core.ScannerConfigBuilder - full,incremental, - - - - - - org.eclipse.cdt.core.cnature - org.eclipse.cdt.core.ccnature - org.eclipse.cdt.managedbuilder.core.managedBuildNature - org.eclipse.cdt.managedbuilder.core.ScannerConfigNature - - diff --git a/.vscode/launch.json b/.vscode/launch.json new file mode 100644 index 00000000..25bc73bd --- /dev/null +++ b/.vscode/launch.json @@ -0,0 +1,21 @@ +{ + "version": "0.2.0", + "configurations": [ + { + "type": "cmake", + "request": "launch", + "name": "CMake: Configure project", + "cmakeDebugType": "configure", + "clean": false, + "configureAll": false + }, + { + "name": "CUDA C++: Launch", + "type": "cuda-gdb", + "request": "launch", + "program": "${command:cmake.launchTargetPath}", + "cwd": "${workspaceFolder}", + "environment": [] + } + ] +} diff --git a/CMakeLists.txt b/CMakeLists.txt index 610c27d4..469acedd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,46 +1,88 @@ -cmake_minimum_required(VERSION 3.18) -project(cis5650_stream_compaction_test LANGUAGES CUDA CXX) +cmake_minimum_required(VERSION 3.24) +project(stream_compaction LANGUAGES CUDA CXX) set_property(GLOBAL PROPERTY USE_FOLDERS ON) +set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_CUDA_SEPARABLE_COMPILATION ON) # force compilation as separate objects -set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/lib) -set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) - -# Enable C++11 for host code -set(CMAKE_CXX_STANDARD 17) -set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CUDA_STANDARD 20) set(CMAKE_CUDA_STANDARD_REQUIRED ON) -# Set a default build type if none was specified -if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) - SET(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE) - # Set the possible values of build type for cmake-gui - SET_PROPERTY(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo") -endif() +# force consistent dynamic (as opposed to static) runtime +set(CMAKE_MSVC_RUNTIME_LIBRARY "MultiThreaded$<$:Debug>DLL") +set(CMAKE_CUDA_RUNTIME_LIBRARY Hybrid) -if(UNIX) - include_directories("${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}") -endif(UNIX) +list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake") add_subdirectory(stream_compaction) -include_directories(.) +# allow enable/disable of tests target +option(STREAM_COMPACTION_BUILD_TESTS "Build tests" OFF) +if(PROJECT_IS_TOP_LEVEL) + set(STREAM_COMPACTION_BUILD_TESTS ON) # default for `ExternalProject`, `FetchContent`, etc will be `OFF` +endif() + +# configure tests target +if(STREAM_COMPACTION_BUILD_TESTS) + set(STREAM_COMPACTION_TESTS_TARGET ${PROJECT_NAME}_tests) + set(STREAM_COMPACTION_TESTS_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/tests) -set(headers - "src/testing_helpers.hpp" + set(TESTS_HEADERS + "${STREAM_COMPACTION_TESTS_ROOT}/testing_helpers.h" + "${STREAM_COMPACTION_TESTS_ROOT}/test_utils.h" + ) + set(TESTS_SOURCES + "${STREAM_COMPACTION_TESTS_ROOT}/main.cpp" + "${STREAM_COMPACTION_TESTS_ROOT}/scan_tests.cpp" + "${STREAM_COMPACTION_TESTS_ROOT}/radix_sort_tests.cpp" + "${STREAM_COMPACTION_TESTS_ROOT}/stream_compaction_tests.cpp" ) -set(sources - "src/main.cpp" + add_executable(${STREAM_COMPACTION_TESTS_TARGET} ${TESTS_SOURCES} ${TESTS_HEADERS}) + + # configure GoogleTest + include(FetchContent) + FetchContent_Declare( + googletest + URL https://github.com/google/googletest/archive/refs/heads/main.zip ) -list(SORT headers) -list(SORT sources) + if(WIN32) + set(gtest_force_shared_crt ON CACHE BOOL "" FORCE) # force shared runtime + endif() + FetchContent_MakeAvailable(googletest) -source_group(Headers FILES ${headers}) -source_group(Sources FILES ${sources}) + enable_testing() -add_executable(${CMAKE_PROJECT_NAME} ${sources} ${headers}) -target_link_libraries(${CMAKE_PROJECT_NAME} stream_compaction) -set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${CMAKE_PROJECT_NAME}) + target_link_libraries(${STREAM_COMPACTION_TESTS_TARGET} PRIVATE stream_compaction::stream_compaction GTest::gtest_main) + + include(GoogleTest) + gtest_discover_tests(${STREAM_COMPACTION_TESTS_TARGET} DISCOVERY_MODE PRE_TEST DISCOVERY_TIMEOUT 100) # auto-discover tests for CTest + + if(CMAKE_GENERATOR MATCHES "Visual Studio") + # make tests target the startup project in VS + set_property(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} PROPERTY VS_STARTUP_PROJECT ${STREAM_COMPACTION_TESTS_TARGET}) + endif() + + if(WIN32) + # DLL must sit next to executable on Windows + add_custom_command(TARGET ${STREAM_COMPACTION_TESTS_TARGET} + POST_BUILD + COMMAND ${CMAKE_COMMAND} -E copy_if_different + $ + $ + ) + endif() +endif() + +if(MSVC) + # needed for proper intellisense when using VS or Rider with a VS solution + set(CMAKE_VS_SDK_INCLUDE_DIRECTORIES + "$(VC_IncludePath)" + "$(WindowsSDK_IncludePath)" + "${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}" + "${CMAKE_SOURCE_DIR}" +) +endif() diff --git a/CMakePresets.json b/CMakePresets.json new file mode 100644 index 00000000..79665476 --- /dev/null +++ b/CMakePresets.json @@ -0,0 +1,44 @@ +{ + "version": 10, + "configurePresets": [ + { + "name": "default_configure", + "displayName": "Default Configuration", + "hidden": true + }, + { + "name": "windows_configure", + "inherits": "default_configure", + "displayName": "Windows Default Configuration", + "binaryDir": "build", + "condition": { + "type": "equals", + "lhs": "${hostSystemName}", + "rhs": "Windows" + } + } + ], + "buildPresets": [ + { + "name": "windows_build", + "configurePreset": "windows_configure", + "displayName": "Windows Default Build" + } + ], + "workflowPresets": [ + { + "name": "windows_workflow", + "steps": [ + { + "type": "configure", + "name": "windows_configure" + }, + { + "type": "build", + "name": "windows_build" + } + ], + "displayName": "Windows Default Workflow" + } + ] +} \ No newline at end of file diff --git a/GNUmakefile b/GNUmakefile deleted file mode 100644 index 2b433114..00000000 --- a/GNUmakefile +++ /dev/null @@ -1,31 +0,0 @@ -CMAKE_ALT1 := /usr/local/bin/cmake -CMAKE_ALT2 := /Applications/CMake.app/Contents/bin/cmake -CMAKE := $(shell \ - which cmake 2>/dev/null || \ - ([ -e ${CMAKE_ALT1} ] && echo "${CMAKE_ALT1}") || \ - ([ -e ${CMAKE_ALT2} ] && echo "${CMAKE_ALT2}") \ - ) - -all: Release - - -Debug: build - (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make) - -MinSizeRel: build - (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make) - -Release: build - (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make) - -RelWithDebugInfo: build - (cd build && ${CMAKE} -DCMAKE_BUILD_TYPE=$@ .. && make) - - -build: - mkdir -p build - -clean: - ((cd build && make clean) 2>&- || true) - -.PHONY: all Debug MinSizeRel Release RelWithDebugInfo clean diff --git a/INSTRUCTION.md b/INSTRUCTION.md index f2fc82cd..096717de 100644 --- a/INSTRUCTION.md +++ b/INSTRUCTION.md @@ -1,5 +1,4 @@ -Project 2 Stream Compaction Instructions -======================== +# Project 2 Stream Compaction Instructions This is due **Tuesday, September 16 2025 at 11:59pm**. @@ -17,7 +16,7 @@ algorithms can benefit from massive parallelism and, in particular, data parallelism: executing the same code many times simultaneously with different data. -You'll implement a few different versions of the *Scan* (*Prefix Sum*) +You'll implement a few different versions of the _Scan_ (_Prefix Sum_) algorithm. First, you'll implement a CPU version of the algorithm to reinforce your understanding. Then, you'll write a few GPU implementations: "naive" and "work-efficient." Finally, you'll use some of these to implement GPU stream @@ -26,13 +25,13 @@ compaction. **Algorithm overview & details:** There are two primary references for details on the implementation of scan and stream compaction. -* The [slides on Parallel Algorithms](https://docs.google.com/presentation/d/1ETVONA7QDM-WqsEj4qVOGD6Kura5I6E9yqH-7krnwZ0/edit#slide=id.p126) +- The [slides on Parallel Algorithms](https://docs.google.com/presentation/d/1ETVONA7QDM-WqsEj4qVOGD6Kura5I6E9yqH-7krnwZ0/edit#slide=id.p126) for Scan, Stream Compaction, and Work-Efficient Parallel Scan. -* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html). +- GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html). - This online version contains a few small errors (in superscripting, missing braces, bad indentation, etc.) - We maintain a fix for this at [GPU Gem 3 Ch 39 Patch](https://github.com/CIS565-Fall-2017/Project2-Stream-Compaction/blob/master/INSTRUCTION.md#gpu-gem-3-ch-39-patch). If you find more errors in the chapter, welcome to open new pull requests to contribute. -* If you are still unclear after reading the steps, take a look at the last chapter - [Algorithm Examples](https://github.com/CIS565-Fall-2017/Project2-Stream-Compaction/blob/master/INSTRUCTION.md#algorithm-examples). -* [Recitation slides](https://docs.google.com/presentation/d/1daOnWHOjMp1sIqMdVsNnvEU1UYynKcEMARc_W6bGnqE/edit?usp=sharing) +- If you are still unclear after reading the steps, take a look at the last chapter - [Algorithm Examples](https://github.com/CIS565-Fall-2017/Project2-Stream-Compaction/blob/master/INSTRUCTION.md#algorithm-examples). +- [Recitation slides](https://docs.google.com/presentation/d/1daOnWHOjMp1sIqMdVsNnvEU1UYynKcEMARc_W6bGnqE/edit?usp=sharing) Your GPU stream compaction implementation will live inside of the `stream_compaction` subproject. In this way, you will be able to easily copy it @@ -49,11 +48,11 @@ computers in Moore 100 or the SIG Lab which have supported GPUs. ### Useful existing code -* `stream_compaction/common.h` - * `checkCUDAError` macro: checks for CUDA errors and exits if there were any. - * `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. -* `main.cpp` - * Some testing code for your implementations. +- `stream_compaction/common.h` + - `checkCUDAError` macro: checks for CUDA errors and exits if there were any. + - `ilog2ceil(x)`: computes the ceiling of log2(x), as an integer. +- `main.cpp` + - Some testing code for your implementations. **Note 1:** The tests will simply compare against your CPU implementation Do it first! @@ -70,10 +69,10 @@ value for the other tests. In `stream_compaction/cpu.cu`, implement: -* `StreamCompaction::CPU::scan`: compute an exclusive prefix sum. For performance comparison, this is supposed to be a simple `for` loop. But for better understanding before starting moving to GPU, you can simulate the GPU scan in this function first. -* `StreamCompaction::CPU::compactWithoutScan`: stream compaction without using +- `stream_compaction::CPU::scan`: compute an exclusive prefix sum. For performance comparison, this is supposed to be a simple `for` loop. But for better understanding before starting moving to GPU, you can simulate the GPU scan in this function first. +- `stream_compaction::CPU::compactWithoutScan`: stream compaction without using the `scan` function. -* `StreamCompaction::CPU::compactWithScan`: stream compaction using the `scan` +- `stream_compaction::CPU::compactWithScan`: stream compaction using the `scan` function. Map the input array to an array of 0s and 1s, scan it, and use scatter to produce the output. You will need a **CPU** scatter implementation for this (see slides or GPU Gems chapter for an explanation). @@ -82,7 +81,7 @@ These implementations should only be a few lines long. ## Part 2: Naive GPU Scan Algorithm -In `stream_compaction/naive.cu`, implement `StreamCompaction::Naive::scan` +In `stream_compaction/naive.cu`, implement `stream_compaction::Naive::scan` This uses the "Naive" algorithm from GPU Gems 3, Section 39.2.1. Example 39-1 uses shared memory. This is not required in this project. You can simply use global memory. As a result of this, you will have to do `ilog2ceil(n)` separate kernel invocations. @@ -102,50 +101,51 @@ Be sure to test non-power-of-two-sized arrays. ### 3.1. Scan In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::scan` +`stream_compaction::Efficient::scan` Most of the text in Part 2 applies. -* This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. -* This can be done in place - it doesn't suffer from the race conditions of +- This uses the "Work-Efficient" algorithm from GPU Gems 3, Section 39.2.2. +- This can be done in place - it doesn't suffer from the race conditions of the naive method, since there won't be a case where one thread writes to and another thread reads from the same location in the array. -* Beware of errors in Example 39-2. -* Test non-power-of-two-sized arrays. +- Beware of errors in Example 39-2. +- Test non-power-of-two-sized arrays. Since the work-efficient scan operates on a binary tree structure, it works best with arrays with power-of-two length. Make sure your implementation works on non-power-of-two sized arrays (see `ilog2ceil`). This requires extra memory + - your intermediate array sizes will need to be rounded to the next power of -two. + two. ### 3.2. Stream Compaction This stream compaction method will remove `0`s from an array of `int`s. In `stream_compaction/efficient.cu`, implement -`StreamCompaction::Efficient::compact` +`stream_compaction::Efficient::compact` For compaction, you will also need to implement the scatter algorithm presented in the slides and the GPU Gems chapter. In `stream_compaction/common.cu`, implement these for use in `compact`: -* `StreamCompaction::Common::kernMapToBoolean` -* `StreamCompaction::Common::kernScatter` +- `stream_compaction::common::kernMapToBoolean` +- `stream_compaction::common::kernScatter` ## Part 4: Using Thrust's Implementation In `stream_compaction/thrust.cu`, implement: -* `StreamCompaction::Thrust::scan` +- `stream_compaction::Thrust::scan` This should be a very short function which wraps a call to the Thrust library function `thrust::exclusive_scan(first, last, result)`. To measure timing, be sure to exclude memory operations by passing `exclusive_scan` a `thrust::device_vector` (which is already allocated on the -GPU). You can create a `thrust::device_vector` by creating a +GPU). You can create a `thrust::device_vector` by creating a `thrust::host_vector` from the given pointer, then casting it. For thrust stream compaction, take a look at [thrust::remove_if](https://thrust.github.io/doc/group__stream__compaction.html). It's not required to analyze `thrust::remove_if` but you're encouraged to do so. @@ -159,6 +159,7 @@ Though it is totally acceptable for this assignment, In addition to explain the reason of this phenomena, you are encouraged to try to upgrade your work-efficient gpu scan. Thinking about these may lead you to an aha moment: + - What is the occupancy at a deeper level in the upper/down sweep? Are most threads actually working? - Are you always launching the same number of blocks throughout each level of the upper/down sweep? - If some threads are being lazy, can we do an early termination on them? @@ -199,31 +200,29 @@ Always profile with Release mode builds and run without debugging. ### Questions -* Roughly optimize the block sizes of each of your implementations for minimal +- Roughly optimize the block sizes of each of your implementations for minimal run time on your GPU. - * (You shouldn't compare unoptimized implementations to each other!) + - (You shouldn't compare unoptimized implementations to each other!) -* Compare all of these GPU Scan implementations (Naive, Work-Efficient, and +- Compare all of these GPU Scan implementations (Naive, Work-Efficient, and Thrust) to the serial CPU version of Scan. Plot a graph of the comparison (with array size on the independent axis). - * We wrapped up both CPU and GPU timing functions as a performance timer class for you to conveniently measure the time cost. - * We use `std::chrono` to provide CPU high-precision timing and CUDA event to measure the CUDA performance. - * For CPU, put your CPU code between `timer().startCpuTimer()` and `timer().endCpuTimer()`. - * For GPU, put your CUDA code between `timer().startGpuTimer()` and `timer().endGpuTimer()`. Be sure **not** to include any *initial/final* memory operations (`cudaMalloc`, `cudaMemcpy`) in your performance measurements, for comparability. - * Don't mix up `CpuTimer` and `GpuTimer`. - * To guess at what might be happening inside the Thrust implementation (e.g. - allocation, memory copy), take a look at the Nsight timeline for its - execution. Your analysis here doesn't have to be detailed, since you aren't - even looking at the code for the implementation. - -* Write a brief explanation of the phenomena you see here. - * Can you find the performance bottlenecks? Is it memory I/O? Computation? Is - it different for each implementation? - -* Paste the output of the test program into a triple-backtick block in your + - We wrapped up both CPU and GPU timing functions as a performance timer class for you to conveniently measure the time cost. + - We use `std::chrono` to provide CPU high-precision timing and CUDA event to measure the CUDA performance. + - Don't mix up CPU timers and GPU timers. + - To guess at what might be happening inside the Thrust implementation (e.g. + allocation, memory copy), take a look at the Nsight timeline for its + execution. Your analysis here doesn't have to be detailed, since you aren't + even looking at the code for the implementation. + +- Write a brief explanation of the phenomena you see here. + - Can you find the performance bottlenecks? Is it memory I/O? Computation? Is + it different for each implementation? + +- Paste the output of the test program into a triple-backtick block in your README. - * If you add your own tests (e.g. for radix sort or to test additional corner - cases), be sure to mention it explicitly. + - If you add your own tests (e.g. for radix sort or to test additional corner + cases), be sure to mention it explicitly. These questions should help guide you in performance analysis on future assignments, as well. @@ -238,72 +237,72 @@ Open a GitHub pull request so that we can see that you have finished. The title should be "Project 2: YOUR NAME". The template of the comment section of your pull request is attached below, you can do some copy and paste: -* [Repo Link](https://link-to-your-repo) -* (Briefly) Mentions features that you've completed. Especially those bells and whistles you want to highlight - * Feature 0 - * Feature 1 - * ... -* Feedback on the project itself, if any. +- [Repo Link](https://link-to-your-repo) +- (Briefly) Mentions features that you've completed. Especially those bells and whistles you want to highlight + - Feature 0 + - Feature 1 + - ... +- Feedback on the project itself, if any. ## GPU Gem 3 Ch 39 Patch -* Example 1 -![](img/example-1.png) +- Example 1 + ![](img/example-1.png) -* Example 2 -![](img/example-2.jpg) +- Example 2 + ![](img/example-2.jpg) -* Figure-39-4 -![](img/figure-39-4.jpg) +- Figure-39-4 + ![](img/figure-39-4.jpg) -* Figure-39-2. This image shows an naive inclusive scan. We should convert this to an exclusive one for compaction. -![](img/figure-39-2.jpg) +- Figure-39-2. This image shows an naive inclusive scan. We should convert this to an exclusive one for compaction. + ![](img/figure-39-2.jpg) ## Algorithm Examples -* scan: - - goal: produce a prefix sum array of a given array (we only care about exclusive scan here) - - input - - [1 5 0 1 2 0 3] - - output - - [0 1 6 6 7 9 9] -* compact: - - goal: closely and neatly packed the elements != 0 - - input - - [1 5 0 1 2 0 3] - - output - - [1 5 1 2 3] -* compactWithoutScan (CPU) - - an implementation of compact. So the goal, input and output should all be the same as compact - - Simply loop through the input array, meanwhile maintain a pointer indicating which address shall we put the next non-zero element -* compactWithScan (CPU/GPU) - - an implementation of compact. So the goal, input and output should all be the same as compact - - 3 steps - - map - + goal: map our original data array (integer, Light Ray, etc) to a bool array - + input +- scan: + - goal: produce a prefix sum array of a given array (we only care about exclusive scan here) + - input + - [1 5 0 1 2 0 3] + - output + - [0 1 6 6 7 9 9] +- compact: + - goal: closely and neatly packed the elements != 0 + - input - [1 5 0 1 2 0 3] - + output - - [1 1 0 1 1 0 1] - - scan - + take the output of last step as input - + input - - [1 1 0 1 1 0 1] - + output - - [0 1 2 2 3 4 4] - - scatter - + preserve non-zero elements and compact them into a new array - + input: - + original array - - [1 5 0 1 2 0 3] - + mapped array - - [1 1 0 1 1 0 1] - + scanned array - - [0 1 2 2 3 4 4] - + output: - - [1 5 1 2 3] - + This can be done in parallel on GPU - + You can try multi-threading on CPU if you want (not required and not our focus) - + for each element input[i] in original array - - if it's non-zero (given by mapped array) - - then put it at output[index], where index = scanned[i] + - output + - [1 5 1 2 3] +- compactWithoutScan (CPU) + - an implementation of compact. So the goal, input and output should all be the same as compact + - Simply loop through the input array, meanwhile maintain a pointer indicating which address shall we put the next non-zero element +- compactWithScan (CPU/GPU) + - an implementation of compact. So the goal, input and output should all be the same as compact + - 3 steps + - map + - goal: map our original data array (integer, Light Ray, etc) to a bool array + - input + - [1 5 0 1 2 0 3] + - output + - [1 1 0 1 1 0 1] + - scan + - take the output of last step as input + - input + - [1 1 0 1 1 0 1] + - output + - [0 1 2 2 3 4 4] + - scatter + - preserve non-zero elements and compact them into a new array + - input: + - original array + - [1 5 0 1 2 0 3] + - mapped array + - [1 1 0 1 1 0 1] + - scanned array + - [0 1 2 2 3 4 4] + - output: + - [1 5 1 2 3] + - This can be done in parallel on GPU + - You can try multi-threading on CPU if you want (not required and not our focus) + - for each element input[i] in original array + - if it's non-zero (given by mapped array) + - then put it at output[index], where index = scanned[i] diff --git a/README.md b/README.md index 0e38ddb1..f01dde9d 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,285 @@ -CUDA Stream Compaction -====================== +# CUDA Parallel Scan Algorithm and Stream Compaction Implementation -**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** +University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 2 -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +- Amy Liu + - [Personal Website](https://amyliu.dev), [LinkedIn](https://linkedin.com/in/miyalana), [Github](https://github.com/mialana). +- Tested on: Fedora 42 KDE Plasma, Wayland Protocol, Optimus GPU (Intel(R) Core(TM) Ultra 9 275HX 32GiB, NVIDIA GeForce RTX 5070Ti 12227MiB) -### (TODO: Your README) +## Project Description -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +This project implements the **stream compaction** algorithm in CUDA. The goal of stream compaction is simply to remove `0`s from an array of integers. In a pathtracer, this can be used to remove terminated paths from the active ray pool, an important performance optimization. Furthermore, stream compaction relies on the execution of **scan** -- a **prefix sum** operation -- as a pre-processing step. +In this project, I implemented: + +- A **CPU baseline implementation** of scan and compaction. +- A **Naive CUDA scan** using repeated passes and global memory. +- A **Work-Efficient CUDA scan** using an up-sweep/down-sweep balanced tree method. +- A **CUDA stream compaction** method built on top of the work-efficient scan. + +## Implementations Breakdown + +### CPU Scan & Compaction + +- **Scan**: A quite straightforward exclusive prefix sum implementation using a sequential for-loop. +- **Compaction (without scan)**: Linearly traverses the array, writing only nonzero elements into the output. +- **Compaction (with scan)**: Converts the array to a boolean mask, performs a scan to compute write indices, and scatters nonzero values. Mirrors the CUDA design but slower due to sequential execution. + +### Naive CUDA Scan + +- Performs `ilog2ceil(n)` iterations. +- In each iteration, every thread reads from the input array and writes to a separate output array with an increasing offset. +- Arrays are swapped between passes. +- **Downside**: Requires multiple full passes over global memory. Performance is sensitive to block configuration and can hit _performance cliffs_ when register or memory usage reduces parallel occupancy. + +### Work-Efficient CUDA Scan + +- Uses the Blelloch scan: + 1. **Up-sweep (reduce)**: Build a binary tree of partial sums. + 2. **Down-sweep**: Propagate prefix sums back down the tree to form the exclusive scan. +- Can be performed in-place, avoiding extra arrays. +- Padding is used for non-power-of-two sizes. +- Much more efficient because each element is touched `O(log n)` times, rather than `O(n log n)` total work. + +### CUDA Stream Compaction + +- Built on the work-efficient scan: + 1. **Map** input values to booleans. + 2. **Scan** to compute indices. + 3. **Scatter** to compact values into a dense output. +- Fully parallel and scalable. + +### Thrust Scan + +- A wrapper around `thrust::exclusive_scan`. +- Provides correctness but introduces extra overhead (allocation, dispatch, abstraction layers). +- Useful as a reference implementation, but slower for most array sizes. + +--- + +## Performance Results + +1. Elapsed Time to Block Size (N = 262,144) + +For a fixed array size of 262,144, I measured how block size affects scan performance. + +| Block Size | CPU (ms) | Naive (ms) | Work-Efficient (ms) | Thrust (ms) | +| ---------- | -------- | ---------- | ------------------- | ----------- | +| 8 | 0.318709 | 1.22598 | 0.2816 | 2.09114 | +| 16 | 0.339354 | 0.790464 | 0.1536 | 2.05978 | +| 32 | 0.329103 | 0.580896 | 0.139808 | 1.20006 | +| 128 | 0.342237 | 2.14592 | 0.26032 | 2.06634 | +| 256 | 0.331082 | 0.4792 | 0.246752 | 2.13478 | +| 512 | 0.321641 | 2.85651 | 0.239296 | 2.04694 | +| 1024 | 0.335945 | 0.504672 | 0.26576 | 2.13779 | + +![scan blocksize](docs/assets/scan_blocksize.png) + +- Efficient scan was fastest at 32–512 threads, with an especially optimal spot near 32–256. +- **Note:** Naive scan was unstable due to _performance cliffs_ (detailed below). +- Thrust performance stayed relatively flat and consistently higher overhead than custom efficient scan. +- CPU time was insensitive to “block size” since it’s single-threaded, included as a baseline. + +--- + +2. Elapsed Time for Scan (Power-of-Two Sizes, Block Size of 16) + +| N | CPU (ms) | Naive (ms) | Work-Efficient (ms) | Thrust (ms) | +| ---------- | --------- | ---------- | ------------------- | ----------- | +| 64 | 0.000146 | 0.134752 | 0.102656 | 0.048384 | +| 256 | 0.000081 | 0.020032 | 0.035904 | 0.015776 | +| 1,024 | 0.000215 | 0.023872 | 0.044480 | 0.017056 | +| 4,096 | 0.000862 | 0.029248 | 0.052896 | 0.013088 | +| 16,384 | 0.003271 | 0.033248 | 0.063680 | 0.017504 | +| 65,536 | 0.013142 | 0.216288 | 0.078528 | 0.022432 | +| 262,144 | 0.132031 | 0.481088 | 0.203040 | 0.080640 | +| 1,048,576 | 1.756160 | 1.441090 | 0.383552 | 0.160512 | +| 4,194,304 | 3.965870 | 7.939580 | 0.653216 | 0.138400 | +| 16,777,216 | 10.041000 | 21.344200 | 2.688540 | 0.321024 | +| 67,108,864 | 35.037100 | 99.206900 | 11.000500 | 1.465890 | + +![scan performance](docs/assets/scan_performance.png) + +- CPU dominates at tiny N but scales linearly and becomes slow by ~16M+. +- Naive CUDA scan is competitive at ~1M–4M but grows less efficient due to repeated full passes over global memory. +- Work-Efficient scan consistently outperforms Naive for large N and shows best scalability. +- Thrust scan suffers heavy overhead and only “catches up” at huge sizes, still slower than custom efficient implementation. + +--- + +3. Elapsed Time for Scan (Non-Power-of-Two Sizes, Block Size of 16) + +| N | CPU (ms) | Naive (ms) | Work-Efficient (ms) | Thrust (ms) | +| ---------- | --------- | ---------- | ------------------- | ----------- | +| 61 | 0.000044 | 0.017664 | 0.029760 | 0.016928 | +| 253 | 0.000063 | 0.019936 | 0.036512 | 0.012896 | +| 1,021 | 0.000215 | 0.024704 | 0.044544 | 0.016256 | +| 4,093 | 0.000812 | 0.027968 | 0.074656 | 0.012768 | +| 16,381 | 0.003308 | 0.032448 | 0.060960 | 0.016160 | +| 65,533 | 0.012721 | 0.079456 | 0.078720 | 0.017376 | +| 262,141 | 0.196031 | 0.265952 | 0.205312 | 0.067072 | +| 1,048,573 | 1.340560 | 1.336480 | 0.283712 | 0.122976 | +| 4,194,301 | 3.897540 | 5.601060 | 0.649696 | 0.132608 | +| 16,777,213 | 7.871300 | 22.084500 | 2.671620 | 0.321632 | +| 67,108,861 | 29.286100 | 96.962200 | 14.517200 | 1.468770 | + +![scan non power of 2](docs/assets/scan_non_power2.png) + +- Padding logic works — work-efficient scan correctly handles arbitrary sizes. +- Performance trend mirrors power-of-two results. + +--- + +4. Elapsed Time for Compaction (Power-of-Two Sizes, Block Size of 16) + +| N | CPU (No Scan) (ms) | CPU (With Scan) (ms) | Work-Eff CUDA (ms) | +| ---------- | ------------------ | -------------------- | ------------------ | +| 64 | 0.000205 | 0.000428 | 0.079904 | +| 256 | 0.000573 | 0.000715 | 0.073536 | +| 1,024 | 0.001745 | 0.003360 | 0.084352 | +| 4,096 | 0.006417 | 0.012418 | 0.097120 | +| 16,384 | 0.024499 | 0.052908 | 0.183584 | +| 65,536 | 0.103375 | 0.264867 | 0.616352 | +| 262,144 | 0.412454 | 0.881458 | 1.587170 | +| 1,048,576 | 2.178310 | 5.862320 | 8.874690 | +| 4,194,304 | 7.599850 | 22.914500 | 21.235400 | +| 16,777,216 | 28.617200 | 85.529100 | 65.237300 | +| 67,108,864 | 113.149000 | 427.137000 | 253.671000 | + +![compaction performance](docs/assets/compaction_performance.png) + +- CPU methods are very fast until ~1M elements. +- Work-efficient CUDA compaction scales much better and overtakes CPU beyond ~1M. +- CPU compact-with-scan is always slower than CPU without-scan because of extra overhead, but matches GPU structure more closely. +- The longer CUDA elapsed times are most likely due to many CPU–GPU data transfers. + +--- + +## Discussion + +- **CPU vs GPU tradeoff**: + - CPU wins at small N (sub-65k). + - CUDA (work-efficient) dominates at large N. +- **Naive scan inefficiency**: + - Requires multiple kernel passes with global memory reads/writes. + - Very sensitive to block size: some configurations unexpectedly slow down. + - This is an example of a **performance cliff** — increasing resource usage (registers, memory, or block occupancy) can suddenly reduce parallelism. + - Unless the extra resource usage hides global memory latency, performance drops sharply. +- **Work-Efficient scan**: + - Best performer overall, both in correctness and scaling. + - Handles non-powers via padding without performance collapse. +- **Thrust scan**: + - Significant overhead from allocations and abstraction layers. + - Competitive only at very large N, but still slower than custom efficient implementation. + - When profiling thrust::exclusive_scan with Nsight, I believe the timeline shows significant overhead from extra kernel launches and temporary memory allocations inside Thrust. These allocations (device_vector construction, dispatch setup) are not present in the custom implementations and explain why Thrust lags behind for small and medium array sizes, despite its optimized kernels. For very large arrays, the cost of setup is amortized, and Thrust performs competitively. +- **Compaction**: + - Mirrors scan performance — CUDA methods need large N to be worthwhile. + - For small arrays, CPU is much cheaper due to launch overhead. +- **Extra Credit Part 5**: + - Confirmed GPU underutilization at deeper tree levels. + - Optimizations (adjusting thread launches) improved scaling. + - Demonstrates importance of occupancy tuning for CUDA algorithms. + +**Q: What is the performance bottleneck?** + +- For the naive scan: memory I/O and repeated kernel launches. +- For the efficient scan: thread underutilization in deeper tree levels. +- For Thrust: library overhead (allocations, internal dispatch). +- For CPU: bottleneck is purely computation, but with fast cache locality. + +**Q: Is memory I/O or computation dominant?** + +- Memory I/O dominates in CUDA scans due to repeated global memory accesses. +- Computation is relatively cheap compared to memory latency. + +--- + +### Code Testing Output Preview + +A quick preview of test results, running at array size = `2^28` and block size = `32`. +While developing the program, I added extra tests that allowed me to debug scan and compact implementation, which are labelled with a keyword "consecutive" below. + +- **Note:** At very large N, CPU prefix sums overflow 32-bit integers; this does not affect relative performance trends. + +``` +**************** +** SCAN TESTS ** +**************** +==== consecutive array (input) ==== +   [   0   1   2   3   4   5   6   7   8   9  10  11  12 ... 67108862 67108863 ] - count: 67108864 +==== cpu scan, power-of-two, consecutive-valued array ==== +  elapsed time: 38.698ms    (std::chrono Measured) +   [   0   0   1   3   6  10  15  21  28  36  45  55  66 ... -167772157 -100663295 ] - count: 67108864 +==== a array (input) ==== +   [  25  44  21  17  33   5  26  24   0  12   3  41  22 ...  18   0 ] - count: 67108864 +==== cpu scan, power-of-two ==== +  elapsed time: 37.6763ms    (std::chrono Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +==== cpu scan, non-power-of-two ==== +  elapsed time: 40.7901ms    (std::chrono Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318718 1644318745 ] - count: 67108861 +   passed   +==== naive scan, power-of-two ==== +  elapsed time: 99.19ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +   passed   +==== 1s array for finding bugs ==== +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +==== naive scan, non-power-of-two ==== +  elapsed time: 96.7852ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ...   0   0 ] - count: 67108864 +   passed   +==== work-efficient scan, power-of-two, consecutive-valued array ==== +  elapsed time: 11.1004ms    (CUDA Measured) +   [   0   0   1   3   6  10  15  21  28  36  45  55  66 ... -167772157 -100663295 ] - count: 67108864 +   passed   +==== work-efficient scan, power-of-two ==== +  elapsed time: 10.9568ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +   passed   +==== work-efficient scan, non-power-of-two ==== +  elapsed time: 16.7434ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318718 1644318745 ] - count: 67108861 +   passed   +==== thrust scan, power-of-two ==== +  elapsed time: 1.48512ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318833 1644318851 ] - count: 67108864 +   passed   +==== thrust scan, non-power-of-two ==== +  elapsed time: 1.48096ms    (CUDA Measured) +   [   0  25  69  90 107 140 145 171 195 195 207 210 251 ... 1644318718 1644318745 ] - count: 67108861 +   passed   + +***************************** +** STREAM COMPACTION TESTS ** +***************************** +   [   1   3   2   0   3   0   3   3   1   2   3   2   0 ...   0   0 ] - count: 67108864 +==== cpu compact without scan, power-of-two ==== +  elapsed time: 113.965ms    (std::chrono Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   3   2 ] - count: 50326372 +   passed   +==== cpu compact without scan, non-power-of-two ==== +  elapsed time: 114.697ms    (std::chrono Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   1   3 ] - count: 50326371 +   passed   +==== cpu compact with scan ==== +  elapsed time: 376.529ms    (std::chrono Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   3   2 ] - count: 50326372 +   passed   +==== cpu compact with scan, non-power-of-two ==== +  elapsed time: 445.05ms    (std::chrono Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   1   3 ] - count: 50326371 +   passed   +==== work-efficient compact, power-of-two ==== +  elapsed time: 264.04ms    (CUDA Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   3   2 ] - count: 50326372 +   passed   +==== work-efficient compact, non-power-of-two ==== +  elapsed time: 262.06ms    (CUDA Measured) +   [   1   3   2   3   3   3   1   2   3   2   1   1   1 ...   1   3 ] - count: 50326371 +   passed +``` + +Also, I added some colors to the console output for more straight-forward debugging: + +![test output](docs/assets/test_output.png) diff --git a/cis565_stream_compaction_test.launch b/cis565_stream_compaction_test.launch deleted file mode 100644 index 4267429a..00000000 --- a/cis565_stream_compaction_test.launch +++ /dev/null @@ -1,27 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/assets/compaction_performance.png b/docs/assets/compaction_performance.png new file mode 100644 index 00000000..553272dd Binary files /dev/null and b/docs/assets/compaction_performance.png differ diff --git a/img/example-1.png b/docs/assets/example-1.png similarity index 100% rename from img/example-1.png rename to docs/assets/example-1.png diff --git a/img/example-2.jpg b/docs/assets/example-2.jpg similarity index 100% rename from img/example-2.jpg rename to docs/assets/example-2.jpg diff --git a/img/figure-39-2.jpg b/docs/assets/figure-39-2.jpg similarity index 100% rename from img/figure-39-2.jpg rename to docs/assets/figure-39-2.jpg diff --git a/img/figure-39-4.jpg b/docs/assets/figure-39-4.jpg similarity index 100% rename from img/figure-39-4.jpg rename to docs/assets/figure-39-4.jpg diff --git a/docs/assets/scan_blocksize.png b/docs/assets/scan_blocksize.png new file mode 100644 index 00000000..ec6863bf Binary files /dev/null and b/docs/assets/scan_blocksize.png differ diff --git a/docs/assets/scan_non_power2.png b/docs/assets/scan_non_power2.png new file mode 100644 index 00000000..2a8ccd51 Binary files /dev/null and b/docs/assets/scan_non_power2.png differ diff --git a/docs/assets/scan_performance.png b/docs/assets/scan_performance.png new file mode 100644 index 00000000..59187253 Binary files /dev/null and b/docs/assets/scan_performance.png differ diff --git a/docs/assets/test_output.png b/docs/assets/test_output.png new file mode 100644 index 00000000..8b722958 Binary files /dev/null and b/docs/assets/test_output.png differ diff --git a/scripts/format.ps1 b/scripts/format.ps1 new file mode 100644 index 00000000..06c8d86b --- /dev/null +++ b/scripts/format.ps1 @@ -0,0 +1,3 @@ +Get-ChildItem -Recurse -Path tests, stream_compaction ` + -Include *.h, *.cpp, *.inl, *.cu | + ForEach-Object { clang-format -i $_.FullName} diff --git a/scripts/graph_13.py b/scripts/graph_13.py new file mode 100644 index 00000000..e9010a88 --- /dev/null +++ b/scripts/graph_13.py @@ -0,0 +1,73 @@ +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.ticker import EngFormatter + +formatter = EngFormatter(unit="") + +# ------------------------- +# Non-power-of-two Scan Results +# ------------------------- +scan_sizes = np.array([61, 253, 1021, 4093, 16381, 65533, + 262141, 1048573, 4194301, 16777213, 67108861]) +cpu_scan = [0.000044, 0.000063, 0.000215, 0.000812, 0.003308, + 0.012721, 0.196031, 1.340560, 3.897540, 7.871300, 29.286100] +naive_scan = [0.017664, 0.019936, 0.024704, 0.027968, 0.032448, + 0.079456, 0.265952, 1.336480, 5.601060, 22.084500, 96.962200] +eff_scan = [0.029760, 0.036512, 0.044544, 0.074656, 0.060960, + 0.078720, 0.205312, 0.283712, 0.649696, 2.671620, 14.517200] +thrust_scan = [0.016928, 0.012896, 0.016256, 0.012768, 0.016160, + 0.017376, 0.067072, 0.122976, 0.132608, 0.321632, 1.468770] + +plt.figure(figsize=(8,6)) +plt.plot(scan_sizes, cpu_scan, marker='o', label="CPU Scan") +plt.plot(scan_sizes, naive_scan, marker='o', label="Naive GPU Scan") +plt.plot(scan_sizes, eff_scan, marker='o', label="Work-Efficient GPU Scan") +plt.plot(scan_sizes, thrust_scan, marker='o', label="Thrust Scan") + +plt.xscale("log") +plt.yscale("log") +plt.xlabel("Array Size (N)") +plt.ylabel("Time (ms)") +plt.title("Scan Performance (Non-Power-of-Two)") +plt.legend() +plt.grid(True, which="both", ls="--", linewidth=0.5) +plt.gca().xaxis.set_major_formatter(formatter) +plt.ticklabel_format(style='plain', axis='x') + +plt.tight_layout() +plt.savefig("scan_non_power2.png", dpi=200) + +# ------------------------- +# Block Size Sweep (N = 262,144) +# ------------------------- +block_sizes = np.array([8, 16, 32, 128, 256, 512, 1024]) + +cpu_block = [0.318709, 0.339354, 0.329103, 0.342237, + 0.331082, 0.321641, 0.335945] + +naive_block = [1.225980, 0.790464, 0.580896, 2.145920, + 0.479200, 2.856510, 0.504672] + +eff_block = [0.281600, 0.153600, 0.139808, 0.260320, + 0.246752, 0.239296, 0.265760] + +thrust_block = [2.091140, 2.059780, 1.200060, 2.066340, + 2.134780, 2.046940, 2.137790] + + +plt.figure(figsize=(8,6)) +plt.plot(block_sizes, cpu_block, marker='o', label="CPU Scan") +plt.plot(block_sizes, naive_block, marker='o', label="Naive GPU Scan") +plt.plot(block_sizes, eff_block, marker='o', label="Work-Efficient GPU Scan") +plt.plot(block_sizes, thrust_block, marker='o', label="Thrust Scan") + +plt.xlabel("Block Size") +plt.ylabel("Time (ms)") +plt.title("Elapsed Time to Block Size (N=262,144, less is better)") +plt.legend() +plt.grid(True, ls="--", linewidth=0.5) + +plt.tight_layout() +plt.savefig("scan_blocksize.png", dpi=200) + +plt.show() diff --git a/scripts/graph_24.py b/scripts/graph_24.py new file mode 100644 index 00000000..ec60e6ac --- /dev/null +++ b/scripts/graph_24.py @@ -0,0 +1,82 @@ +import matplotlib.pyplot as plt +import numpy as np +from matplotlib.ticker import EngFormatter + +formatter = EngFormatter(unit="") # no extra unit + +# ------------------------- +# Data from your tables +# ------------------------- + +# Power-of-two scan results +scan_sizes = np.array([64, 256, 1024, 4096, 16384, 65536, + 262144, 1048576, 4194304, 16777216, 67108864]) +cpu_scan = [0.000146, 0.000081, 0.000215, 0.000862, 0.003271, + 0.013142, 0.132031, 1.756160, 3.965870, 10.041000, 35.037100] +naive_scan = [0.134752, 0.020032, 0.023872, 0.029248, 0.033248, + 0.216288, 0.481088, 1.441090, 7.939580, 21.344200, 99.206900] +eff_scan = [0.102656, 0.035904, 0.044480, 0.052896, 0.063680, + 0.078528, 0.203040, 0.383552, 0.653216, 2.688540, 11.000500] +thrust_scan = [0.048384, 0.015776, 0.017056, 0.013088, 0.017504, + 0.022432, 0.080640, 0.160512, 0.138400, 0.321024, 1.465890] + +# Power-of-two compaction results +comp_sizes = np.array([64, 256, 1024, 4096, 16384, 65536, + 262144, 1048576, 4194304, 16777216, 67108864]) +cpu_no_scan = [0.000205, 0.000573, 0.001745, 0.006417, 0.024499, + 0.103375, 0.412454, 2.178310, 7.599850, 28.617200, 113.149000] +cpu_with_scan = [0.000428, 0.000715, 0.003360, 0.012418, 0.052908, + 0.264867, 0.881458, 5.862320, 22.914500, 85.529100, 427.137000] +gpu_compact = [0.079904, 0.073536, 0.084352, 0.097120, 0.183584, + 0.616352, 1.587170, 8.874690, 21.235400, 65.237300, 253.671000] + +# ------------------------- +# Plot Scan Results +# ------------------------- +plt.figure(figsize=(8,6)) +plt.plot(scan_sizes, cpu_scan, marker='o', label="CPU Scan") +plt.plot(scan_sizes, naive_scan, marker='o', label="Naive GPU Scan") +plt.plot(scan_sizes, eff_scan, marker='o', label="Work-Efficient GPU Scan") +plt.plot(scan_sizes, thrust_scan, marker='o', label="Thrust Scan") + +plt.xscale("log") +plt.yscale("log") +plt.xlabel("Array Size (N)") +plt.ylabel("Time (ms)") +plt.title("Scan Performance") +plt.legend() +plt.grid(True, which="both", ls="--", linewidth=0.5) +plt.tight_layout() + +# For Scan plot +plt.gca().xaxis.set_major_formatter(formatter) +# plt.gca().yaxis.set_major_formatter(formatter) +plt.ticklabel_format(style='plain', axis='x') +# plt.ticklabel_format(style='plain', axis='y') +plt.savefig("scan_performance.png", dpi=200) + +# ------------------------- +# Plot Compaction Results +# ------------------------- +plt.figure(figsize=(8,6)) +plt.plot(comp_sizes, cpu_no_scan, marker='o', label="CPU Compact (No Scan)") +plt.plot(comp_sizes, cpu_with_scan, marker='o', label="CPU Compact (With Scan)") +plt.plot(comp_sizes, gpu_compact, marker='o', label="Work-Efficient GPU Compact") + +plt.xscale("log") +plt.yscale("log") +plt.xlabel("Array Size (N)") +plt.ylabel("Time (ms)") +plt.title("Compaction Performance") +plt.legend() +plt.grid(True, which="both", ls="--", linewidth=0.5) +plt.tight_layout() + +# For Compaction plot +plt.gca().xaxis.set_major_formatter(formatter) +# plt.gca().yaxis.set_major_formatter(formatter) +plt.ticklabel_format(style='plain', axis='x') +# plt.ticklabel_format(style='plain', axis='y') +plt.savefig("compaction_performance.png", dpi=200) + +plt.show() diff --git a/src/main.cpp b/src/main.cpp deleted file mode 100644 index 3d5c8820..00000000 --- a/src/main.cpp +++ /dev/null @@ -1,154 +0,0 @@ -/** - * @file main.cpp - * @brief Stream compaction test program - * @authors Kai Ninomiya - * @date 2015 - * @copyright University of Pennsylvania - */ - -#include -#include -#include -#include -#include -#include "testing_helpers.hpp" - -const int SIZE = 1 << 8; // feel free to change the size of array -const int NPOT = SIZE - 3; // Non-Power-Of-Two -int *a = new int[SIZE]; -int *b = new int[SIZE]; -int *c = new int[SIZE]; - -int main(int argc, char* argv[]) { - // Scan tests - - printf("\n"); - printf("****************\n"); - printf("** SCAN TESTS **\n"); - printf("****************\n"); - - genArray(SIZE - 1, a, 50); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - // initialize b using StreamCompaction::CPU::scan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::scan is correct. - // At first all cases passed because b && c are all zeroes. - zeroArray(SIZE, b); - printDesc("cpu scan, power-of-two"); - StreamCompaction::CPU::scan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(SIZE, b, true); - - zeroArray(SIZE, c); - printDesc("cpu scan, non-power-of-two"); - StreamCompaction::CPU::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("naive scan, power-of-two"); - StreamCompaction::Naive::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - /* For bug-finding only: Array of 1s to help find bugs in stream compaction or scan - onesArray(SIZE, c); - printDesc("1s array for finding bugs"); - StreamCompaction::Naive::scan(SIZE, c, a); - printArray(SIZE, c, true); */ - - zeroArray(SIZE, c); - printDesc("naive scan, non-power-of-two"); - StreamCompaction::Naive::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Naive::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, power-of-two"); - StreamCompaction::Efficient::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient scan, non-power-of-two"); - StreamCompaction::Efficient::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, power-of-two"); - StreamCompaction::Thrust::scan(SIZE, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(SIZE, c, true); - printCmpResult(SIZE, b, c); - - zeroArray(SIZE, c); - printDesc("thrust scan, non-power-of-two"); - StreamCompaction::Thrust::scan(NPOT, c, a); - printElapsedTime(StreamCompaction::Thrust::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(NPOT, c, true); - printCmpResult(NPOT, b, c); - - printf("\n"); - printf("*****************************\n"); - printf("** STREAM COMPACTION TESTS **\n"); - printf("*****************************\n"); - - // Compaction tests - - genArray(SIZE - 1, a, 4); // Leave a 0 at the end to test that edge case - a[SIZE - 1] = 0; - printArray(SIZE, a, true); - - int count, expectedCount, expectedNPOT; - - // initialize b using StreamCompaction::CPU::compactWithoutScan you implement - // We use b for further comparison. Make sure your StreamCompaction::CPU::compactWithoutScan is correct. - zeroArray(SIZE, b); - printDesc("cpu compact without scan, power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(SIZE, b, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedCount = count; - printArray(count, b, true); - printCmpLenResult(count, expectedCount, b, b); - - zeroArray(SIZE, c); - printDesc("cpu compact without scan, non-power-of-two"); - count = StreamCompaction::CPU::compactWithoutScan(NPOT, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - expectedNPOT = count; - printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - zeroArray(SIZE, c); - printDesc("cpu compact with scan"); - count = StreamCompaction::CPU::compactWithScan(SIZE, c, a); - printElapsedTime(StreamCompaction::CPU::timer().getCpuElapsedTimeForPreviousOperation(), "(std::chrono Measured)"); - printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, power-of-two"); - count = StreamCompaction::Efficient::compact(SIZE, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedCount, b, c); - - zeroArray(SIZE, c); - printDesc("work-efficient compact, non-power-of-two"); - count = StreamCompaction::Efficient::compact(NPOT, c, a); - printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)"); - //printArray(count, c, true); - printCmpLenResult(count, expectedNPOT, b, c); - - system("pause"); // stop Win32 console from closing on exit - delete[] a; - delete[] b; - delete[] c; -} diff --git a/src/testing_helpers.hpp b/src/testing_helpers.hpp deleted file mode 100644 index 025e94aa..00000000 --- a/src/testing_helpers.hpp +++ /dev/null @@ -1,76 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - -template -int cmpArrays(int n, T *a, T *b) { - for (int i = 0; i < n; i++) { - if (a[i] != b[i]) { - printf(" a[%d] = %d, b[%d] = %d\n", i, a[i], i, b[i]); - return 1; - } - } - return 0; -} - -void printDesc(const char *desc) { - printf("==== %s ====\n", desc); -} - -template -void printCmpResult(int n, T *a, T *b) { - printf(" %s \n", - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); -} - -template -void printCmpLenResult(int n, int expN, T *a, T *b) { - if (n != expN) { - printf(" expected %d elements, got %d\n", expN, n); - } - printf(" %s \n", - (n == -1 || n != expN) ? "FAIL COUNT" : - cmpArrays(n, a, b) ? "FAIL VALUE" : "passed"); -} - -void zeroArray(int n, int *a) { - for (int i = 0; i < n; i++) { - a[i] = 0; - } -} - -void onesArray(int n, int *a) { - for (int i = 0; i < n; i++) { - a[i] = 1; - } -} - -void genArray(int n, int *a, int maxval) { - srand(time(nullptr)); - - for (int i = 0; i < n; i++) { - a[i] = rand() % maxval; - } -} - -void printArray(int n, int *a, bool abridged = false) { - printf(" [ "); - for (int i = 0; i < n; i++) { - if (abridged && i + 2 == 15 && n > 16) { - i = n - 2; - printf("... "); - } - printf("%3d ", a[i]); - } - printf("]\n"); -} - -template -void printElapsedTime(T time, std::string note = "") -{ - std::cout << " elapsed time: " << time << "ms " << note << std::endl; -} diff --git a/stream_compaction/CMakeLists.txt b/stream_compaction/CMakeLists.txt index 19511caa..834446d4 100644 --- a/stream_compaction/CMakeLists.txt +++ b/stream_compaction/CMakeLists.txt @@ -1,32 +1,39 @@ -set(headers - "common.h" - "cpu.h" - "naive.h" - "efficient.h" - "thrust.h" - ) - -set(sources - "common.cu" - "cpu.cu" - "naive.cu" - "efficient.cu" - "thrust.cu" - ) - -list(SORT headers) -list(SORT sources) - -source_group(Headers FILES ${headers}) -source_group(Sources FILES ${sources}) - -add_library(stream_compaction ${sources} ${headers}) -if(CMAKE_VERSION VERSION_LESS "3.23.0") - set_target_properties(stream_compaction} PROPERTIES CUDA_ARCHITECTURES OFF) -elseif(CMAKE_VERSION VERSION_LESS "3.24.0") - set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES all-major) -else() - set_target_properties(stream_compaction PROPERTIES CUDA_ARCHITECTURES native) -endif() -target_compile_options(stream_compaction PRIVATE "$<$,$>:-G;-src-in-ptx>") -target_compile_options(stream_compaction PRIVATE "$<$,$>:-lineinfo;-src-in-ptx>") +find_package(CUDAToolkit REQUIRED) + +set(LIB_HEADERS + "common.h" + "cpu.h" + "naive.h" + "efficient.h" + "thrust_wrapper.h" + "radix.h" + "shared.h" +) + +set(LIB_SOURCES + "common.cu" + "cpu.cu" + "naive.cu" + "efficient.cu" + "thrust_wrapper.cu" + "radix.cu" + "shared.cu" +) + +add_library(${PROJECT_NAME} SHARED ${LIB_SOURCES} ${LIB_HEADERS}) + +target_include_directories(${PROJECT_NAME} PUBLIC + "$" # one directory up +) + +find_package(CCCL REQUIRED CONFIG) +target_link_libraries(${PROJECT_NAME} PUBLIC CCCL::CCCL CUDA::cudart) + +set_target_properties(${PROJECT_NAME} PROPERTIES CUDA_ARCHITECTURES "native") + +target_compile_options(${PROJECT_NAME} PRIVATE "$<$,$>:-G;-src-in-ptx;--extended-lambda>") + +target_compile_options(${PROJECT_NAME} PRIVATE "$<$,$>:-lineinfo;--extended-lambda>") + +# create a namespaced alias to distinguish what is actually linked against +add_library(${PROJECT_NAME}::${PROJECT_NAME} ALIAS ${PROJECT_NAME}) \ No newline at end of file diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d630..e900cd17 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -1,39 +1,60 @@ #include "common.h" -void checkCUDAErrorFn(const char *msg, const char *file, int line) { +void check_cuda_error_fn(const char* msg, const char* file, int line) +{ cudaError_t err = cudaGetLastError(); - if (cudaSuccess == err) { - return; - } + if (cudaSuccess == err) return; fprintf(stderr, "CUDA error"); - if (file) { - fprintf(stderr, " (%s:%d)", file, line); - } + + if (file) fprintf(stderr, " (%s:%d)", file, line); + fprintf(stderr, ": %s: %s\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } +namespace stream_compaction::common +{ -namespace StreamCompaction { - namespace Common { +__device__ int kernel_compute_global_index_1d() +{ return static_cast((blockIdx.x * blockDim.x) + threadIdx.x); } - /** - * Maps an array to an array of 0s and 1s for stream compaction. Elements - * which map to 0 will be removed, and elements which map to 1 will be kept. - */ - __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { - // TODO - } +/** + * Maps an array to an array of 0s and 1s for stream compaction. Elements + * which map to 0 will be removed, and elements which map to 1 will be kept. + */ +__global__ void kernel_map_to_boolean(int n, const int* idata, int* out_bools) +{ + int index = kernel_compute_global_index_1d(); - /** - * Performs scatter on an array. That is, for each element in idata, - * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. - */ - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices) { - // TODO - } + if (index >= n) return; - } + out_bools[index] = idata[index] == 0 ? 0 : 1; } + +__global__ void kernel_scatter(int n, const int* bools, const int* indices, const int* idata, + int* odata) +{ + int index = kernel_compute_global_index_1d(); + + if (index >= n) return; + + if (bools[index] == 1) odata[indices[index]] = idata[index]; +} + +__global__ void kernel_inclusive_to_exclusive(int n, int identity, const int* idata, int* odata) +{ + int index = kernel_compute_global_index_1d(); + + if (index >= n) return; + + if (index == 0) odata[index] = identity; + else odata[index] = idata[index - 1]; +} + +__global__ void kernel_set_device_array_value(int* arr, int index, int value) +{ + arr[index] = value; // round up to nearest power of two +} + +} // namespace stream_compaction::common diff --git a/stream_compaction/common.h b/stream_compaction/common.h index d2c1fed9..36010b69 100644 --- a/stream_compaction/common.h +++ b/stream_compaction/common.h @@ -3,130 +3,182 @@ #include #include -#include -#include -#include -#include #include #include #define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) -#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__) +#define CUDA_CHECK(call) \ + do \ + { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) \ + { \ + fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \ + cudaGetErrorString(err)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#define CUDA_KERNEL_CHECK() \ + do \ + { \ + CUDA_CHECK(cudaGetLastError()); \ + } while (0) + +constexpr int kBLOCK_SIZE = 128; + +// Check for CUDA errors; print and exit if there was a problem. +void check_cuda_error_fn(const char* msg, const char* file = nullptr, int line = -1); + +inline int divup(int size, int div) +{ return (size + div - 1) / div; } + +inline int ilog2(int x) +{ + int lg = 0; + while (x >>= 1) + ++lg; + return lg; +} + +// calculates smallest possible integer k such that 2^k >= x +// subtracts x from 1 in the case that we already have a power of 2 +inline int ilog2_ceil(int x) +{ return x == 1 ? 0 : ilog2(x - 1) + 1; } + +namespace stream_compaction::common +{ +__device__ int kernel_compute_global_index_1d(); + +__global__ void kernel_map_to_boolean(int n, const int* idata, int* out_bools); /** - * Check for CUDA errors; print and exit if there was a problem. + * Performs scatter on an array. That is, for each element in idata, + * if bools[idx] == 1, it copies idata[idx] to odata[indices[idx]]. */ -void checkCUDAErrorFn(const char *msg, const char *file = NULL, int line = -1); +__global__ void kernel_scatter(int n, const int* bools, const int* indices, const int* idata, + int* odata); -inline int ilog2(int x) { - int lg = 0; - while (x >>= 1) { - ++lg; +__global__ void kernel_inclusive_to_exclusive(int n, int identity, const int* idata, int* odata); + +__global__ void kernel_set_device_array_value(int* arr, int index, int value); + +enum class eTimerDevice +{ + CPU, + GPU +}; + +/** + * This class is used for timing the performance + * Uncopyable and unmovable + * + * Adapted from WindyDarian(https://github.com/WindyDarian) + */ +class PerformanceTimer +{ +public: + bool cpu_timer_started = false; + bool gpu_timer_started = false; + + PerformanceTimer() + { + cudaEventCreate(&_event_start); + cudaEventCreate(&_event_end); } - return lg; -} -inline int ilog2ceil(int x) { - return x == 1 ? 0 : ilog2(x - 1) + 1; -} + ~PerformanceTimer() + { + cudaEventDestroy(_event_start); + cudaEventDestroy(_event_end); + } -namespace StreamCompaction { - namespace Common { - __global__ void kernMapToBoolean(int n, int *bools, const int *idata); + template + void start_timer() + { + if constexpr (Device == eTimerDevice::CPU) + { + if (cpu_timer_started) throw std::runtime_error("CPU timer already started"); + cpu_timer_started = true; - __global__ void kernScatter(int n, int *odata, - const int *idata, const int *bools, const int *indices); + _time_start_cpu = std::chrono::high_resolution_clock::now(); + } + else + { + if (gpu_timer_started) throw std::runtime_error("GPU timer already started"); + gpu_timer_started = true; - /** - * This class is used for timing the performance - * Uncopyable and unmovable - * - * Adapted from WindyDarian(https://github.com/WindyDarian) - */ - class PerformanceTimer + cudaEventRecord(_event_start); + } + } + + template + void end_timer() + { + if constexpr (Device == eTimerDevice::CPU) + { + _time_end_cpu = std::chrono::high_resolution_clock::now(); + + if (!cpu_timer_started) throw std::runtime_error("CPU timer not started"); + + std::chrono::duration duro = _time_end_cpu - _time_start_cpu; + _prev_elapsed_time_cpu_milliseconds + = static_cast(duro.count()); + + cpu_timer_started = false; + } + else { - public: - PerformanceTimer() - { - cudaEventCreate(&event_start); - cudaEventCreate(&event_end); - } - - ~PerformanceTimer() - { - cudaEventDestroy(event_start); - cudaEventDestroy(event_end); - } - - void startCpuTimer() - { - if (cpu_timer_started) { throw std::runtime_error("CPU timer already started"); } - cpu_timer_started = true; - - time_start_cpu = std::chrono::high_resolution_clock::now(); - } - - void endCpuTimer() - { - time_end_cpu = std::chrono::high_resolution_clock::now(); - - if (!cpu_timer_started) { throw std::runtime_error("CPU timer not started"); } - - std::chrono::duration duro = time_end_cpu - time_start_cpu; - prev_elapsed_time_cpu_milliseconds = - static_cast(duro.count()); - - cpu_timer_started = false; - } - - void startGpuTimer() - { - if (gpu_timer_started) { throw std::runtime_error("GPU timer already started"); } - gpu_timer_started = true; - - cudaEventRecord(event_start); - } - - void endGpuTimer() - { - cudaEventRecord(event_end); - cudaEventSynchronize(event_end); - - if (!gpu_timer_started) { throw std::runtime_error("GPU timer not started"); } - - cudaEventElapsedTime(&prev_elapsed_time_gpu_milliseconds, event_start, event_end); - gpu_timer_started = false; - } - - float getCpuElapsedTimeForPreviousOperation() //noexcept //(damn I need VS 2015 - { - return prev_elapsed_time_cpu_milliseconds; - } - - float getGpuElapsedTimeForPreviousOperation() //noexcept - { - return prev_elapsed_time_gpu_milliseconds; - } - - // remove copy and move functions - PerformanceTimer(const PerformanceTimer&) = delete; - PerformanceTimer(PerformanceTimer&&) = delete; - PerformanceTimer& operator=(const PerformanceTimer&) = delete; - PerformanceTimer& operator=(PerformanceTimer&&) = delete; - - private: - cudaEvent_t event_start = nullptr; - cudaEvent_t event_end = nullptr; - - using time_point_t = std::chrono::high_resolution_clock::time_point; - time_point_t time_start_cpu; - time_point_t time_end_cpu; - - bool cpu_timer_started = false; - bool gpu_timer_started = false; - - float prev_elapsed_time_cpu_milliseconds = 0.f; - float prev_elapsed_time_gpu_milliseconds = 0.f; - }; + cudaEventRecord(_event_end); + cudaEventSynchronize(_event_end); + + if (!gpu_timer_started) throw std::runtime_error("GPU timer not started"); + + cudaEventElapsedTime(&_prev_elapsed_time_gpu_milliseconds, _event_start, _event_end); + gpu_timer_started = false; + } } -} + + template + inline void flush() const + { + float elapsed_time; + const char* timer_device_string; + if constexpr (Device == eTimerDevice::CPU) + { + elapsed_time = _prev_elapsed_time_cpu_milliseconds; + timer_device_string = "CPU"; + } + else + { + elapsed_time = _prev_elapsed_time_gpu_milliseconds; + timer_device_string = "GPU"; + } + printf("\tELAPSED TIME: %fms (%s)\n", elapsed_time, timer_device_string); + } + + template + [[nodiscard]] float get_elapsed_time_for_previous_operation() const + { + if constexpr (Device == eTimerDevice::CPU) return _prev_elapsed_time_cpu_milliseconds; + else return _prev_elapsed_time_gpu_milliseconds; + } + + // remove copy and move functions + PerformanceTimer(const PerformanceTimer&) = delete; + PerformanceTimer(PerformanceTimer&&) = delete; + PerformanceTimer& operator=(const PerformanceTimer&) = delete; + PerformanceTimer& operator=(PerformanceTimer&&) = delete; + +private: + cudaEvent_t _event_start = nullptr; + cudaEvent_t _event_end = nullptr; + + using time_point_t = std::chrono::high_resolution_clock::time_point; + time_point_t _time_start_cpu; + time_point_t _time_end_cpu; + + float _prev_elapsed_time_cpu_milliseconds = 0.f; + float _prev_elapsed_time_gpu_milliseconds = 0.f; +}; +} // namespace stream_compaction::common diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa115..c5f019a3 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -1,50 +1,85 @@ -#include #include "cpu.h" #include "common.h" -namespace StreamCompaction { - namespace CPU { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } +namespace stream_compaction::cpu +{ +using enum common::eTimerDevice; +using common::PerformanceTimer; - /** - * CPU scan (prefix sum). - * For performance analysis, this is supposed to be a simple for loop. - * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan in this function first. - */ - void scan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - } +PerformanceTimer& get_timer() +{ + static PerformanceTimer timer; + return timer; +} - /** - * CPU stream compaction without using the scan function. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithoutScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; - } +/** + * CPU scan (prefix sum). + * For performance analysis, this is supposed to be a simple for loop. + * (Optional) For better understanding before starting moving to GPU, you can simulate your GPU scan + * in this function first. + */ +void scan(int n, const int* idata, int* odata) +{ + get_timer().start_timer(); + + int prev_sum = 0; // save prev sum for access ease + for (int j = 0; j < n; j++) + { + odata[j] = prev_sum; + prev_sum += idata[j]; + } + + get_timer().end_timer(); +} + +/** + * CPU stream compaction without using the scan function. + * + * @returns the number of elements remaining after compaction. + */ +int compact_without_scan(int n, const int* idata, int* odata) +{ + get_timer().start_timer(); + + int out_index = 0; // pointer to current progress in out array - /** - * CPU stream compaction using scan and scatter, like the parallel version. - * - * @returns the number of elements remaining after compaction. - */ - int compactWithScan(int n, int *odata, const int *idata) { - timer().startCpuTimer(); - // TODO - timer().endCpuTimer(); - return -1; + for (int i = 0; i < n; i++) + { + int in_val = idata[i]; + if (in_val != 0) + { + odata[out_index] = in_val; + out_index++; } } + + get_timer().end_timer(); + return out_index; +} + +/** + * CPU stream compaction using scan and scatter, like the parallel version. + * + * @returns the number of elements remaining after compaction. + */ +int compact_with_scan(int n, const int* idata, int* odata) +{ + get_timer().start_timer(); + + int* is_not_zero = new int[n]; + int* scan_is_not_zero = new int[n]; + + for (int i = 0; i < n; i++) + is_not_zero[i] = idata[i] != 0 ? 1 : 0; // val is 1 at i if idata[i] != 0, else 0 + + scan(n, is_not_zero, scan_is_not_zero); // scan result is index in final array + + for (int i = 0; i < n; i++) + if (is_not_zero[i]) odata[scan_is_not_zero[i]] = idata[i]; + + get_timer().end_timer(); + + return scan_is_not_zero[n - 1] + is_not_zero[n - 1]; // due to exclusive scan } +} // namespace stream_compaction::cpu diff --git a/stream_compaction/cpu.h b/stream_compaction/cpu.h index 873c0476..7d17e93e 100644 --- a/stream_compaction/cpu.h +++ b/stream_compaction/cpu.h @@ -2,14 +2,13 @@ #include "common.h" -namespace StreamCompaction { - namespace CPU { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace stream_compaction::cpu +{ +stream_compaction::common::PerformanceTimer& get_timer(); - void scan(int n, int *odata, const int *idata); +void scan(int n, const int* idata, int* odata); - int compactWithoutScan(int n, int *odata, const int *idata); +int compact_without_scan(int n, const int* idata, int* odata); - int compactWithScan(int n, int *odata, const int *idata); - } -} +int compact_with_scan(int n, const int* idata, int* odata); +} // namespace stream_compaction::cpu diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346ee..b4b6833a 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,38 +3,206 @@ #include "common.h" #include "efficient.h" -namespace StreamCompaction { - namespace Efficient { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } - - /** - * Performs stream compaction on idata, storing the result into odata. - * All zeroes are discarded. - * - * @param n The number of elements in idata. - * @param odata The array into which to store elements. - * @param idata The array of elements to compact. - * @returns The number of elements remaining after compaction. - */ - int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - return -1; - } +namespace stream_compaction::efficient +{ + +using enum common::eTimerDevice; +using common::PerformanceTimer; + +PerformanceTimer& efficient::get_timer() +{ + static PerformanceTimer timer; + return timer; +} + +__global__ void kernel_efficient_up_sweep(int padded_n, int stride, int prev_stride, int* scan) +{ + int stride_idx = common::kernel_compute_global_index_1d(); // 0, 1, 2, 3... (like normal) + // but this is not target elem index + + int stride_start = stride_idx * stride; // index where this stride starts + + // last index in stride. accumulated value of stride always goes here + int accumulator_idx = stride_start + stride - 1; + + if (accumulator_idx >= padded_n) return; + + int accumulator = scan[accumulator_idx]; // pre-fetch accumulator's value + + // this new stride has swallowed two strides total + // siblingIdx is the index of the other stride that now no longer exists + int sibling_idx = stride_start + prev_stride - 1; // doesn't depend on accumulator + + scan[accumulator_idx] = accumulator + scan[sibling_idx]; +} + +__global__ void kernel_efficient_down_sweep(int padded_n, int stride, + int next_stride, // nextStride == (stride / 2) + int* scan) +{ + int stride_idx = common::kernel_compute_global_index_1d(); + + int stride_start = stride_idx * stride; + + int right_child_idx = stride_start + stride - 1; + if (right_child_idx >= padded_n) return; + + int right_child = scan[right_child_idx]; + + // leftChild and rightChild are nextSTRIDE indices apart + int left_child_idx = stride_start + next_stride - 1; + int left_child = scan[left_child_idx]; // does not depend on first memory read + + // give left child right child's value + // its value has not changed since the end of upsweep + // it has it easier than right child. + // on this update it now has accumulated vals of all strides of size next_stride, besides its own + scan[left_child_idx] = right_child; // depends on first read, but not second + + // right child currently contains accumulated vals of all strides of size stride besodes its own + // adding the left child, which only contains values of one stride of size next_stride + // means that right child now also has accumulated vals of all strides of size next_stride + // besides its own (same status as left_child) + scan[right_child_idx] = right_child + left_child; // memory writes do not depend on each other + + // summary: at each layer, the updated elements get the value of all strides of size next_stride + // besides its own. + // so when next_stride == 1, then this element is done, and so are our iterations +} + +/* + the inner operation of scan without timers and allocation. + note: dev_scan should be pre-allocated to the padded power of two size +*/ +void scan(int n, const int block_size, int* dev_scan) +{ + int num_layers = ilog2_ceil(n); + int padded_n = 1 << num_layers; // pad to nearest power of 2 + + int prev_stride = 1; // 1, 2, 4, 8, ... n/2 + int stride = 2; // essentially the amount of indices that are accumulated into 1 at this iter + // 2, 4, 8, ... n + for (int iter = 0; iter < num_layers; iter++) + { + // n/2, n/4, n/8, ... 1 + int blocks = divup(padded_n >> (iter + 1), kBLOCK_SIZE); + kernel_efficient_up_sweep<<>>(padded_n, stride, prev_stride, dev_scan); + CUDA_KERNEL_CHECK(); + + prev_stride = stride; + stride = stride <<= 1; + } + + // set last value of dev_scan to 0 + int replacement = 0; + CUDA_CHECK( + cudaMemcpy(&dev_scan[padded_n - 1], &replacement, sizeof(int), cudaMemcpyHostToDevice)); + + stride = static_cast(padded_n); // n, n/2, n/4, ... 2 + int next_stride = stride >> 1; // n/2, n/4, ... 1 + for (int iter = num_layers; iter > 0; iter--) + { + int blocks = divup(padded_n >> iter, kBLOCK_SIZE); + kernel_efficient_down_sweep<<>>(padded_n, stride, next_stride, + dev_scan); + CUDA_KERNEL_CHECK(); + + stride = next_stride; + next_stride >>= 1; // n/2, n/4, n/8, n/16, ... + } +} + +// Performs prefix-sum (aka scan) on idata, storing the result into odata. +void scan_wrapper(int n, const int* idata, int* odata) +{ + int padded_n = 1 << ilog2_ceil(n); + + // create two device arrays + int* dev_scan; + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_scan), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMemcpy(dev_scan, idata, sizeof(int) * n, cudaMemcpyHostToDevice)); + + cudaDeviceSynchronize(); + + bool using_timer = false; + if (!get_timer().gpu_timer_started) // added in order to call `scan` from other functions. + { + get_timer().start_timer(); + using_timer = true; } + + scan(n, kBLOCK_SIZE, dev_scan); + + if (using_timer) get_timer().end_timer(); + + CUDA_CHECK(cudaMemcpy(odata, dev_scan, sizeof(int) * n, cudaMemcpyDeviceToHost)); + + cudaFree(dev_scan); // can't forget memory leaks! +} + +/** + * Performs stream compaction on idata, storing the result into odata. + * All zeroes are discarded. + * + * @param n The number of elements in idata. + * @param odata The array into which to store elements. + * @param idata The array of elements to compact. + * @returns The number of elements remaining after compaction. + */ +int compact(int n, int* odata, const int* idata) +{ + // TODO: these arrays are unnecessary. will optimize soon. + + // create device arrays + int* dev_idata; + int* dev_odata; + + int* dev_bools; + int* dev_indices; + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_idata), sizeof(int) * n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_odata), sizeof(int) * n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_bools), sizeof(int) * n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_indices), sizeof(int) * n)); + + CUDA_CHECK(cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice)); + + CUDA_CHECK(cudaMemcpy(dev_bools, odata, sizeof(int) * n, cudaMemcpyHostToDevice)); + + cudaDeviceSynchronize(); + + int* indices = new int[n]; // create cpu side indices array + int* bools = new int[n]; + + get_timer().start_timer(); + + int blocks = divup(n, kBLOCK_SIZE); + + // reuse dev_idata for bools + common::kernel_map_to_boolean<<>>(n, dev_idata, dev_bools); + + CUDA_CHECK(cudaMemcpy(bools, dev_bools, sizeof(int) * n, cudaMemcpyDeviceToHost)); + + scan_wrapper(n, indices, bools); + + CUDA_CHECK(cudaMemcpy(dev_indices, indices, sizeof(int) * n, cudaMemcpyHostToDevice)); + + common::kernel_scatter<<>>(n, dev_bools, dev_indices, dev_idata, dev_odata); + + get_timer().end_timer(); + + CUDA_CHECK(cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost)); + + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + + return indices[n - 1] + bools[n - 1]; } +} // namespace stream_compaction::efficient diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h index 803cb4fe..4b469875 100644 --- a/stream_compaction/efficient.h +++ b/stream_compaction/efficient.h @@ -2,12 +2,19 @@ #include "common.h" -namespace StreamCompaction { - namespace Efficient { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace stream_compaction::efficient +{ +common::PerformanceTimer& get_timer(); - void scan(int n, int *odata, const int *idata); +__global__ void kernel_efficient_up_sweep(int padded_n, int stride, int prev_stride, int* scan); - int compact(int n, int *odata, const int *idata); - } -} +__global__ void kernel_efficient_down_sweep(int padded_n, int stride, + int next_stride, // next_stride == (stride / 2) + int* scan); + +void scan(int n, int block_size, int* dev_scan); + +void scan_wrapper(int n, const int* idata, int* odata); + +int compact(int n, const int* idata, int* odata); +} // namespace stream_compaction::efficient diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 43088769..aa41ff2e 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -3,23 +3,69 @@ #include "common.h" #include "naive.h" -namespace StreamCompaction { - namespace Naive { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - // TODO: __global__ - - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO - timer().endGpuTimer(); - } +namespace stream_compaction::naive +{ +using enum common::eTimerDevice; +using common::PerformanceTimer; + +PerformanceTimer& get_timer() +{ + static PerformanceTimer timer; + return timer; +} + +__global__ void kernel_hillis_steele_scan(int n, int stride, const int* idata, int* odata) +{ + int index = common::kernel_compute_global_index_1d(); + + if (index >= n) return; + + if (index < stride) odata[index] = idata[index]; + else odata[index] = idata[index - stride] + idata[index]; +} + +void scan(int n, int block_size, int*& dev_idata, int*& dev_odata) +{ + get_timer().start_timer(); + + int blocks = divup(n, block_size); + + for (int iter = 1; iter <= ilog2_ceil(n); ++iter) + { + int stride = 1 << (iter - 1); + kernel_hillis_steele_scan<<>>(n, stride, dev_idata, dev_odata); + CUDA_KERNEL_CHECK(); + + // ping-pong. latest data always ends up in `dev_idata` + std::swap(dev_idata, dev_odata); } + + // convert to an exclusive kernel + common::kernel_inclusive_to_exclusive<<>>(n, 0, dev_idata, dev_odata); + CUDA_KERNEL_CHECK(); + + get_timer().end_timer(); +} + +void scan_wrapper(int n, int block_size, const int* idata, int* odata) +{ + // create two device arrays to ping-pong between + int* dev_idata; + int* dev_odata; + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_idata), sizeof(int) * n)); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_odata), sizeof(int) * n)); + + CUDA_CHECK(cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice)); + + cudaDeviceSynchronize(); + + scan(n, block_size, dev_idata, dev_odata); + + CUDA_CHECK(cudaMemcpy(odata, dev_odata, sizeof(int) * n, cudaMemcpyDeviceToHost)); + + // free memory + cudaFree(dev_idata); + cudaFree(dev_odata); } +} // namespace stream_compaction::naive diff --git a/stream_compaction/naive.h b/stream_compaction/naive.h index 37dcb064..3883e70b 100644 --- a/stream_compaction/naive.h +++ b/stream_compaction/naive.h @@ -2,10 +2,15 @@ #include "common.h" -namespace StreamCompaction { - namespace Naive { - StreamCompaction::Common::PerformanceTimer& timer(); +namespace stream_compaction::naive +{ +common::PerformanceTimer& get_timer(); - void scan(int n, int *odata, const int *idata); - } -} +__global__ void kernel_hillis_steele_scan(int n, int stride, const int* idata, int* odata); + +// pass in references to pointers so that ping-pong propogates to original pointers +void scan(int n, int block_size, int*& dev_idata, int*& dev_odata); + +// copies host `idata` to device and copies device `dev_odata` back to host after +void scan_wrapper(int n, int block_size, const int* idata, int* odata); +} // namespace stream_compaction::naive diff --git a/stream_compaction/radix.cu b/stream_compaction/radix.cu new file mode 100644 index 00000000..9d79da75 --- /dev/null +++ b/stream_compaction/radix.cu @@ -0,0 +1,220 @@ +#include "radix.h" + +#include +#include + +#include "common.h" +#include "shared.h" + +namespace stream_compaction::radix +{ + +using enum common::eTimerDevice; +using common::PerformanceTimer; + +PerformanceTimer& get_timer() +{ + static PerformanceTimer timer; + return timer; +} + +__device__ __host__ int kernel_isolate_bit(int n, int target_bit) +{ return (n >> target_bit) & 1; } + +__global__ void kernel_split(int n, int target_bit, const int* idata, int* out_not_lsb) +{ + int index = common::kernel_compute_global_index_1d(); + + if (index >= n) return; + + out_not_lsb[index] = kernel_isolate_bit(idata[index], target_bit) ^ 1; +} + +__global__ void kernel_compute_scatter_indices(int n, const int target_bit, const int* scan, + const int* idata, int* indices) +{ + int index = common::kernel_compute_global_index_1d(); + + if (index >= n) return; + + __shared__ int total_falses; + if (threadIdx.x == 0) + total_falses = (kernel_isolate_bit(idata[n - 1], target_bit) ^ 1) + scan[n - 1]; + + __syncthreads(); // wait for total_falses + + // if value is 1, we shift right by total falses minus falses before current index + // if value is 0, we set to position based on how many other falses / 0s come before it + indices[index] = kernel_isolate_bit(idata[index], target_bit) + ? index + (total_falses - scan[index]) + : scan[index]; +} + +__global__ void kernel_scatter(int n, const int* indices, const int* idata, int* odata) +{ + int index = common::kernel_compute_global_index_1d(); + + if (index >= n) return; + + int address = indices[index]; + odata[address] = idata[index]; // Scatter the value to its new position +} + +void sort(int n, int max_bit_length, int block_size, int* dev_block_sums, int* dev_indices, + int* dev_idata, int* dev_odata) +{ + for (int target_bit = 0; target_bit < max_bit_length; target_bit++) + { + int blocks = divup(n, block_size); + + // Split data into 0s and 1s based on the target bit + kernel_split<<>>(n, target_bit, dev_idata, dev_odata); + + // Perform scan on the split results + shared::scan(n, block_size, dev_block_sums, dev_odata, dev_odata); + + // Scatter data based on the split results + kernel_compute_scatter_indices<<>>(n, target_bit, dev_odata, dev_idata, + dev_indices); + + kernel_scatter<<>>(n, dev_indices, dev_idata, dev_odata); + + // Swap buffers (ping-pong) + int* temp = dev_idata; + dev_idata = dev_odata; + dev_odata = temp; + } +} + +void sort_wrapper(int n, int max_bit_length, int block_size, const int* idata, int* odata) +{ + const int padded_n = 1 << ilog2_ceil(n); + const int block_sums = divup(padded_n, 2 * block_size); + + // Allocate device memory for input/output data and scan + int* dev_idata; + int* dev_odata; + int* dev_block_sums; + int* dev_indices; + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_idata), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_odata), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_block_sums), sizeof(int) * block_sums)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_indices), sizeof(int) * n)); + + // Copy input data to device + CUDA_CHECK(cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice)); + + bool using_timer = false; + if (!get_timer().gpu_timer_started) + { + get_timer().start_timer(); + using_timer = true; + } + + sort(n, max_bit_length, block_size, dev_block_sums, dev_indices, dev_idata, dev_odata); + + if (using_timer) get_timer().end_timer(); + + // Copy sorted data back to host + CUDA_CHECK(cudaMemcpy(odata, dev_idata, sizeof(int) * n, cudaMemcpyDeviceToHost)); + + // Free device memory + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_block_sums); + cudaFree(dev_indices); +} + +void sort_by_key(int n, int max_bit_length, int block_size, int* dev_block_sums, int* dev_indices, + int* dev_ikeys, int* dev_okeys, int* dev_ivalues, int* dev_ovalues) +{ + for (int target_bit = 0; target_bit < max_bit_length; target_bit++) + { + int blocks = divup(n, block_size); + + // Split data into 0s and 1s based on the target bit + kernel_split<<>>(n, target_bit, dev_ikeys, dev_okeys); + + // Perform scan on the split results + shared::scan(n, block_size, dev_block_sums, dev_okeys, dev_okeys); + + // Scatter data based on the split results + kernel_compute_scatter_indices<<>>(n, target_bit, dev_okeys, dev_ikeys, + dev_indices); + + kernel_scatter<<>>(n, dev_indices, dev_ikeys, dev_okeys); + kernel_scatter<<>>(n, dev_indices, dev_ivalues, dev_ovalues); + + // Swap buffers (ping-pong) + int* temp = dev_ikeys; + dev_ikeys = dev_okeys; + dev_okeys = temp; + + temp = dev_ivalues; + dev_ivalues = dev_ovalues; + dev_ovalues = temp; + } +} + +void sort_by_key_wrapper(int n, int max_bit_length, int block_size, const int* ikeys, + const int* ivalues, int* okeys, int* ovalues) +{ + const int padded_n = 1 << ilog2_ceil(n); + const int block_sums = divup(padded_n, 2 * block_size); + + // Allocate device memory for input/output data and scan + int* dev_ikeys; + int* dev_okeys; + int* dev_ivalues; + int* dev_ovalues; + + int* dev_block_sums; + int* dev_indices; + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_ikeys), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_okeys), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_ivalues), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_ovalues), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_block_sums), sizeof(int) * block_sums)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_indices), sizeof(int) * n)); + + // Copy input data to device + CUDA_CHECK(cudaMemcpy(dev_ikeys, ikeys, sizeof(int) * n, cudaMemcpyHostToDevice)); + + CUDA_CHECK(cudaMemcpy(dev_ivalues, ivalues, sizeof(int) * n, cudaMemcpyHostToDevice)); + + bool using_timer = false; + if (!get_timer().gpu_timer_started) + { + get_timer().start_timer(); + using_timer = true; + } + + sort_by_key(n, max_bit_length, block_size, dev_block_sums, dev_indices, dev_ikeys, dev_okeys, + dev_ivalues, dev_ovalues); + + if (using_timer) get_timer().end_timer(); + + // Copy sorted data back to host + CUDA_CHECK(cudaMemcpy(okeys, dev_ikeys, sizeof(int) * n, cudaMemcpyDeviceToHost)); + + CUDA_CHECK(cudaMemcpy(ovalues, dev_ivalues, sizeof(int) * n, cudaMemcpyDeviceToHost)); + + // Free device memory + cudaFree(dev_ikeys); + cudaFree(dev_okeys); + cudaFree(dev_ivalues); + cudaFree(dev_ovalues); + cudaFree(dev_block_sums); + cudaFree(dev_indices); +} +} // namespace stream_compaction::radix diff --git a/stream_compaction/radix.h b/stream_compaction/radix.h new file mode 100644 index 00000000..dba481e6 --- /dev/null +++ b/stream_compaction/radix.h @@ -0,0 +1,32 @@ +#pragma once + +#include "common.h" + +namespace stream_compaction +{ +namespace radix +{ +common::PerformanceTimer& get_timer(); + +__device__ __host__ int kernel_isolate_bit(int n, int target_bit); + +__global__ void kernel_split(int n, int target_bit, const int* idata, int* out_not_lsb); + +__global__ void kernel_compute_scatter_indices(int n, int target_bit, const int* scan, + const int* idata, int* out_indices); + +__global__ void kernel_scatter(int n, const int* addresses, const int* idata, int* odata); + +void sort(int n, int max_bit_length, int block_size, int* dev_block_sums, int* dev_indices, + int* dev_idata, int* dev_odata); + +void sort_wrapper(int n, int max_bit_length, int block_size, const int* idata, int* odata); + +void sort_by_key(int n, int max_bit_length, int block_size, int* dev_block_sums, int* dev_indices, + int* dev_ikeys, int* dev_okeys, int* dev_ivalues, int* dev_ovalues); + +void sort_by_key_wrapper(int n, int max_bit_length, int block_size, const int* ikeys, + const int* ivalues, int* okeys, int* ovalues); + +} // namespace radix +} // namespace stream_compaction diff --git a/stream_compaction/shared.cu b/stream_compaction/shared.cu new file mode 100644 index 00000000..e1991f8e --- /dev/null +++ b/stream_compaction/shared.cu @@ -0,0 +1,336 @@ +#include +#include +#include "common.h" +#include "shared.h" + +namespace stream_compaction::shared +{ + +using enum common::eTimerDevice; +using common::PerformanceTimer; + +PerformanceTimer& shared::get_timer() +{ + static PerformanceTimer timer; + return timer; +} + +__device__ __host__ int kernel_offset(int idx) +{ return idx + CONFLICT_FREE_OFFSET(idx); } + +__global__ void kernel_scan_intra_block_shared(int padded_n, const int* idata, int* out_block_sums, + int* odata) +{ + extern __shared__ int mat[]; + + const int tile_size = static_cast(blockDim.x * 2); + + const int tid = static_cast(threadIdx.x); + + int block_offset = static_cast((blockIdx.x * blockDim.x) * 2); + int thread_offset = 2 * tid; // first index this thread is responsible for + + int global_thread_idx = block_offset + thread_offset; + + // global memory is read from in coalesced fashion + // ensure some threads do not return early without zero-padding the shared matrix + mat[kernel_offset(thread_offset)] = (global_thread_idx < padded_n) + ? idata[block_offset + thread_offset] + : 0; + mat[kernel_offset(thread_offset + 1)] = (global_thread_idx + 1 < padded_n) + ? idata[block_offset + thread_offset + 1] + : 0; + + // which stride each child is reponsible for -- constant per thread + // in reality, it is one stride higher than expected, but that's due to -1 + const int stride_idx_first_child = thread_offset + 1; + const int stride_idx_second_child = thread_offset + 2; + + int stride = 1; // 1, 2, 4, 8, 16, 32, ... tileSize + // activeThreads: n/2, n/4, n/8, ... 1 + for (int active_threads = tile_size >> 1; active_threads > 0; active_threads >>= 1) + { + __syncthreads(); + + if (tid < active_threads) + { + int first_idx = stride_idx_first_child * stride - 1; + int second_idx = stride_idx_second_child * stride - 1; + + mat[kernel_offset(second_idx)] += mat[kernel_offset(first_idx)]; + } + stride *= 2; + } + + __syncthreads(); + + if (tid == 0) + { + out_block_sums[blockIdx.x] + = mat[kernel_offset(tile_size - 1)]; // write accumulated val of block + mat[kernel_offset(tile_size - 1)] = 0; // clear last element + } + + for (int active_threads = 1; active_threads < tile_size; active_threads <<= 1) + { + stride >>= 1; // STRIDE ended at tileSize + __syncthreads(); + + if (tid < active_threads) + { + int first_idx = stride_idx_first_child * stride - 1; + int second_idx = stride_idx_second_child * stride - 1; + + first_idx = kernel_offset(first_idx); + second_idx = kernel_offset(second_idx); + int temp = mat[first_idx]; + mat[first_idx] = mat[second_idx]; + mat[second_idx] += temp; + } + } + + __syncthreads(); // this time the last `__syncthreads()` wasn't called + + if (global_thread_idx < padded_n) + odata[block_offset + thread_offset] = mat[kernel_offset(thread_offset)]; + if (global_thread_idx + 1 < padded_n) + odata[block_offset + thread_offset + 1] = mat[kernel_offset(thread_offset + 1)]; +} + +__global__ void kernel_add_block_sums(int n, const int* in_block_sums, int* odata) +{ + __shared__ int block_offset; + + if (threadIdx.x == 0) block_offset = in_block_sums[blockIdx.x]; + + __syncthreads(); + + int index = common::kernel_compute_global_index_1d(); + + if (index >= n) return; // safe to return now + + odata[index] += block_offset; +} + +/* + the inner operation of scan without timers and allocation. + note: dev_scan should be pre-allocated to the padded power of two size +*/ +void scan(int n, int block_size, int* dev_block_sums, const int* dev_idata, int* dev_odata) +{ + int padded_n = 1 << ilog2_ceil(n); // pad to nearest power of 2 + + const int block_span = block_size * 2; + // perform scan on the block level + int num_blocks = divup(padded_n, block_span); + + // numBlocks, numThreads, shared mem size + kernel_scan_intra_block_shared<<>>(padded_n, dev_idata, + dev_block_sums, + dev_odata); + CUDA_KERNEL_CHECK(); + + if (num_blocks > 1) + { + // Allocate temporary buffer for recursive scan of block sums + int* dev_new_o_data; + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_new_o_data), sizeof(int) * num_blocks)); + + // Recursively scan the block sums + scan(num_blocks, block_size, dev_block_sums, dev_block_sums, dev_new_o_data); + + // Add the recursively scanned block sums to the output + kernel_add_block_sums<<>>(padded_n, dev_new_o_data, dev_odata); + + // Free the temporary buffer + cudaFree(dev_new_o_data); + } +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan_wrapper(int n, int* odata, const int* idata) +{ + int padded_n = 1 << ilog2_ceil(n); + + int total_blocks = divup(padded_n, 2 * kBLOCK_SIZE); + + // create two device arrays + int* dev_idata; + int* dev_odata; + int* dev_block_sums; + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_idata), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_odata), sizeof(int) * padded_n)); + + // create new array to store total sum of each block + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_block_sums), sizeof(int) * total_blocks)); + + CUDA_CHECK(cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice)); + + cudaDeviceSynchronize(); + + bool using_timer = false; + if (!get_timer().gpu_timer_started) // added in order to call `scan` from other functions. + { + get_timer().start_timer(); + using_timer = true; + } + + scan(n, kBLOCK_SIZE, dev_block_sums, dev_idata, dev_odata); + + if (using_timer) get_timer().end_timer(); + + CUDA_CHECK(cudaMemcpy(odata, dev_odata, sizeof(int) * n, + cudaMemcpyDeviceToHost)); // only copy n elements + + cudaFree(dev_idata); // can't forget memory leaks! + cudaFree(dev_odata); + cudaFree(dev_block_sums); +} + +int compact(int n, int block_size, const int* dev_idata, int* dev_bools, int* dev_indices, + int* dev_block_sums, int* dev_odata) +{ + int blocks = divup(n, block_size); + + common::kernel_map_to_boolean<<>>(n, dev_idata, dev_bools); + + scan(n, block_size, dev_block_sums, dev_bools, dev_indices); + + common::kernel_scatter<<>>(n, dev_bools, dev_indices, dev_idata, dev_odata); + + int last_index; + int last_bool; + + CUDA_CHECK(cudaMemcpy(&last_index, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&last_bool, &dev_bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost)); + + return last_index + last_bool; +} + +/** + * Performs stream compaction on idata, storing the result into odata. + * Returns the number of surviving elements (i.e. non-zero). + */ +int compact_wrapper(int n, const int* idata, int* odata) +{ + int padded_n = 1 << ilog2_ceil(n); // pad to nearest power of 2 + int total_blocks = divup(padded_n, 2 * kBLOCK_SIZE); // for scan block sums + + // Allocate device arrays + int* dev_idata; + int* dev_odata; + int* dev_bools; + int* dev_indices; + int* dev_block_sums; + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_idata), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_odata), sizeof(int) * padded_n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_bools), sizeof(int) * n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_indices), sizeof(int) * n)); + + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_block_sums), sizeof(int) * total_blocks)); + + // Copy input data from host to device + CUDA_CHECK(cudaMemcpy(dev_idata, idata, sizeof(int) * n, cudaMemcpyHostToDevice)); + + // Run the compaction and time it + bool using_timer = false; + if (!get_timer().gpu_timer_started) + { + get_timer().start_timer(); + using_timer = true; + } + + int compact_count = compact(n, kBLOCK_SIZE, dev_idata, dev_bools, dev_indices, dev_block_sums, + dev_odata); + + if (using_timer) get_timer().end_timer(); + + // Copy the compacted result back to host; note that compactCount elements are valid + CUDA_CHECK(cudaMemcpy(odata, dev_odata, sizeof(int) * compact_count, cudaMemcpyDeviceToHost)); + + // Free device memory + cudaFree(dev_idata); + cudaFree(dev_odata); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_block_sums); + + return compact_count; +} + +int compact_by_key(int n, int block_size, const int* dev_idata, const int* dev_ivalues, + int* dev_indices, int* dev_block_sums, int* dev_bools, int* dev_odata, + int* dev_ovalues) +{ + int blocks = divup(n, block_size); + + common::kernel_map_to_boolean<<>>(n, dev_idata, dev_bools); + + scan(n, block_size, dev_block_sums, dev_bools, dev_indices); + + common::kernel_scatter<<>>(n, dev_bools, dev_indices, dev_idata, dev_odata); + + cudaDeviceSynchronize(); + + common::kernel_scatter<<>>(n, dev_bools, dev_indices, dev_ivalues, + dev_ovalues); + + int last_index; + int last_bool; + + CUDA_CHECK(cudaMemcpy(&last_index, &dev_indices[n - 1], sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&last_bool, &dev_bools[n - 1], sizeof(int), cudaMemcpyDeviceToHost)); + + return last_index + last_bool; +} + +int compact_by_key_wrapper(int n, const int* ikeys, const int* ivalues, int* okeys, int* ovalues) +{ + int *dev_ivalues, *dev_ovalues; + int *dev_ikeys, *dev_okeys, *dev_bools, *dev_indices, *dev_block_sums; + + // Allocate device memory + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_ivalues), n * sizeof(int))); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_ovalues), n * sizeof(int))); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_ikeys), n * sizeof(int))); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_okeys), n * sizeof(int))); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_bools), n * sizeof(int))); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_indices), n * sizeof(int))); + + int blocks = divup(n, kBLOCK_SIZE); + CUDA_CHECK(cudaMalloc(reinterpret_cast(&dev_block_sums), blocks * sizeof(int))); + + // Copy input data from host to device. + CUDA_CHECK(cudaMemcpy(dev_ivalues, ivalues, n * sizeof(int), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(dev_ikeys, ikeys, n * sizeof(int), cudaMemcpyHostToDevice)); + + // Call the templated device function from shared.h. + // (This kernel launches both key and value scatter) + int count = compact_by_key(n, kBLOCK_SIZE, dev_bools, dev_okeys, dev_ivalues, dev_ovalues, + dev_ikeys, dev_block_sums, dev_indices); + + // Copy compacted results back to host. + CUDA_CHECK(cudaMemcpy(ovalues, dev_ovalues, count * sizeof(int), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(okeys, dev_okeys, count * sizeof(int), cudaMemcpyDeviceToHost)); + + // Free device memory. + cudaFree(dev_ivalues); + cudaFree(dev_ovalues); + cudaFree(dev_ikeys); + cudaFree(dev_okeys); + cudaFree(dev_bools); + cudaFree(dev_indices); + cudaFree(dev_block_sums); + + return count; +} +} // namespace stream_compaction::shared diff --git a/stream_compaction/shared.h b/stream_compaction/shared.h new file mode 100644 index 00000000..2e21dde7 --- /dev/null +++ b/stream_compaction/shared.h @@ -0,0 +1,40 @@ +#pragma once + +#include "common.h" + +#define NUM_BANKS 32 +#define LOG_NUM_BANKS 5 +#define CONFLICT_FREE_OFFSET(n) ((n) >> NUM_BANKS + (n) >> (2 * LOG_NUM_BANKS)) + +namespace stream_compaction::shared +{ + +common::PerformanceTimer& get_timer(); + +__device__ __host__ int kernel_offset(int idx); + +__global__ void kernel_scan_intra_block_shared(int padded_n, const int* idata, int* out_block_sums, + int* odata); + +__global__ void kernel_add_block_sums(int n, const int* in_block_sums, int* odata); + +void scan(int n, int block_size, int* dev_block_sums, const int* dev_idata, int* dev_odata); + +void scan_wrapper(int n, int* odata, const int* idata); + +int compact(int n, int block_size, const int* dev_idata, int* dev_bools, int* dev_indices, + int* dev_block_sums, int* dev_odata); + +int compact_wrapper(int n, const int* idata, int* odata); + +int compact_by_key(int n, int block_size, const int* dev_idata, const int* dev_ivalues, + int* dev_indices, int* dev_block_sums, int* dev_bools, int* dev_odata, + int* dev_ovalues); + +// Host wrapper for compact_by_key. It accepts host arrays for values and keys. +// For example, ivalues and ikeys are input arrays, +// ovalues and okeys will receive the compacted results. +// The function returns the number of surviving (compacted) elements. +int compact_by_key_wrapper(int n, const int* ikeys, const int* ivalues, int* okeys, int* ovalues); + +} // namespace stream_compaction::shared diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu deleted file mode 100644 index 1def45e7..00000000 --- a/stream_compaction/thrust.cu +++ /dev/null @@ -1,28 +0,0 @@ -#include -#include -#include -#include -#include -#include "common.h" -#include "thrust.h" - -namespace StreamCompaction { - namespace Thrust { - using StreamCompaction::Common::PerformanceTimer; - PerformanceTimer& timer() - { - static PerformanceTimer timer; - return timer; - } - /** - * Performs prefix-sum (aka scan) on idata, storing the result into odata. - */ - void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); - // TODO use `thrust::exclusive_scan` - // example: for device_vectors dv_in and dv_out: - // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); - timer().endGpuTimer(); - } - } -} diff --git a/stream_compaction/thrust.h b/stream_compaction/thrust.h deleted file mode 100644 index fe98206b..00000000 --- a/stream_compaction/thrust.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once - -#include "common.h" - -namespace StreamCompaction { - namespace Thrust { - StreamCompaction::Common::PerformanceTimer& timer(); - - void scan(int n, int *odata, const int *idata); - } -} diff --git a/stream_compaction/thrust_wrapper.cu b/stream_compaction/thrust_wrapper.cu new file mode 100644 index 00000000..b10ac57b --- /dev/null +++ b/stream_compaction/thrust_wrapper.cu @@ -0,0 +1,122 @@ +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "common.h" +#include "thrust_wrapper.h" + +namespace stream_compaction::thrust_wrapper +{ +using enum common::eTimerDevice; +using common::PerformanceTimer; + +using thrust::host_vector; + +PerformanceTimer& get_timer() +{ + static PerformanceTimer timer; + return timer; +} + +/** + * Performs prefix-sum (aka scan) on idata, storing the result into odata. + */ +void scan(int n, const int* idata, int* odata) +{ + // Copy data from host to device + thrust::host_vector host_idata(idata, idata + n); // thrust host vector + thrust::device_vector dev_idata = host_idata; // built-in assignment conversion + thrust::device_vector dev_odata(n); // for output + + get_timer().start_timer(); + + thrust::exclusive_scan(dev_idata.begin(), dev_idata.end(), dev_odata.begin()); + + get_timer().end_timer(); + + // copy result back to host + thrust::copy(dev_odata.begin(), dev_odata.end(), odata); +} + +void radix_sort(int n, const int* idata, int* odata) +{ + thrust::device_vector dev_copy(idata, idata + n); + + bool using_timer = false; + if (!get_timer().gpu_timer_started) + { + get_timer().start_timer(); + using_timer = true; + } + + thrust::sort(dev_copy.begin(), dev_copy.end()); + + if (using_timer) get_timer().end_timer(); + + thrust::copy(dev_copy.begin(), dev_copy.end(), odata); +} + +void radix_sort_by_key(int n, const int* ikeys, const int* ivalues, int* okeys, int* ovalues) +{ + // Wrap raw pointers with Thrust device pointers + thrust::device_vector dev_ikeys(ikeys, ikeys + n); + thrust::device_vector dev_ivalues(ivalues, ivalues + n); + + bool using_timer = false; + if (!get_timer().gpu_timer_started) + { + get_timer().start_timer(); + using_timer = true; + } + + // Sort keys and reorder values accordingly + thrust::sort_by_key(dev_ikeys.begin(), dev_ikeys.end(), dev_ivalues.begin()); + + if (using_timer) get_timer().end_timer(); + + // Copy sorted keys and values back to host + thrust::copy(dev_ikeys.begin(), dev_ikeys.end(), okeys); + thrust::copy(dev_ivalues.begin(), dev_ivalues.end(), ovalues); +} + +/** + * Stream compaction by key using Thrust. + * Given n elements in in_keys and in_values, copies each (key,value) pair + * for which the key is nonzero into out_keys and out_values. + * Returns the number of surviving elements. + */ +int compact_by_key(int n, const int* ikeys, const float* ivalues, int* okeys, float* ovalues) +{ + // Wrap raw input arrays into Thrust device vectors. + thrust::device_vector dev_ikeys(ikeys, ikeys + n); + thrust::device_vector dev_ivalues(ivalues, ivalues + n); + + // Create a zipped iterator over (key, value) + auto zipped_begin = thrust::make_zip_iterator( + thrust::make_tuple(dev_ikeys.begin(), dev_ivalues.begin())); + auto zipped_end = thrust::make_zip_iterator( + thrust::make_tuple(dev_ikeys.end(), dev_ivalues.end())); + + // Call remove_if: it shifts surviving elements to the front. + // Remove pairs if key == 0. + auto new_end = thrust::remove_if(zipped_begin, zipped_end, + [] __device__(const thrust::tuple& tup) + { return thrust::get<0>(tup) == 0; }); + + // Compute the new count. + int count = static_cast(thrust::get<0>(new_end - zipped_begin)); + + // Copy the surviving keys and values back to host memory. + thrust::copy(dev_ikeys.begin(), dev_ikeys.begin() + count, okeys); + thrust::copy(dev_ivalues.begin(), dev_ivalues.begin() + count, ovalues); + + return count; +} + +} // namespace stream_compaction::thrust_wrapper diff --git a/stream_compaction/thrust_wrapper.h b/stream_compaction/thrust_wrapper.h new file mode 100644 index 00000000..b8750cae --- /dev/null +++ b/stream_compaction/thrust_wrapper.h @@ -0,0 +1,23 @@ +#pragma once + +#include "common.h" + +namespace stream_compaction::thrust_wrapper +{ +struct IsNonZero +{ + __host__ __device__ bool operator()(const int x) const + { return x != 0; } +}; + +common::PerformanceTimer& get_timer(); + +void scan(int n, const int* idata, int* odata); + +void radix_sort(int n, const int* idata, int* odata); + +void radix_sort_by_key(int n, const int* ikeys, const int* ivalues, int* okeys, int* ovalues); + +int compact_by_key(int n, const int* ikeys, const float* ivalues, int* okeys, float* ovalues); + +} // namespace stream_compaction::thrust_wrapper diff --git a/tests/main.cpp b/tests/main.cpp new file mode 100644 index 00000000..c2c2c5d1 --- /dev/null +++ b/tests/main.cpp @@ -0,0 +1,43 @@ +#include + +#include +#include + +void get_device_properties() +{ + int device_count; + cudaError_t err = cudaGetDeviceCount(&device_count); + if (err != cudaSuccess) + fprintf(stderr, "Failed to get device count: %s\n", cudaGetErrorString(err)); + + if (device_count == 0) fprintf(stderr, "No CUDA-capable devices found.\n"); + + for (int i = 0; i < device_count; ++i) + { + cudaDeviceProp device_prop{}; + err = cudaGetDeviceProperties(&device_prop, i); + if (err != cudaSuccess) + { + fprintf(stderr, "Failed to get properties for device %i: %s\n", i, + cudaGetErrorString(err)); + continue; + } + + printf("DEVICE %i PROPERTIES:\n", i); + printf("Name: %s\n", device_prop.name); + printf("Total Global Memory: %zu bytes\n", device_prop.totalGlobalMem); + printf("Compute Capability: %i.%i\n", device_prop.major, device_prop.minor); + printf("Number of Multiprocessors: %i\n", device_prop.multiProcessorCount); + printf("shared Memory Per Block: %zu bytes\n", device_prop.sharedMemPerBlock); + printf("Registers Per Block: %i\n", device_prop.regsPerBlock); + printf("Warp Size: %i\n\n", device_prop.warpSize); + } +} + +int main(int argc, char** argv) +{ + get_device_properties(); + + testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/tests/radix_sort_tests.cpp b/tests/radix_sort_tests.cpp new file mode 100644 index 00000000..e69de29b diff --git a/tests/scan_tests.cpp b/tests/scan_tests.cpp new file mode 100644 index 00000000..e6884dc9 --- /dev/null +++ b/tests/scan_tests.cpp @@ -0,0 +1,60 @@ +#include + +#include "test_utils.h" + +#include +#include + +using namespace stream_compaction; +using enum common::eTimerDevice; + +class ScanTest : public testing::TestWithParam +{ +protected: + void SetUp() override + { + int n = GetParam(); + _source.resize(n, -1); + _expected.resize(n, -1); + _actual.resize(n, -1); + fill_container_random(_source, n); + } + + std::vector _source{}; + std::vector _expected{}; + std::vector _actual{}; +}; + +constexpr int kNUM_POT = 24; +constexpr std::array kPOT_VALUES = [] +{ + std::array arr{}; + for (int i = 0; i < kNUM_POT; ++i) + arr[i] = 1 << i; + return arr; +}(); + +constexpr int kNUM_NPOT = kNUM_POT - 2; // '1' and '2' will not have an NPOT test case +constexpr std::array kNPOT_VALUES = [] +{ + std::array arr{}; + for (int i = 0, pot = 2; i < kNUM_NPOT; ++i, ++pot) + arr[i] = (1 << pot) - 3; + return arr; +}(); + +INSTANTIATE_TEST_SUITE_P(PowersOfTwo, ScanTest, testing::ValuesIn(kPOT_VALUES), + testing::PrintToStringParamName()); + +INSTANTIATE_TEST_SUITE_P(NonPowersOfTwo, ScanTest, testing::ValuesIn(kNPOT_VALUES), + testing::PrintToStringParamName()); + +TEST_P(ScanTest, naiveScan) +{ + cpu::scan(GetParam(), _source.data(), _expected.data()); + cpu::get_timer().flush(); + naive::scan_wrapper(GetParam(), kBLOCK_SIZE, _source.data(), _actual.data()); + naive::get_timer().flush(); + + ASSERT_EQ(_expected, _actual); +} diff --git a/tests/stream_compaction_tests.cpp b/tests/stream_compaction_tests.cpp new file mode 100644 index 00000000..e69de29b diff --git a/tests/test_utils.h b/tests/test_utils.h new file mode 100644 index 00000000..c89e9e13 --- /dev/null +++ b/tests/test_utils.h @@ -0,0 +1,45 @@ +#pragma once + +#include +#include + +constexpr int kSIZE = 1 << 24; +constexpr int kNPOT = kSIZE - 3; // non-power-of-two + +#define POW2(x) (1 << (x)) + +template +concept ContainerIntegral + = std::ranges::contiguous_range // is contiguous + && std::integral> // has element of integral type + && std::is_standard_layout_v> // has C-compatible memory layout + && requires(T t) { + { t.data() } -> std::same_as*>; // has `data()` method + { t.size() } -> std::convertible_to; // has `size()` method + }; + +template +void fill_container_random(T& ctn, int max_val) +{ + using Element = std::ranges::range_value_t; + + std::random_device rd; + std::mt19937 gen(rd()); // initialize Mersenne Twister engine + std::uniform_int_distribution distrib(0, max_val); + + for (int i = 0; i < ctn.size(); ++i) + ctn[i] = distrib(gen); +} + +template +void print_container(const T& ctn, bool abridged = true) +{ + int n = static_cast(ctn.size()); + int max_size = abridged ? std::min(n, 16) : n; + + std::cout << '\t' << "[ "; + for (int i = 0; i < max_size; ++i) + printf("%i ", static_cast(ctn[i])); + if (abridged && max_size < n) std::cout << "..."; + printf(" ] - count: %i\n", n); +} diff --git a/tests/testing_helpers.h b/tests/testing_helpers.h new file mode 100644 index 00000000..1d7fc187 --- /dev/null +++ b/tests/testing_helpers.h @@ -0,0 +1,87 @@ +#pragma once + +#include +#include +#include +#include + +constexpr int kSIZE = 1 << 24; +constexpr int kNPOT = kSIZE - 3; // Non-Power-Of-Two + +constexpr char kPINK[] = "\033[1;35m"; +constexpr char kRED[] = "\033[1;31m"; +constexpr char kGREEN[] = "\033[1;32m"; +constexpr char kRESET[] = "\033[0m"; + +template +inline bool cmp_arrays(int n, T* a, T* b) +{ + for (int i = 0; i < n; ++i) + { + if (a[i] != b[i]) + { + printf(" a[%i] = %i, b[%i] = %i\n", i, static_cast(a[i]), i, + static_cast(b[i])); + + return false; + } + } + return true; +} + +inline void print_desc(const char* desc) +{ std::cout << kPINK << "=== " << desc << " ===" << kRESET << std::endl; } + +template +inline void print_cmp_result(int n, T* a, T* b) +{ + if (cmp_arrays(n, a, b)) std::cout << kRED << "FAILED"; + else std::cout << kGREEN << "PASSED"; + std::cout << kRESET << std::endl; +} + +inline void zero_array(int n, int* a) +{ + for (int i = 0; i < n; ++i) + a[i] = 0; +} + +inline void ones_array(int n, int* a) +{ + for (int i = 0; i < n; ++i) + a[i] = 1; +} + +template +inline void gen_array(int n, T* a, int max_val) +{ + std::random_device rd; + std::mt19937 gen(rd()); // initialize Mersenne Twister engine + std::uniform_int_distribution distrib(0, max_val); + + for (int i = 0; i < n; ++i) + a[i] = distrib(gen); +} + +template +inline void gen_consecutive_array(int n, T* a) +{ + for (int i = 0; i < n; ++i) + a[i] = static_cast(i); +} + +template +inline void copy_array(int n, const T* a, T* out_copy) +{ memcpy(out_copy, a, n * sizeof(T)); } + +template +inline void print_array(int n, T* a, bool abridged = true) +{ + int max_size = abridged ? std::min(n, 16) : n; + + std::cout << '\t' << "[ "; + for (int i = 0; i < max_size; ++i) + printf("%i ", static_cast(a[i])); + if (abridged) std::cout << "..."; + printf(" ] - count: %i\n", n); +}