Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
368 commits
Select commit Hold shift + click to select a range
dde406d
test with graphs
caugonnet Aug 28, 2025
7563014
parametrized tests
caugonnet Aug 28, 2025
ba4e9c3
Merge branch 'main' into stf_c_api
caugonnet Aug 28, 2025
b094c27
test that we get a stream in graph_task when capturing
caugonnet Aug 28, 2025
222c216
Save WIP: add a mockup of FHE example, which needs a like_empty method
caugonnet Aug 28, 2025
b04cebf
Implement like_empty
caugonnet Aug 28, 2025
9ed5ace
More comprehensive FHE test
caugonnet Aug 28, 2025
e27ef5b
test fhe with stf decorator
caugonnet Aug 28, 2025
d0f915e
Merge branch 'main' into stf_c_api
caugonnet Aug 28, 2025
6963ec0
fix merge error
caugonnet Aug 28, 2025
06fab11
Appropriate checks
caugonnet Aug 29, 2025
2fc802e
Add missing ;
caugonnet Aug 29, 2025
a43db62
- Make it possible to create a borrowed context from a handle
caugonnet Aug 29, 2025
9c07679
invert ctx and exec place in the decorator
caugonnet Aug 29, 2025
947bbcc
fix decorator api
caugonnet Aug 29, 2025
22b2d19
Add ciphertext.like_empty()
caugonnet Aug 29, 2025
66bcde3
Removing prints
caugonnet Aug 29, 2025
84534c8
do not import specific methods
caugonnet Aug 29, 2025
acf0cce
fix decorator api
caugonnet Aug 29, 2025
6a6e84f
Add a pytorch experiment
Aug 29, 2025
297a69b
more pytorch test
Aug 29, 2025
533ca5a
better interop with pytorch
Aug 29, 2025
9aa749f
remove useless pass
Aug 29, 2025
b11aa4b
tensor_arguments
Aug 29, 2025
0af151f
simpler code
Aug 29, 2025
746d308
pre-commit hooks
caugonnet Aug 29, 2025
d9195f5
try to remove dependency on torch and have adapters (WIP)
caugonnet Aug 31, 2025
f5ac828
remove unused code
caugonnet Aug 31, 2025
454a5da
cleanups
caugonnet Aug 31, 2025
ccfbb6b
fix numba adapter
caugonnet Aug 31, 2025
c6e7c07
skip torch test if torch is not available
caugonnet Aug 31, 2025
842a651
add dot vertex even in the low level api
caugonnet Aug 31, 2025
00c649c
fix types
caugonnet Aug 31, 2025
b0fc18d
pre-commit hooks
caugonnet Aug 31, 2025
3b257df
Merge branch 'main' into stf_c_api
caugonnet Aug 31, 2025
04cc07a
dot add_vertex is done in start() now
caugonnet Aug 31, 2025
bce25b8
Start to implement the FDTD example in pytorch
caugonnet Sep 1, 2025
d9c5f11
Start to port in STF version of pytorch
caugonnet Sep 1, 2025
70fa5d8
Adapt the FDTD example to use STF constructs and add methods to initi…
caugonnet Sep 1, 2025
5587a8d
format issue
caugonnet Sep 1, 2025
5ea5243
charset issue
caugonnet Sep 1, 2025
f7fbd34
rank agnostic method to init
caugonnet Sep 1, 2025
aec2d71
use .zero_() to blank fields
caugonnet Sep 1, 2025
eb71880
print values
caugonnet Sep 1, 2025
aaf6ec6
Experiment to display output as an image
caugonnet Sep 1, 2025
ae4c6d6
Use non blocking API
caugonnet Sep 2, 2025
9029fda
remove dead code
caugonnet Sep 2, 2025
ce7a33b
remove dead code
caugonnet Sep 2, 2025
cbde742
minor cleanup
caugonnet Sep 2, 2025
1936db6
Merge branch 'main' into stf_c_api
caugonnet Sep 2, 2025
c91e814
clang-format
caugonnet Sep 2, 2025
3fe6178
Add a C library for CUDASTF (to be used in the python bindings)
caugonnet Sep 2, 2025
666bd07
Merge branch 'main' into stf_c_lib
caugonnet Sep 2, 2025
522b630
remove dead code
caugonnet Sep 2, 2025
4315314
do define and use CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING
caugonnet Sep 2, 2025
48627aa
Add CUDASTF C lib to tests
caugonnet Sep 2, 2025
410aadd
Merge branch 'main' into stf_c_lib
caugonnet Sep 2, 2025
c87cdaa
Add missing headers
caugonnet Sep 2, 2025
02a9eb6
use snake_case
caugonnet Sep 2, 2025
232133b
Do define CCCL_C_EXPERIMENTAL=1
caugonnet Sep 2, 2025
b60eb6b
Do not do redundant tests
caugonnet Sep 2, 2025
c4c99f0
Add a project to ci/inspect_changes.sh
caugonnet Sep 2, 2025
2f5925b
missing changes in previous commit
caugonnet Sep 2, 2025
3417075
add presets
caugonnet Sep 2, 2025
8c05034
Add override matrix
alliepiper Sep 2, 2025
20faa8f
Properly define structs with a typedef and remove superfluous struct …
caugonnet Sep 3, 2025
d378f5a
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
8c5e760
fix previous merge
caugonnet Sep 3, 2025
78dc197
Change tensor_arguments to return an element instead of a tuple of on…
caugonnet Sep 3, 2025
2eb2ace
Remove intermediate structures and use opaque pointers instead
caugonnet Sep 3, 2025
6557067
Automatically generated documentation
caugonnet Sep 3, 2025
60266ff
Better implementation of the help to convert C places to the C++ API,…
caugonnet Sep 3, 2025
59f1983
Tell where to find cudax, and remove unnecessary libs
caugonnet Sep 3, 2025
c7fa9e6
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
97dd6f7
CCCL_ENABLE_C enables c/parallel, CCCL_ENABLE_C_EXPERIMENTAL_STF enab…
caugonnet Sep 3, 2025
1610f0b
Remove unnecessary definitions
caugonnet Sep 3, 2025
4383eaf
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
101fd0b
Merge branch 'main' into stf_c_lib
caugonnet Sep 4, 2025
4db210b
Merge branch 'main' into stf_c_lib
caugonnet Sep 5, 2025
90a8d20
use more consistent option names
caugonnet Sep 5, 2025
f2d7528
Merge branch 'main' into stf_c_lib
caugonnet Sep 9, 2025
ac667ca
Do not use [[maybe_unused]] for the C lib header because this is only…
caugonnet Sep 9, 2025
5bf62b3
Return an error code in stf_cuda_kernel_add_desc rather than use asse…
caugonnet Sep 9, 2025
c0a54f1
clang-format
caugonnet Sep 9, 2025
4573f9f
Merge branch 'main' into stf_c_lib
caugonnet Sep 9, 2025
abc58d8
Merge branch 'main' into stf_c_api
caugonnet Sep 9, 2025
af43da5
Merge stf_c_lib: Update c/ directory with complete C library implemen…
caugonnet Sep 9, 2025
c00c915
Revert Python linting changes
caugonnet Sep 9, 2025
cdd0d85
Fix Python CMakeLists.txt: Update C library feature flags
caugonnet Sep 9, 2025
afda29f
Fix Python build: Add missing CCCL_ENABLE_C master flag
caugonnet Sep 9, 2025
4f1f079
Complete STF C library configuration: Enable all C library features a…
caugonnet Sep 9, 2025
ccfc41d
Remove obsolete CCCL_ENABLE_C flag
caugonnet Sep 9, 2025
e4b8277
Update CMake configuration to match stf_c_lib structure
caugonnet Sep 9, 2025
6931fa8
Optimize Python build: Remove unnecessary C parallel library
caugonnet Sep 9, 2025
a1a1139
clang-format
caugonnet Sep 9, 2025
a3071f7
Merge branch 'stf_c_lib' into stf_c_api
caugonnet Sep 9, 2025
ecd9f4e
fix pytorch example
caugonnet Sep 9, 2025
4b2ae75
use ascii symbols
caugonnet Sep 9, 2025
5881081
Merge branch 'main' into stf_c_api
caugonnet Sep 9, 2025
4eef870
Merge branch 'main' into stf_c_api
caugonnet Sep 10, 2025
dcb3d39
Cleanup some changes in the infra from a previous merge
caugonnet Sep 10, 2025
1284eb2
Implement logical_data_empty logical_data_zeros, and logical_data_full
caugonnet Sep 10, 2025
0514f29
short names for torch.cuda
caugonnet Sep 10, 2025
5e9b4d5
Introduce pytorch_task
caugonnet Sep 10, 2025
53a4542
clang-format and some minor comment
caugonnet Sep 10, 2025
989f58b
Merge branch 'main' into stf_c_api
caugonnet Sep 17, 2025
93055c0
Merge branch 'main' into stf_c_api
caugonnet Sep 23, 2025
218fda2
make sure stf python tests are wrapped into functions so that pytest …
caugonnet Sep 25, 2025
1f97482
fix the return values of pytests
caugonnet Sep 25, 2025
1e482a4
Merge branch 'main' into stf_c_api
caugonnet Sep 25, 2025
7a58d68
Start to experiment with Warp
caugonnet Sep 25, 2025
9fb1c26
logical_data in python are now initialized with a data place, and the…
caugonnet Sep 25, 2025
5c1d50e
Save WIP: add access modes
caugonnet Sep 25, 2025
9f31b1e
cleanups
caugonnet Sep 25, 2025
c0bb070
Save WIP
caugonnet Sep 25, 2025
7094dd5
Merge branch 'main' into stf_c_api
caugonnet Oct 7, 2025
76d78b4
Adapt to new python hierarchy
caugonnet Oct 8, 2025
e03b062
Merge branch 'main' into stf_c_api
caugonnet Oct 8, 2025
0c11b6a
fix errors in a previous merge
caugonnet Oct 8, 2025
f6c50e1
cuda.cccl.experimental.stf => cuda.stf
caugonnet Oct 8, 2025
efea184
Misc stf python tests improvements
caugonnet Oct 8, 2025
c0d3592
Save WIP on this warp example
caugonnet Oct 8, 2025
eba61eb
Add sanity checks to test the is_void_interface() API
caugonnet Oct 8, 2025
e17c261
support tokens in python
caugonnet Oct 8, 2025
ec9c955
remove debug print
caugonnet Oct 8, 2025
52f4823
python cholesky with cupy
caugonnet Oct 8, 2025
5a32881
improve cholesky example
caugonnet Oct 8, 2025
abd5778
POTRI and Cholesky
caugonnet Oct 9, 2025
80e1085
clang-format
caugonnet Oct 9, 2025
865cf7b
Merge branch 'main' into stf_c_api
caugonnet Oct 9, 2025
4c1551a
how changes to numba-cuda have been merged
caugonnet Oct 9, 2025
77d6af1
Merge branch 'main' into stf_c_api
caugonnet Nov 14, 2025
acc8f49
Merge branch 'main' into stf_c_api
andralex Nov 14, 2025
de333b2
Fix CI precommit
andralex Nov 14, 2025
3834c8f
Merge branch 'main' into stf_c_api
andralex Nov 15, 2025
9a5c265
no need for numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 anymore
caugonnet Nov 24, 2025
9932a24
Merge origin/main into stf_c_api
caugonnet Nov 24, 2025
e7e2adb
Our numba-cuda fix is part of 0.21.0
caugonnet Nov 24, 2025
39040a9
Minor doc fix
caugonnet Nov 25, 2025
8f27fa2
Ensure matplotlib is only used if available
caugonnet Nov 25, 2025
73ac963
Cleanup examples
caugonnet Nov 25, 2025
d90ed64
cmake fix
caugonnet Nov 25, 2025
eb77519
Cmake fixes (need extra cleanup)
caugonnet Nov 25, 2025
b38ff80
Work-around for lazy resource init during graph capture in cuda core
caugonnet Nov 25, 2025
0a3e667
Use a relaxed capture mode
caugonnet Nov 25, 2025
8642fdd
This work-around is not needed anymore with a relaxed capture mode
caugonnet Nov 25, 2025
2a75766
Merge branch 'main' into stf_c_api
caugonnet Nov 25, 2025
0f9865d
cleanup warp example
caugonnet Nov 25, 2025
6466347
Cleanups in the cython code for STF
caugonnet Nov 25, 2025
cfb2930
no need for math.prod for such a simple thing
caugonnet Nov 26, 2025
130ee2a
Simpler code to handle vector types
caugonnet Nov 26, 2025
4bb4d23
fix grid dimension
caugonnet Nov 26, 2025
b8c745e
Use from_dlpack
caugonnet Nov 26, 2025
fb2a3ba
Change the mock-up FHE toy example to have operations that are homomo…
caugonnet Nov 26, 2025
6c2f850
Merge branch 'main' into stf_c_api
caugonnet Nov 26, 2025
da2e1aa
Add some explanation for the use of a relaxed capture mode
caugonnet Nov 26, 2025
852b400
cleaner pytorch adapter
caugonnet Nov 26, 2025
9308af5
Merge branch 'main' into stf_c_api
caugonnet Nov 27, 2025
09913dc
Code simplification
caugonnet Nov 26, 2025
237b2c1
minor fixes
caugonnet Dec 16, 2025
dd6cc26
Merge branch 'main' into stf_c_api
caugonnet Feb 3, 2026
ac148e8
Merge branch 'main' into stf_c_api
caugonnet Feb 8, 2026
5fedcfb
remove a change from main
caugonnet Feb 9, 2026
1fa449f
Merge branch 'main' into stf_c_api
caugonnet Feb 9, 2026
9839495
avoid a pre-commit fail
caugonnet Feb 9, 2026
65155d1
Include STF python bindings in CI
caugonnet Feb 9, 2026
1cce4d4
Make the script executable
caugonnet Feb 9, 2026
1dbfd64
Disable CUFILE in the python build
caugonnet Feb 9, 2026
5545ffb
Attempt to fix compilation on aarch64
caugonnet Feb 9, 2026
291e00c
fix a type conversion issue
caugonnet Feb 9, 2026
3a12081
Merge branch 'main' into stf_c_api
caugonnet Feb 9, 2026
4b54abc
try to fix python packages
caugonnet Feb 9, 2026
97d9b8b
Merge branch 'main' into stf_c_api
caugonnet Feb 9, 2026
5f950c4
gersemi pre-commit hook
caugonnet Feb 10, 2026
8727b24
Conditionally provide the jit decorator if numba-cuda is available
caugonnet Feb 10, 2026
f4c8800
clang-format
caugonnet Feb 10, 2026
ac980ec
Skip STF with MSVC in CI
caugonnet Feb 11, 2026
4821ebd
Merge branch 'main' into stf_c_api
caugonnet Feb 11, 2026
8559e8c
More consistent examples
caugonnet Feb 12, 2026
698739e
pre-commit hooks
caugonnet Feb 12, 2026
4d73287
Add missing copyrights
caugonnet Feb 12, 2026
97ae928
add missing file
caugonnet Feb 12, 2026
6903af7
like_empty -> empty_like
caugonnet Feb 12, 2026
99655d3
Report if the STF bindings cannot be loaded
caugonnet Feb 12, 2026
096ea44
Avoid a global context variable in fhe tests
caugonnet Feb 12, 2026
08fa67d
support an optional name= field in logical_data init methods to have …
caugonnet Feb 12, 2026
5145dff
more consistent aliases in example
caugonnet Feb 12, 2026
cd51231
Fix cmake message
caugonnet Feb 12, 2026
5245067
Remove commented debug leftovers
caugonnet Feb 12, 2026
c055d52
Merge branch 'main' into stf_c_api
caugonnet Feb 12, 2026
79be7ec
fix string format
caugonnet Feb 12, 2026
606896d
Do not tamper HOST_COMPILER
caugonnet Feb 12, 2026
da0487e
Use the existing mechanism to cleanly exclude the test_py_stf job fro…
caugonnet Feb 12, 2026
0fd485a
Merge branch 'main' into stf_c_api
caugonnet Feb 12, 2026
9d8ed4b
Merge branch 'main' into stf_c_api
caugonnet Feb 13, 2026
d9b1ca5
Merge branch 'main' into stf_c_api
caugonnet Feb 13, 2026
f78290a
Merge branch 'main' into stf_c_api
caugonnet Feb 13, 2026
da27328
Restore C in STF's C lib
caugonnet Feb 13, 2026
fa4cc26
Experiment with composite data places and vmm allocations
caugonnet Feb 13, 2026
b576c35
Ensure a CUDA context exists when creating a localized_array
caugonnet Feb 14, 2026
f5ea347
Merge branch 'main' into stf_composite_places
caugonnet Feb 14, 2026
9b21b71
Some comment to explain why we have get_composite_alloc_registry
caugonnet Feb 14, 2026
8adb830
Simplify how we create and use a grid of green contexts
caugonnet Feb 14, 2026
a40d691
no need for a stream and add a check
caugonnet Feb 14, 2026
8624528
Add a test to use a data_place to create an allocator for thrust_device
caugonnet Feb 14, 2026
4599dcb
clang-format
caugonnet Feb 14, 2026
6493be1
Remove some useless comment
caugonnet Feb 14, 2026
1599b07
fix compilation
caugonnet Feb 14, 2026
f22aa6b
Merge branch 'main' into stf_c_api
caugonnet Feb 23, 2026
e4bafa9
Use cuda.core.Buffer.fill (except for 8 bytes values) instead of cupy…
caugonnet Feb 25, 2026
a0c8227
Move fill utilities
caugonnet Feb 25, 2026
2e63918
Make pytorch_task a free function and move it to the test directory
caugonnet Feb 25, 2026
b21d930
wrappers to build pytorch tensors outside of cuda.stf
caugonnet Feb 25, 2026
de6f2f8
Remove dead code
caugonnet Feb 25, 2026
2f89cc1
Move numba utilities outside of the core cuda.stf
caugonnet Feb 25, 2026
1df9b9b
clang-format
caugonnet Feb 25, 2026
97ef675
Move the jit numba decorator in tests too
caugonnet Feb 25, 2026
16e8eec
Add missing file
caugonnet Feb 25, 2026
9fc3250
Merge branch 'main' into stf_c_api
caugonnet Feb 25, 2026
a516a2e
Only keep a cupy fallback to fill 8bytes values, not both cupy and numba
caugonnet Feb 25, 2026
ae43907
Some details about the stf_cai for CAI v3
caugonnet Feb 25, 2026
64cf200
Use relative paths to fix tests in CI
caugonnet Feb 25, 2026
b7abbac
pre-commit hooks
caugonnet Feb 25, 2026
daae555
Add a doc for cuda.stf
caugonnet Feb 25, 2026
0089958
Ensure cuda.stf is usable
caugonnet Feb 26, 2026
201e198
Merge branch 'main' into stf_c_api
caugonnet Feb 26, 2026
96c2421
Try to fix cuda.stf CI
caugonnet Feb 26, 2026
f6d5c2a
Merge branch 'main' into stf_c_api
caugonnet Feb 26, 2026
664f61d
remove __init__.py from test/stf to avoid confusion between libs
caugonnet Feb 26, 2026
5d735d7
Merge branch 'main' into stf_c_api
caugonnet Feb 26, 2026
5ec3769
Experiments with thrust and memory resources
caugonnet Feb 27, 2026
e267ad0
Merge branch 'main' into stf_composite_places
caugonnet Feb 27, 2026
aab32d3
Add some comments to clarify intents
caugonnet Feb 27, 2026
5f0b044
Merge branch 'main' into stf_c_api
caugonnet Feb 28, 2026
cf7c11e
pre-commit hooks
caugonnet Feb 28, 2026
2570bae
Merge branch 'main' into stf_c_api
caugonnet Mar 2, 2026
62404a6
Merge branch 'main' into stf_c_api
caugonnet Mar 3, 2026
c7b6aab
Merge branch 'main' into stf_c_api
caugonnet Mar 9, 2026
b0df198
Merge branch 'main' into stf_c_api
caugonnet Mar 10, 2026
1f9b89a
There should be no __init__.py file here, otherwise tests becomes a p…
caugonnet Mar 10, 2026
94bdd96
Merge branch 'main' into stf_c_api
caugonnet Mar 10, 2026
832fd76
Disable SASS verification for tests which might generate LDL instruct…
caugonnet Mar 10, 2026
52fb401
Merge branch 'main' into stf_c_api
caugonnet Mar 10, 2026
5156a03
Merge branch 'main' into stf_c_api
caugonnet Mar 10, 2026
83a409e
Merge branch 'stf_c_api' into stf_composite_places
caugonnet Mar 10, 2026
a1aba8e
Update copyright
caugonnet Mar 11, 2026
fea6295
Use the simplified green_context_helper API
caugonnet Mar 11, 2026
a315a13
fix copyright year
caugonnet Mar 11, 2026
f1eabb7
Use the simplified green_context_helper API
caugonnet Mar 11, 2026
80fe8f2
Merge branch 'main' into stf_composite_places
caugonnet Mar 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,8 @@ workflows:
exclude:
# GPU runners are not available on Windows.
- {jobs: ['test', 'test_gpu', 'test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'], cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}
# STF C API and Python bindings are not built for MSVC:
- {jobs: ['test_py_stf'], cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}
# cudax doesn't support C++17 on msvc:
- {project: 'cudax', std: 17, cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}

Expand Down Expand Up @@ -478,6 +480,7 @@ jobs:
test_py_coop: { name: "Test cuda.coop", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
test_py_par: { name: "Test cuda.compute", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
test_py_examples: { name: "Test cuda.cccl.examples", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }
test_py_stf: { name: "Test cuda.stf", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_stf'} }

# Run jobs for 'target' project (ci/util/build_and_test_targets.sh):
run_cpu: { gpu: false }
Expand Down Expand Up @@ -535,7 +538,7 @@ projects:
name: "Python"
job_map:
build: ['build_py_wheel']
test: ['test_py_headers', 'test_py_coop', 'test_py_par', 'test_py_examples']
test: ['test_py_headers', 'test_py_coop', 'test_py_par', 'test_py_examples', 'test_py_stf']
cccl_c_parallel:
name: 'CCCL C Parallel'
stds: [20]
Expand Down
30 changes: 30 additions & 0 deletions ci/test_cuda_stf_python.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Build (or fetch) the cuda_cccl wheel and run the cuda.stf Python test suite.
# Intended to be invoked by CI; also works locally (falls back to building the
# wheel when not running under GitHub Actions).

set -euo pipefail

ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$ci_dir/pyenv_helper.sh"

# Parse common arguments (provides ${py_version})
source "$ci_dir/util/python/common_arg_parser.sh"
parse_python_args "$@"

# Extract the CUDA major version from `nvcc --version` (e.g. "release 12.4, V12.4.131" -> "12")
cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2)

# Setup Python environment
setup_python_env "${py_version}"

# Fetch or build the cuda_cccl wheel:
if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
  wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
  # Quote the artifact name to avoid word splitting / globbing (SC2086).
  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/
else
  "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
fi

# Install cuda_cccl with the test extras matching the detected CUDA major version
CUDA_CCCL_WHEEL_PATH="$(ls /home/coder/cccl/wheelhouse/cuda_cccl-*.whl)"
python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-cu${cuda_major_version}]"

# Run tests for STF module
cd "/home/coder/cccl/python/cuda_cccl/tests/"
python -m pytest -n auto -v stf/
1 change: 1 addition & 0 deletions cudax/examples/stf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set(
08-cub-reduce.cu
axpy-annotated.cu
void_data_interface.cu
thrust_device_data_place_allocator.cu
explicit_data_places.cu
thrust_zip_iterator.cu
1f1b.cu
Expand Down
130 changes: 130 additions & 0 deletions cudax/examples/stf/thrust_device_data_place_allocator.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
*
* @brief Example: Thrust device_vector with an allocator backed by a data_place.
* Uses thrust::mr::memory_resource to wrap data_place, then
* thrust::mr::allocator to create a compatible allocator.
* Storage is allocated via data_place::allocate (device, composite/VMM,
* or other place types). The same Thrust code works unchanged for
* single-device, multi-device (VMM), or green-context placement.
*/

#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/mr/allocator.h>
#include <thrust/mr/memory_resource.h>
#include <thrust/transform.h>

#include <cuda/experimental/__stf/places/blocked_partition.cuh>
#include <cuda/experimental/__stf/places/exec/green_context.cuh>
#include <cuda/experimental/stf.cuh>

#include <iostream>

using namespace cuda::experimental::stf;

// Adapter between STF's data_place abstraction and the thrust::mr::memory_resource
// interface expected by Thrust allocators. Note that the resource must outlive any
// vectors/allocators that refer to it.
class data_place_memory_resource final : public thrust::mr::memory_resource<thrust::device_ptr<void>>
{
public:
  explicit data_place_memory_resource(const data_place& place)
      : place_(place)
  {}

  // Allocate `bytes` bytes of storage from the underlying data_place.
  // The alignment hint is ignored: data_place::allocate decides placement.
  pointer do_allocate(std::size_t bytes, std::size_t /*alignment*/) override
  {
    auto* storage = place_.allocate(static_cast<std::ptrdiff_t>(bytes));
    return thrust::device_ptr<void>(storage);
  }

  // Return storage previously obtained from do_allocate back to the data_place.
  void do_deallocate(pointer p, std::size_t bytes, std::size_t /*alignment*/) override
  {
    place_.deallocate(p.get(), bytes);
  }

  // Two resources compare equal iff they wrap the same data_place, so memory
  // allocated from one may be deallocated through the other.
  bool do_is_equal(const memory_resource& other) const noexcept override
  {
    if (auto* same_kind = dynamic_cast<const data_place_memory_resource*>(&other))
    {
      return place_ == same_kind->place_;
    }
    return false;
  }

private:
  data_place place_;
};

template <typename T>
using data_place_allocator = thrust::mr::allocator<T, data_place_memory_resource>;

// Run the Thrust example with the given data_place; returns true if the check passed.
bool run_with_place(const data_place& place, const char* label)
{
const size_t n = 1024 * 1024;

data_place_memory_resource memres(place);
data_place_allocator<double> alloc(&memres);
thrust::device_vector<double, data_place_allocator<double>> d_vec(n, 0.0, alloc);

thrust::transform(
thrust::device,
thrust::counting_iterator<size_t>(0),
thrust::counting_iterator<size_t>(n),
d_vec.begin(),
[] _CCCL_DEVICE(size_t i) {
return 2.0 * static_cast<double>(i);
});

thrust::host_vector<double> h_sample(4);
thrust::copy(d_vec.begin(), d_vec.begin() + 4, h_sample.begin());

bool ok = (h_sample[0] == 0.0 && h_sample[1] == 2.0 && h_sample[2] == 4.0 && h_sample[3] == 6.0);
if (!ok)
{
std::cerr << "thrust_device_data_place_allocator: " << label << " (" << place.to_string() << "): FAILED\n";
}
return ok;
}

int main()
{
  bool success = true;

  // Plain device placement on device 0.
  success &= run_with_place(data_place::device(0), "device(0)");

  // Composite placement across all devices (uses the VMM path when several
  // devices are present).
  success &= run_with_place(data_place::composite(blocked_partition(), exec_place::all_devices()),
                            "composite(blocked_partition, all_devices)");

#if _CCCL_CTK_AT_LEAST(12, 4)
  // Composite placement over a grid of green contexts, one data place per
  // green context.
  {
    const int num_sms = 8;
    const int dev_id  = 0;
    green_context_helper gc_helper(num_sms, dev_id);
    if (gc_helper.get_count() >= 1)
    {
      auto where          = gc_helper.get_grid(true);
      data_place grid_place = data_place::composite(blocked_partition(), where);
      success &= run_with_place(grid_place, "composite(blocked_partition, green_context_grid)");
    }
  }
#endif

  return success ? 0 : 1;
}
4 changes: 4 additions & 0 deletions cudax/examples/stf/void_data_interface.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,9 @@ int main()
return cuda_kernel_desc{dummy_kernel, 16, 128, 0};
};

EXPECT(token.is_void_interface());
EXPECT(token2.is_void_interface());
EXPECT(token3.is_void_interface());

ctx.finalize();
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <cuda/experimental/__stf/utility/memory.cuh>
#include <cuda/experimental/__stf/utility/traits.cuh>

#include <array>
#include <list>
#include <random>
#include <unordered_map>
Expand Down Expand Up @@ -87,23 +88,24 @@ public:
, data_dims(data_dims)
, elemsize(elemsize)
{
// Ensure a current CUDA context exists so cuCtxGetDevice() and other driver
// APIs succeed (e.g. when no stream_ctx was used or after primary ctx release).
cuda_safe_call(cudaFree(nullptr));

// Regardless of the grid, we allow all devices to access that localized array
const int ndevs = cuda_try<cudaGetDeviceCount>();
CUdevice dev = cuda_try<cuCtxGetDevice>();

/* Check whether the current device supports UVA */
int supportsVMM = cuda_try<cuDeviceGetAttribute>(CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev);
// fprintf(stderr, "VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED ? %d\n", supportsVMM);
EXPECT(supportsVMM == 1, "Cannot create a localized_array object on this machine because it does not support VMM.");

/* Get allocation granularity */

CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = dev};

size_t alloc_granularity_bytes = cuda_try<cuMemGetAllocationGranularity>(&prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
// fprintf(stderr, "GRANULARITY = %ld KB\n", alloc_granularity_bytes / 1024);

// To make our life simpler for now: we assume that we only allocate full blocks
block_size_bytes = alloc_granularity_bytes;
Expand All @@ -117,9 +119,6 @@ public:
// Reserve a range of virtual addresses, round up size to accommodate granularity requirements
cuda_try(cuMemAddressReserve(&base_ptr, vm_total_size_bytes, 0ULL, 0ULL, 0ULL));

// fprintf(stderr, "cuMemAddressReserve => %p + %ld (%ld KB)\n", (void *)base_ptr, vm_total_size_bytes,
// vm_total_size_bytes / 1024);

::std::vector<CUmemAccessDesc> accessDesc(ndevs);
for (int d = 0; d < ndevs; d++)
{
Expand Down Expand Up @@ -219,7 +218,6 @@ public:

// Print visual block map (compact representation)
fprintf(stderr, "\nBlock ownership map (each char = 1 block, 0-9/a-z = place index):\n ");
// Build a map of place names to single-char indices
::std::unordered_map<::std::string, char> place_to_char;
char next_char = '0';
for (size_t i = 0; i < nblocks; i++)
Expand All @@ -245,7 +243,6 @@ public:
}
fprintf(stderr, "\n");

// Print legend
fprintf(stderr, "\n Legend:\n");
for (const auto& entry : place_to_char)
{
Expand All @@ -255,10 +252,7 @@ public:
fprintf(stderr, "==============================================\n\n");
}

// fprintf(stderr, "GOT %ld effective blocks (%ld blocks)\n", nblocks_effective, nblocks);

// Create a physical allocation per block, this is not mapped in
// virtual memory yet.
// Create a physical allocation per block, this is not mapped in virtual memory yet.
for (auto& item : meta)
{
int item_dev = device_ordinal(item.place);
Expand Down Expand Up @@ -289,7 +283,6 @@ public:
}
}
}
// fprintf(stderr, "localized_array (this = %p) : nblocks_effective %ld\n", this, nblocks_effective);
}

localized_array() = delete;
Expand All @@ -300,8 +293,6 @@ public:

~localized_array()
{
// fprintf(stderr, "~localized_array (this = %p) ... base ptr %p vm_total_size_bytes %ld - nblocks_effective
// %ld\n", this, (void *)base_ptr, vm_total_size_bytes, nblocks_effective);
for (auto& item : meta)
{
size_t offset = item.offset;
Expand Down Expand Up @@ -404,8 +395,6 @@ private:
stats.total_samples += nsamples;
stats.matching_samples += max_cnt;

// ::std::cout << "GOT BEST POS for offset " << linearized_index << " -> " << max_pos.string() << ::std::endl;

return max_pos;
#endif
}
Expand Down Expand Up @@ -486,4 +475,34 @@ public:
private:
reserved::linear_pool<localized_array> cache;
};

// Registry for composite data_place::allocate/deallocate (ownership of localized_array by base pointer)
// This is how we can retrieve the localized_array descriptor when calling
// deallocate with the device address returned by allocate.
// NOTE(review): function-local static, so construction is thread-safe, but
// concurrent insert/erase on the map itself is not synchronized — confirm
// callers only allocate/deallocate composite places from one thread at a time.
inline ::std::unordered_map<void*, ::std::unique_ptr<localized_array>>& get_composite_alloc_registry()
{
  static ::std::unordered_map<void*, ::std::unique_ptr<localized_array>> reg;
  return reg;
}

inline void* allocate_composite_data_place(const data_place& p, ::std::ptrdiff_t size)
{
EXPECT(p.is_composite());
const size_t size_u = static_cast<size_t>(size);
const exec_place_grid& grid = p.get_grid();
const get_executor_func_t& mapper = p.get_partitioner();
auto delinearize_1d = [](size_t i) {
return pos4(static_cast<ssize_t>(i), 0, 0, 0);
};
auto arr = ::std::make_unique<localized_array>(grid, mapper, delinearize_1d, size_u, 1, dim4(size_u));
void* ptr = arr->get_base_ptr();
get_composite_alloc_registry()[ptr] = ::std::move(arr);
return ptr;
}

// Release an allocation previously returned by allocate_composite_data_place().
// Erasing the registry entry destroys the owned localized_array, whose
// destructor releases the underlying (VMM) resources.
inline void deallocate_composite_data_place(void* ptr)
{
  auto& registry = get_composite_alloc_registry();
  registry.erase(ptr);
}
} // end namespace cuda::experimental::stf::reserved
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ public:
return ctxs[partition];
}

green_ctx_view get_view(size_t id)
/** @brief Return a non-owning view of the id-th green context (context, pool, device id). */
green_ctx_view get_view(size_t id) const
{
return green_ctx_view(ctxs[id], pools[id], devid);
}
Expand All @@ -269,6 +269,26 @@ public:
return ctxs.size();
}

/** @brief Build a grid of exec places from this helper's green contexts.
 *
 * The green contexts themselves were created by the helper; this method only
 * decides how each exec place's affine data place is represented.
 *
 * @param use_green_ctx_data_place If true, each place's affine data place is the
 * green context extension; if false, the default device data place.
 * @return exec_place_grid of green context places.
 */
exec_place_grid get_grid(bool use_green_ctx_data_place = false) const
{
  const size_t count = ctxs.size();
  ::std::vector<exec_place> grid_places;
  grid_places.reserve(count);
  for (size_t idx = 0; idx < count; ++idx)
  {
    grid_places.emplace_back(exec_place::green_ctx(get_view(idx), use_green_ctx_data_place));
  }
  return make_grid(mv(grid_places));
}

private:
friend class exec_place;

Expand Down
Loading
Loading