diff --git a/ci/matrix.yaml b/ci/matrix.yaml index 237f6a2f681..4acbb308f90 100644 --- a/ci/matrix.yaml +++ b/ci/matrix.yaml @@ -325,6 +325,8 @@ workflows: exclude: # GPU runners are not available on Windows. - {jobs: ['test', 'test_gpu', 'test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'], cxx: ['msvc2019', 'msvc14.39', 'msvc2022']} + # STF C API and Python bindings are not built for MSVC: + - {jobs: ['test_py_stf'], cxx: ['msvc2019', 'msvc14.39', 'msvc2022']} # cudax doesn't support C++17 on msvc: - {project: 'cudax', std: 17, cxx: ['msvc2019', 'msvc14.39', 'msvc2022']} @@ -478,6 +480,7 @@ jobs: test_py_coop: { name: "Test cuda.coop", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} } test_py_par: { name: "Test cuda.compute", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} } test_py_examples: { name: "Test cuda.cccl.examples", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} } + test_py_stf: { name: "Test cuda.stf", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_stf'} } # Run jobs for 'target' project (ci/util/build_and_test_targets.sh): run_cpu: { gpu: false } @@ -535,7 +538,7 @@ projects: name: "Python" job_map: build: ['build_py_wheel'] - test: ['test_py_headers', 'test_py_coop', 'test_py_par', 'test_py_examples'] + test: ['test_py_headers', 'test_py_coop', 'test_py_par', 'test_py_examples', 'test_py_stf'] cccl_c_parallel: name: 'CCCL C Parallel' stds: [20] diff --git a/ci/test_cuda_stf_python.sh b/ci/test_cuda_stf_python.sh new file mode 100755 index 00000000000..c686c3859aa --- /dev/null +++ b/ci/test_cuda_stf_python.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +set -euo pipefail + +ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +source "$ci_dir/pyenv_helper.sh" + +# Parse common arguments +source "$ci_dir/util/python/common_arg_parser.sh" +parse_python_args "$@" +cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2) + +# Setup Python environment +setup_python_env "${py_version}" + +# Fetch or build the cuda_cccl wheel: +if [[ -n "${GITHUB_ACTIONS:-}" ]]; then + wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh") + "$ci_dir/util/artifacts/download.sh" ${wheel_artifact_name} /home/coder/cccl/ +else + "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}" +fi + +# Install cuda_cccl +CUDA_CCCL_WHEEL_PATH="$(ls /home/coder/cccl/wheelhouse/cuda_cccl-*.whl)" +python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-cu${cuda_major_version}]" + +# Run tests for STF module +cd "/home/coder/cccl/python/cuda_cccl/tests/" +python -m pytest -n auto -v stf/ diff --git a/cudax/examples/stf/CMakeLists.txt b/cudax/examples/stf/CMakeLists.txt index 48c52ca69bc..be43e9129ab 100644 --- a/cudax/examples/stf/CMakeLists.txt +++ b/cudax/examples/stf/CMakeLists.txt @@ -10,6 +10,7 @@ set( 08-cub-reduce.cu axpy-annotated.cu void_data_interface.cu + thrust_device_data_place_allocator.cu explicit_data_places.cu thrust_zip_iterator.cu 1f1b.cu diff --git a/cudax/examples/stf/thrust_device_data_place_allocator.cu b/cudax/examples/stf/thrust_device_data_place_allocator.cu new file mode 100644 index 00000000000..34f8cea13a9 --- /dev/null +++ b/cudax/examples/stf/thrust_device_data_place_allocator.cu @@ -0,0 +1,130 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Example: Thrust device_vector with an allocator backed by a data_place. + * Uses thrust::mr::memory_resource to wrap data_place, then + * thrust::mr::allocator to create a compatible allocator. + * Storage is allocated via data_place::allocate (device, composite/VMM, + * or other place types). The same Thrust code works unchanged for + * single-device, multi-device (VMM), or green-context placement. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +using namespace cuda::experimental::stf; + +// Minimal adapter: data_place is STF's abstraction; Thrust expects a +// memory_resource. This class bridges the two. The resource must outlive +// any vectors/allocators that use it. +class data_place_memory_resource final : public thrust::mr::memory_resource> +{ +public: + explicit data_place_memory_resource(const data_place& place) + : place_(place) + {} + + pointer do_allocate(std::size_t bytes, std::size_t /*alignment*/) override + { + void* raw = place_.allocate(static_cast(bytes)); + return thrust::device_ptr(raw); + } + + void do_deallocate(pointer p, std::size_t bytes, std::size_t /*alignment*/) override + { + place_.deallocate(p.get(), bytes); + } + + bool do_is_equal(const memory_resource& other) const noexcept override + { + auto* o = dynamic_cast(&other); + return o && place_ == o->place_; + } + +private: + data_place place_; +}; + +template +using data_place_allocator = thrust::mr::allocator; + +// Run the Thrust example with the given data_place; returns true if the check passed. +bool run_with_place(const data_place& place, const char* label) +{ + const size_t n = 1024 * 1024; + + data_place_memory_resource memres(place); + data_place_allocator alloc(&memres); + thrust::device_vector> d_vec(n, 0.0, alloc); + + thrust::transform( + thrust::device, + thrust::counting_iterator(0), + thrust::counting_iterator(n), + d_vec.begin(), + [] _CCCL_DEVICE(size_t i) { + return 2.0 * static_cast(i); + }); + + thrust::host_vector h_sample(4); + thrust::copy(d_vec.begin(), d_vec.begin() + 4, h_sample.begin()); + + bool ok = (h_sample[0] == 0.0 && h_sample[1] == 2.0 && h_sample[2] == 4.0 && h_sample[3] == 6.0); + if (!ok) + { + std::cerr << "thrust_device_data_place_allocator: " << label << " (" << place.to_string() << "): FAILED\n"; + } + return ok; +} + +int main() +{ + bool all_ok = true; + + // Device 0 + all_ok &= run_with_place(data_place::device(0), "device(0)"); + + // All devices (composite, VMM path when multiple devices) + all_ok &= run_with_place(data_place::composite(blocked_partition(), exec_place::all_devices()), + "composite(blocked_partition, all_devices)"); + +#if _CCCL_CTK_AT_LEAST(12, 4) + // Example based on a grid of green contexts where we use a data place per green context + { + const int num_sms = 8; + const int dev_id = 0; + green_context_helper gc_helper(num_sms, dev_id); + if (gc_helper.get_count() >= 1) + { + auto where = gc_helper.get_grid(true); + data_place cdp = data_place::composite(blocked_partition(), where); + all_ok &= run_with_place(cdp, "composite(blocked_partition, green_context_grid)"); + } + } +#endif + + return all_ok ? 0 : 1; +} diff --git a/cudax/examples/stf/void_data_interface.cu b/cudax/examples/stf/void_data_interface.cu index 0340b16bf4a..bf429f23dfe 100644 --- a/cudax/examples/stf/void_data_interface.cu +++ b/cudax/examples/stf/void_data_interface.cu @@ -49,5 +49,9 @@ int main() return cuda_kernel_desc{dummy_kernel, 16, 128, 0}; }; + EXPECT(token.is_void_interface()); + EXPECT(token2.is_void_interface()); + EXPECT(token3.is_void_interface()); + ctx.finalize(); } diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh index 86275b4e8ce..0bf6c8bd9f9 100644 --- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh +++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -87,23 +88,24 @@ public: , data_dims(data_dims) , elemsize(elemsize) { + // Ensure a current CUDA context exists so cuCtxGetDevice() and other driver + // APIs succeed (e.g. when no stream_ctx was used or after primary ctx release). + cuda_safe_call(cudaFree(nullptr)); + // Regardless of the grid, we allow all devices to access that localized array const int ndevs = cuda_try(); CUdevice dev = cuda_try(); /* Check whether the current device supports UVA */ int supportsVMM = cuda_try(CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev); - // fprintf(stderr, "VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED ? %d\n", supportsVMM); EXPECT(supportsVMM == 1, "Cannot create a localized_array object on this machine because it does not support VMM."); /* Get allocation granularity */ - CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = dev}; size_t alloc_granularity_bytes = cuda_try(&prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM); - // fprintf(stderr, "GRANULARITY = %ld KB\n", alloc_granularity_bytes / 1024); // To make our life simpler for now: we assume that we only allocate full blocks block_size_bytes = alloc_granularity_bytes; @@ -117,9 +119,6 @@ public: // Reserve a range of virtual addresses, round up size to accommodate granularity requirements cuda_try(cuMemAddressReserve(&base_ptr, vm_total_size_bytes, 0ULL, 0ULL, 0ULL)); - // fprintf(stderr, "cuMemAddressReserve => %p + %ld (%ld KB)\n", (void *)base_ptr, vm_total_size_bytes, - // vm_total_size_bytes / 1024); - ::std::vector accessDesc(ndevs); for (int d = 0; d < ndevs; d++) { @@ -219,7 +218,6 @@ public: // Print visual block map (compact representation) fprintf(stderr, "\nBlock ownership map (each char = 1 block, 0-9/a-z = place index):\n "); - // Build a map of place names to single-char indices ::std::unordered_map<::std::string, char> place_to_char; char next_char = '0'; for (size_t i = 0; i < nblocks; i++) @@ -245,7 +243,6 @@ public: } fprintf(stderr, "\n"); - // Print legend fprintf(stderr, "\n Legend:\n"); for (const auto& entry : place_to_char) { @@ -255,10 +252,7 @@ public: fprintf(stderr, "==============================================\n\n"); } - // fprintf(stderr, "GOT %ld effective blocks (%ld blocks)\n", nblocks_effective, nblocks); - - // Create a physical allocation per block, this is not mapped in - // virtual memory yet. + // Create a physical allocation per block, this is not mapped in virtual memory yet. for (auto& item : meta) { int item_dev = device_ordinal(item.place); @@ -289,7 +283,6 @@ public: } } } - // fprintf(stderr, "localized_array (this = %p) : nblocks_effective %ld\n", this, nblocks_effective); } localized_array() = delete; @@ -300,8 +293,6 @@ public: ~localized_array() { - // fprintf(stderr, "~localized_array (this = %p) ... base ptr %p vm_total_size_bytes %ld - nblocks_effective - // %ld\n", this, (void *)base_ptr, vm_total_size_bytes, nblocks_effective); for (auto& item : meta) { size_t offset = item.offset; @@ -404,8 +395,6 @@ private: stats.total_samples += nsamples; stats.matching_samples += max_cnt; - // ::std::cout << "GOT BEST POS for offset " << linearized_index << " -> " << max_pos.string() << ::std::endl; - return max_pos; #endif } @@ -486,4 +475,34 @@ public: private: reserved::linear_pool cache; }; + +// Registry for composite data_place::allocate/deallocate (ownership of localized_array by base pointer) +// This is how we can retrieve the localized_array descriptor when calling +// deallocate with the device address returned by allocate. +inline ::std::unordered_map>& get_composite_alloc_registry() +{ + static ::std::unordered_map> reg; + return reg; +} + +inline void* allocate_composite_data_place(const data_place& p, ::std::ptrdiff_t size) +{ + EXPECT(p.is_composite()); + const size_t size_u = static_cast(size); + const exec_place_grid& grid = p.get_grid(); + const get_executor_func_t& mapper = p.get_partitioner(); + auto delinearize_1d = [](size_t i) { + return pos4(static_cast(i), 0, 0, 0); + }; + auto arr = ::std::make_unique(grid, mapper, delinearize_1d, size_u, 1, dim4(size_u)); + void* ptr = arr->get_base_ptr(); + get_composite_alloc_registry()[ptr] = ::std::move(arr); + return ptr; +} + +inline void deallocate_composite_data_place(void* ptr) +{ + // Cleanup of the actual array resources (VMM resources) is handled in the destructor of localized_array. + get_composite_alloc_registry().erase(ptr); +} } // end namespace cuda::experimental::stf::reserved diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh index 2ce9691a0ba..8e55bd287a6 100644 --- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh +++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh @@ -253,7 +253,7 @@ public: return ctxs[partition]; } - green_ctx_view get_view(size_t id) + green_ctx_view get_view(size_t id) const { return green_ctx_view(ctxs[id], pools[id], devid); } @@ -269,6 +269,26 @@ public: return ctxs.size(); } + /** @brief Build a grid of exec places from this helper's green contexts. + * + * The green contexts are already created by the helper; this only chooses how + * each exec place's affine data place is represented. + * + * @param use_green_ctx_data_place If true, each place's affine data place is the + * green context extension; if false, the default device data place. + * @return exec_place_grid of green context places. + */ + exec_place_grid get_grid(bool use_green_ctx_data_place = false) const + { + ::std::vector places; + places.reserve(ctxs.size()); + for (size_t i = 0; i < ctxs.size(); i++) + { + places.push_back(exec_place::green_ctx(get_view(i), use_green_ctx_data_place)); + } + return make_grid(mv(places)); + } + private: friend class exec_place; diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh index d2de042452e..5f75d5b109d 100644 --- a/cudax/include/cuda/experimental/__stf/places/places.cuh +++ b/cudax/include/cuda/experimental/__stf/places/places.cuh @@ -60,6 +60,14 @@ class exec_place_green_ctx; //! Function type for computing executor placement from data coordinates using get_executor_func_t = pos4 (*)(pos4, dim4, dim4); +class data_place; + +namespace reserved +{ +void* allocate_composite_data_place(const data_place& p, ::std::ptrdiff_t size); +void deallocate_composite_data_place(void* ptr); +} // namespace reserved + /** * @brief Designates where data will be stored (CPU memory vs. on device 0 (first GPU), device 1 (second GPU), ...) * @@ -478,10 +486,13 @@ public: { cuda_safe_call(cudaMallocManaged(&result, size)); } + else if (is_composite()) + { + return reserved::allocate_composite_data_place(*this, size); + } else { // Device allocation - EXPECT(!is_composite(), "Composite places don't support direct allocation"); const int prev_dev = cuda_try(); const int target_dev = devid; @@ -524,6 +535,12 @@ public: return; } + if (is_composite()) + { + reserved::deallocate_composite_data_place(ptr); + return; + } + if (is_host()) { cuda_safe_call(cudaFreeHost(ptr)); @@ -572,6 +589,11 @@ public: { return extension->allocation_is_stream_ordered(); } + // Composite uses immediate VMM path, not stream-ordered APIs + if (is_composite()) + { + return false; + } // Host and managed are immediate (stream ignored), device is stream-ordered return !is_host() && !is_managed(); } @@ -2194,3 +2216,5 @@ UNITTEST("Exec place as std::map key") }; #endif // UNITTESTED_FILE } // end namespace cuda::experimental::stf + +#include diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt index dd78f1c5f15..51df41b13d3 100644 --- a/cudax/test/stf/CMakeLists.txt +++ b/cudax/test/stf/CMakeLists.txt @@ -122,6 +122,7 @@ set( green_context/axpy_gc.cu green_context/cuda_graph.cu green_context/gc_grid.cu + green_context/gc_grid_vmm.cu hash/ctx_hash.cu hash/logical_data.cu hashtable/fusion.cu diff --git a/cudax/test/stf/cpp/task_get_stream.cu b/cudax/test/stf/cpp/task_get_stream.cu index 89fa74e7490..2d6509e5a73 100644 --- a/cudax/test/stf/cpp/task_get_stream.cu +++ b/cudax/test/stf/cpp/task_get_stream.cu @@ -24,7 +24,8 @@ void test_stream() context ctx; auto token = ctx.token(); - auto t = ctx.task(token.write()); + EXPECT(token.is_void_interface()); + auto t = ctx.task(token.write()); t.start(); cudaStream_t s = t.get_stream(); EXPECT(s != nullptr); diff --git a/cudax/test/stf/green_context/gc_grid_vmm.cu b/cudax/test/stf/green_context/gc_grid_vmm.cu new file mode 100644 index 00000000000..313701ee3b7 --- /dev/null +++ b/cudax/test/stf/green_context/gc_grid_vmm.cu @@ -0,0 +1,100 @@ +//===----------------------------------------------------------------------===// +// +// Part of CUDASTF in CUDA C++ Core Libraries, +// under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES. +// +//===----------------------------------------------------------------------===// + +/** + * @file + * + * @brief Test that a grid of green contexts with explicit affine data places + * (green context data place extension) and a blocked policy forces + * the VMM path for composite allocation. + * + * When the grid is made of exec_place::green_ctx(..., true), each place's + * affine data place is the green context extension. Forming a composite + * data place from that grid and using the modular API (cdp.allocate(n), + * cdp.deallocate(ptr, n)) triggers the localized_array (VMM) path. + */ + +#include +#include +#include + +#include + +using namespace cuda::experimental::stf; + +#if _CCCL_CTK_AT_LEAST(12, 4) +__global__ void init_kernel(double* ptr, size_t n) +{ + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) + { + ptr[i] = static_cast(i); + } +} + +void run_gc_grid_vmm_test() +{ + const int num_sms = 8; + const int dev_id = 0; + green_context_helper gc_helper(num_sms, dev_id); + + if (gc_helper.get_count() < 1) + { + fprintf(stderr, "No green contexts available, skipping VMM path test.\n"); + return; + } + + // Grid of green context places with explicit affine data places (VMM path). + auto where = gc_helper.get_grid(true); + + // Composite data place with blocked partitioner: allocation on this place + // uses the VMM path (localized_array). + data_place cdp = data_place::composite(blocked_partition(), where); + + EXPECT(cdp.is_composite()); + EXPECT(!cdp.allocation_is_stream_ordered()); + + // 8MB allocation to have enough blocks to test the VMM path. + const size_t n = 1024 * 1024; + const size_t size_bytes = n * sizeof(double); + + // Allocate directly on the composite place (VMM path) + void* ptr = cdp.allocate(size_bytes); + EXPECT(ptr != nullptr); + + double* d_ptr = static_cast(ptr); + // Launch on default stream + int block = 256; + int grid = static_cast((n + block - 1) / block); + init_kernel<<>>(d_ptr, n); + cuda_safe_call(cudaDeviceSynchronize()); + + // Check initialized values: load entire buffer and verify every entry + ::std::vector h_buf(n); + cuda_safe_call(cudaMemcpy(h_buf.data(), d_ptr, size_bytes, cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i++) + { + EXPECT(h_buf[i] == static_cast(i)); + } + + cdp.deallocate(ptr, size_bytes); +} +#endif // _CCCL_CTK_AT_LEAST(12, 4) + +int main() +{ +#if _CCCL_CTK_BELOW(12, 4) + fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n"); + return 0; +#else + run_gc_grid_vmm_test(); + return 0; +#endif +} diff --git a/docs/python/index.rst b/docs/python/index.rst index 24995ab1b3d..57e0afde188 100644 --- a/docs/python/index.rst +++ b/docs/python/index.rst @@ -14,6 +14,9 @@ abstractions for CUDA Python developers. * :doc:`cuda.coop ` — Cooperative block- and warp-level algorithms for writing highly efficient CUDA kernels with `Numba CUDA `_. +* :doc:`cuda.stf ` — Sequential Task Flow for CUDA: define logical data and + tasks with read/write annotations; STF orchestrates execution and data movement. + These libraries expose the generic, highly-optimized algorithms from the `CCCL C++ libraries `_, which have been tuned to provide optimal performance across GPU architectures. @@ -34,5 +37,6 @@ Who is this for? setup compute coop + stf resources api_reference diff --git a/docs/python/stf.rst b/docs/python/stf.rst new file mode 100644 index 00000000000..3144976cdee --- /dev/null +++ b/docs/python/stf.rst @@ -0,0 +1,97 @@ +.. _cccl-python-stf: + +``cuda.stf``: Sequential Task Flow +================================== + +The ``cuda.stf`` module provides a Python binding to the **Sequential Task Flow (STF)** +model for CUDA: you define logical data and submit tasks that read or write that data; +STF infers dependencies and orchestrates execution and data movement. For the full +description of the model, see the :ref:`C++ CUDASTF documentation `. + +Example +------- + +The following example creates a context, registers three arrays as logical data, and +submits four tasks with different read/write annotations. STF orders the tasks so that +dependencies are respected (e.g. the task that writes ``Y`` runs after the one that +reads ``X`` and writes ``Y``). + +.. literalinclude:: ../../python/cuda_cccl/tests/stf/test_context.py + :language: python + :pyobject: test_ctx3 + :caption: Context, logical data, and tasks. `View complete source on GitHub `__ + +Context and logical data +------------------------- + +Create a **context** with :func:`context() ` (optionally +``use_graph=True`` for CUDA graph execution). All logical data and tasks belong to +one context. When you are done submitting tasks, call :meth:`finalize() ` +to run the graph and synchronize (or let the context be destroyed; ``finalize`` is +called automatically). + +**Logical data** represents a buffer that tasks access. Create it from existing +buffers or allocate new ones: + +* :meth:`logical_data(buf, ...) ` — from a NumPy array or any + object implementing the **CUDA Array Interface** (CuPy, PyTorch, Numba device arrays) + or the Python buffer protocol. For GPU arrays, pass :ref:`data_place ` + (e.g. ``data_place.device(0)``). +* :meth:`logical_data_empty(shape, dtype, ...) ` — + uninitialized allocation. +* :meth:`logical_data_full(shape, fill_value, ...) ` — + allocated and filled with a constant (like ``numpy.full()``). +* :meth:`logical_data_zeros` / :meth:`logical_data_ones` — convenience wrappers. + +Pass each logical data into a task with an access mode: :meth:`read()`, :meth:`write()`, +or :meth:`rw()`. Example: ``ctx.task(lX.read(), lY.rw())``. + +Tasks and interop +----------------- + +Use ``with ctx.task(...) as t:`` to get a task handle. Inside the block: + +* **Stream** — :meth:`t.stream_ptr() ` returns the task’s CUDA stream + (as an integer). Wrap it for your framework (e.g. ``numba.cuda.external_stream(t.stream_ptr())`` + or ``torch.cuda.ExternalStream(t.stream_ptr())``). +* **Buffer views** — :meth:`t.get_arg_cai(index) ` and + :meth:`t.args_cai() ` return object(s) that implement the + **CUDA Array Interface**, so you can pass them to Numba (``cuda.from_cuda_array_interface(...)``), + PyTorch (``torch.as_tensor(...)``), or CuPy (``cp.asarray(...)``). + +The ``cuda.stf`` package does not ship Numba/PyTorch helpers; see +`tests/stf/numba_helpers.py `_, +`numba_decorator.py `_, +and `pytorch_task.py `_ +for examples. + +Places +------ + +.. _stf-exec-place: + +* **Execution place** (``exec_place``) — where the task runs. Pass as the first + argument to ``ctx.task(...)``: ``exec_place.device(device_id)`` or ``exec_place.host()``. + Example: ``ctx.task(exec_place.device(0), lX.read(), lY.rw())``. + +.. _stf-data-place: + +* **Data place** (``data_place``) — where logical data lives: ``data_place.host()``, + ``data_place.device(device_id)``, ``data_place.managed()``. Use when creating + logical data or in a dependency, e.g. ``lZ.rw(data_place.device(1))``. + +Tokens +------ + +:meth:`ctx.token() ` creates a **token** (logical data with no buffer) +for ordering tasks without data transfer. Use ``token.read()`` or ``token.rw()`` in +task dependencies. + +Example collections +------------------- + +For runnable examples (Numba kernels, PyTorch, tokens, multi-GPU, FDTD), see the +`STF tests and examples `_. + +For the full STF programming model, graph visualization, and C++ API, see +:ref:`CUDASTF (C++) `. diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt index 922458c02e2..5816853915e 100644 --- a/python/cuda_cccl/CMakeLists.txt +++ b/python/cuda_cccl/CMakeLists.txt @@ -17,9 +17,34 @@ message( # Build cccl.c.parallel and add CCCL's install rules set(_cccl_root ../..) + +# Build and install C++ library first set(CCCL_TOPLEVEL_PROJECT ON) # Enable the developer builds set(CCCL_ENABLE_C_PARALLEL ON) # Build the cccl.c.parallel library +# STF C interface and Python bindings are not compatible with MSVC; disable on Windows. +if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") + set(CCCL_ENABLE_C_EXPERIMENTAL_STF OFF) + message(STATUS "STF disabled: not compatible with MSVC") +else() + set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON) +endif() +set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features + +# Disable all testing, examples, and benchmarks - we only want the libraries +set(CCCL_ENABLE_TESTING OFF) +set(CCCL_ENABLE_EXAMPLES OFF) +set(CCCL_ENABLE_BENCHMARKS OFF) +set(CCCL_C_PARALLEL_ENABLE_TESTING OFF) +set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF) +# Note: CCCL_ENABLE_CUDAX must be ON because STF depends on it (via CCCL_ENABLE_UNSTABLE) +# But disable cudax tests, examples, and header testing +set(cudax_ENABLE_TESTING OFF) +set(cudax_ENABLE_EXAMPLES OFF) +set(cudax_ENABLE_HEADER_TESTING OFF) +set(cudax_ENABLE_CUFILE OFF) set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) +set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME}) + # Just install the rest: set(libcudacxx_ENABLE_INSTALL_RULES ON) set(CUB_ENABLE_INSTALL_RULES ON) @@ -34,10 +59,29 @@ add_subdirectory(${_cccl_root} _parent_cccl) set(CMAKE_INSTALL_LIBDIR "${old_libdir}") # pop set(CMAKE_INSTALL_INCLUDEDIR "${old_includedir}") # pop -# ensure the destination directory exists +# Create CCCL::cudax alias for STF (normally created by cccl-config.cmake) +if ( + CCCL_ENABLE_C_EXPERIMENTAL_STF + AND TARGET cudax::cudax + AND NOT TARGET CCCL::cudax +) + add_library(CCCL::cudax ALIAS cudax::cudax) +endif() + +# Ensure the destination directory exists file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl") +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + file(MAKE_DIRECTORY "cuda/stf/${CUDA_VERSION_DIR}/cccl") +endif() # Install version-specific binaries +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + install( + TARGETS cccl.c.experimental.stf + DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl + ) +endif() + install( TARGETS cccl.c.parallel DESTINATION cuda/compute/${CUDA_VERSION_DIR}/cccl @@ -116,6 +160,7 @@ python3_add_library( WITH_SOABI "${_generated_extension_src}" ) + add_dependencies(_bindings_impl cythonize_bindings_impl) target_link_libraries( _bindings_impl @@ -126,3 +171,53 @@ target_link_libraries( set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl") install(TARGETS _bindings_impl DESTINATION cuda/compute/${CUDA_VERSION_DIR}) + +if (CCCL_ENABLE_C_EXPERIMENTAL_STF) + message(STATUS "STF Using Cython ${CYTHON_VERSION}") + set( + stf_pyx_source_file + "${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx" + ) + set( + _stf_generated_extension_src + "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c" + ) + set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep") + add_custom_command( + OUTPUT "${_stf_generated_extension_src}" + COMMAND "${Python3_EXECUTABLE}" -m cython + ARGS + ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file + ${_stf_generated_extension_src} + DEPENDS "${stf_pyx_source_file}" + DEPFILE "${_stf_depfile}" + COMMENT "Cythonizing ${stf_pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}" + ) + set_source_files_properties( + "${_stf_generated_extension_src}" + PROPERTIES GENERATED TRUE + ) + add_custom_target( + cythonize_stf_bindings_impl + ALL + DEPENDS "${_stf_generated_extension_src}" + ) + + python3_add_library( + _stf_bindings_impl + MODULE + WITH_SOABI + "${_stf_generated_extension_src}" + ) + add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl) + target_link_libraries( + _stf_bindings_impl + PRIVATE cccl.c.experimental.stf CUDA::cuda_driver + ) + set_target_properties( + _stf_bindings_impl + PROPERTIES INSTALL_RPATH "$ORIGIN/cccl" + ) + + install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR}) +endif() diff --git a/python/cuda_cccl/cuda/stf/__init__.py b/python/cuda_cccl/cuda/stf/__init__.py new file mode 100644 index 00000000000..1e230c66699 --- /dev/null +++ b/python/cuda_cccl/cuda/stf/__init__.py @@ -0,0 +1,32 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from __future__ import annotations + +from ._stf_bindings import _BINDINGS_AVAILABLE # type: ignore[attr-defined] + +if not _BINDINGS_AVAILABLE: + __all__ = ["_BINDINGS_AVAILABLE"] + + def __getattr__(name: str): + raise AttributeError( + f"Cannot access 'cuda.stf.{name}' because CUDASTF bindings are not available. " + "This typically means you're running on a CPU-only machine without CUDA drivers installed, " + "or that cuda-cccl was not built with STF support." + ) +else: + from ._stf_bindings import ( + context, + data_place, + dep, + exec_place, + ) + + __all__ = [ + "_BINDINGS_AVAILABLE", + "context", + "dep", + "exec_place", + "data_place", + ] diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings.py b/python/cuda_cccl/cuda/stf/_stf_bindings.py new file mode 100644 index 00000000000..3fb42ca3d2c --- /dev/null +++ b/python/cuda_cccl/cuda/stf/_stf_bindings.py @@ -0,0 +1,75 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# _stf_bindings.py is a shim module that imports symbols from a +# _stf_bindings_impl extension module. The shim serves the same purposes as +# cuda.compute._bindings: +# +# 1. Import a CUDA-specific extension. The wheel ships cuda/stf/cu12/ and +# cuda/stf/cu13/; at runtime this shim chooses based on the detected CUDA +# version and imports all symbols from the matching extension. +# +# 2. Preload `nvrtc` and `nvJitLink` before importing the extension (indirect +# dependencies via cccl.c.parallel / cccl.c.experimental.stf). + +from __future__ import annotations + +import importlib + +from cuda.cccl._cuda_version_utils import detect_cuda_version, get_recommended_extra +from cuda.pathfinder import ( # type: ignore[import-not-found] + load_nvidia_dynamic_lib, +) + + +def _load_cuda_libraries(): + """ + Preload CUDA libraries to ensure proper symbol resolution. + + These libraries are indirect dependencies pulled in via cccl.c.parallel. + Preloading ensures reliable symbol resolution regardless of dynamic linker behavior. + """ + import warnings + + for libname in ("nvrtc", "nvJitLink"): + try: + load_nvidia_dynamic_lib(libname) + except Exception as e: + # Log warning but don't fail - the extension might still work + # if the libraries are already loaded or available through other means + warnings.warn( + f"Failed to preload CUDA library '{libname}': {e}. " + f"STF bindings may fail to load if {libname} is not available.", + RuntimeWarning, + stacklevel=2, + ) + + +_load_cuda_libraries() + + +# Import the appropriate bindings implementation depending on what +# CUDA version is available: +cuda_version = detect_cuda_version() +if cuda_version not in [12, 13]: + raise RuntimeError( + f"Unsupported CUDA version: {cuda_version}. Only CUDA 12 and 13 are supported." + ) + +extra_name = get_recommended_extra(cuda_version) +module_suffix = f".{extra_name}._stf_bindings_impl" + +_BINDINGS_AVAILABLE = False + +try: + bindings_module = importlib.import_module(module_suffix, __package__) + # Import all symbols from the module + globals().update(bindings_module.__dict__) + _BINDINGS_AVAILABLE = True +except ImportError as e: + import warnings + + warnings.warn( + f"CUDASTF bindings for CUDA {cuda_version} not available: {e}", + RuntimeWarning, + ) diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx new file mode 100644 index 00000000000..1f06bfde426 --- /dev/null +++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx @@ -0,0 +1,841 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# distutils: language = c++ +# cython: language_level=3 +# cython: linetrace=True + +# Python signatures are declared in the companion Python stub file _bindings.pyi +# Make sure to update PYI with change to Python API to ensure that Python +# static type checker tools like mypy green-lights cuda.cccl.parallel + +from cpython.buffer cimport ( + Py_buffer, PyBUF_FORMAT, PyBUF_ND, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS, + PyObject_GetBuffer, PyBuffer_Release, PyObject_CheckBuffer +) +from cpython.bytes cimport PyBytes_FromStringAndSize +from cpython.pycapsule cimport ( + PyCapsule_CheckExact, PyCapsule_IsValid, PyCapsule_GetPointer +) +from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t +from libc.string cimport memset, memcpy + +import numpy as np + +import ctypes +from enum import IntFlag + +cdef extern from "": + cdef struct OpaqueCUstream_st + cdef struct OpaqueCUkernel_st + cdef struct OpaqueCUlibrary_st + + ctypedef int CUresult + ctypedef OpaqueCUstream_st *CUstream + ctypedef OpaqueCUkernel_st *CUkernel + ctypedef OpaqueCUlibrary_st *CUlibrary + +cdef extern from "cccl/c/experimental/stf/stf.h": + # + # Contexts + # + ctypedef struct stf_ctx_handle_t + ctypedef stf_ctx_handle_t* stf_ctx_handle + void stf_ctx_create(stf_ctx_handle* ctx) + void stf_ctx_create_graph(stf_ctx_handle* ctx) + void stf_ctx_finalize(stf_ctx_handle ctx) + + # + # Exec places + # + ctypedef enum stf_exec_place_kind: + STF_EXEC_PLACE_DEVICE + STF_EXEC_PLACE_HOST + + ctypedef struct stf_exec_place_device: + int dev_id + + ctypedef struct stf_exec_place_host: + int dummy + + ctypedef union stf_exec_place_u: + stf_exec_place_device device + stf_exec_place_host host + + ctypedef struct stf_exec_place: + stf_exec_place_kind kind + stf_exec_place_u u + + stf_exec_place make_device_place(int dev_id) + stf_exec_place make_host_place() + + # + # Data places + # + ctypedef enum stf_data_place_kind: + STF_DATA_PLACE_DEVICE + STF_DATA_PLACE_HOST + STF_DATA_PLACE_MANAGED + STF_DATA_PLACE_AFFINE + + ctypedef struct stf_data_place_device: + int dev_id + + ctypedef struct stf_data_place_host: + int dummy + + ctypedef struct stf_data_place_managed: + int dummy + + ctypedef struct stf_data_place_affine: + int dummy + + ctypedef union stf_data_place_u: + stf_data_place_device device + stf_data_place_host host + stf_data_place_managed managed + stf_data_place_affine affine + + ctypedef struct stf_data_place: + stf_data_place_kind kind + stf_data_place_u u + + stf_data_place make_device_data_place(int dev_id) + stf_data_place make_host_data_place() + stf_data_place make_managed_data_place() + stf_data_place make_affine_data_place() + + ctypedef struct stf_logical_data_handle_t + ctypedef stf_logical_data_handle_t* stf_logical_data_handle + void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz) + void stf_logical_data_with_place(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace) + void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol) + void stf_logical_data_destroy(stf_logical_data_handle ld) + void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to) + + void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld); + + ctypedef struct stf_task_handle_t + ctypedef stf_task_handle_t* stf_task_handle + void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t) + void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p) + void stf_task_set_symbol(stf_task_handle t, const char* symbol) + void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m) + void stf_task_add_dep_with_dplace(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p) + void stf_task_start(stf_task_handle t) + void stf_task_end(stf_task_handle t) + void stf_task_enable_capture(stf_task_handle t) + CUstream stf_task_get_custream(stf_task_handle t) + void* stf_task_get(stf_task_handle t, int submitted_index) + void stf_task_destroy(stf_task_handle t) + + cdef enum stf_access_mode: + STF_NONE + STF_READ + STF_WRITE + STF_RW + +class AccessMode(IntFlag): + NONE = STF_NONE + READ = STF_READ + WRITE = STF_WRITE + RW = STF_RW + +class stf_cai: + """ + Wrapper that exposes __cuda_array_interface__ for interop (torch, cupy, etc.). + Supports dict-style access (e.g. obj['data']) for code that expects a CAI dict. + """ + def __init__(self, ptr, tuple shape, dtype, stream=0): + self.ptr = ptr # integer device pointer + self.shape = shape + self.dtype = np.dtype(dtype) + self.stream = stream # CUDA stream handle (int or 0) + self.__cuda_array_interface__ = { + 'version': 2, + 'shape': self.shape, + 'typestr': self.dtype.str, # e.g., 'data_ptr, self._len, dplace._c_place) + + else: + # Fallback to Python buffer protocol + flags = PyBUF_FORMAT | PyBUF_ND # request dtype + shape + + if PyObject_GetBuffer(buf, &view, flags) != 0: + raise ValueError("object doesn't support the full buffer protocol or __cuda_array_interface__") + + try: + self._ndim = view.ndim + self._len = view.len + self._shape = tuple(view.shape[i] for i in range(view.ndim)) + self._dtype = np.dtype(view.format) + # For buffer protocol objects, use the specified data place (defaults to host) + stf_logical_data_with_place(ctx._ctx, &self._ld, view.buf, view.len, dplace._c_place) + + finally: + PyBuffer_Release(&view) + + # Apply symbol name if provided + if name is not None: + self.set_symbol(name) + + + def set_symbol(self, str name): + stf_logical_data_set_symbol(self._ld, name.encode()) + self._symbol = name # Store locally for retrieval + + @property + def symbol(self): + """Get the symbol name of this logical data, if set.""" + return self._symbol + + def __dealloc__(self): + if self._ld != NULL: + stf_logical_data_destroy(self._ld) + self._ld = NULL + + def __repr__(self): + """Return a detailed string representation of the logical_data object.""" + return (f"logical_data(shape={self._shape}, dtype={self._dtype}, " + f"is_token={self._is_token}, symbol={self._symbol!r}, " + f"len={self._len}, ndim={self._ndim})") + + @property + def dtype(self): + """Return the dtype of the logical data.""" + return self._dtype + + @property + def shape(self): + """Return the shape of the logical data.""" + return self._shape + + def read(self, dplace=None): + return dep(self, AccessMode.READ.value, dplace) + + def write(self, dplace=None): + return dep(self, AccessMode.WRITE.value, dplace) + + def rw(self, dplace=None): + return dep(self, AccessMode.RW.value, dplace) + + def empty_like(self): + """ + Create a new logical_data with the same shape (and dtype metadata) + as this object. + """ + if self._ld == NULL: + raise RuntimeError("source logical_data handle is NULL") + + cdef logical_data out = logical_data.__new__(logical_data) + stf_logical_data_empty(self._ctx, self._len, &out._ld) + out._ctx = self._ctx + out._dtype = self._dtype + out._shape = self._shape + out._ndim = self._ndim + out._len = self._len + out._symbol = None # New object has no symbol initially + out._is_token = False + + return out + + @staticmethod + def token(context ctx): + cdef logical_data out = logical_data.__new__(logical_data) + out._ctx = ctx._ctx + out._dtype = None + out._shape = None + out._ndim = 0 + out._len = 0 + out._symbol = None # New object has no symbol initially + out._is_token = True + stf_token(ctx._ctx, &out._ld) + + return out + + @staticmethod + def init_by_shape(context ctx, shape, dtype, str name=None): + """ + Create a new logical_data from a shape and a dtype. + """ + cdef logical_data out = logical_data.__new__(logical_data) + out._ctx = ctx._ctx + out._dtype = np.dtype(dtype) + out._shape = shape + out._ndim = len(shape) + cdef size_t total_items = 1 + for dim in shape: + total_items *= dim + out._len = total_items * out._dtype.itemsize + out._symbol = None + out._is_token = False + stf_logical_data_empty(ctx._ctx, out._len, &out._ld) + + if name is not None: + out.set_symbol(name) + + return out + + def borrow_ctx_handle(self): + ctx = context(borrowed=True) + ctx.borrow_from_handle(self._ctx) + return ctx + +class dep: + __slots__ = ("ld", "mode", "dplace") + def __init__(self, logical_data ld, int mode, dplace=None): + self.ld = ld + self.mode = mode + self.dplace = dplace # can be None or a data place + def __iter__(self): # nice unpacking support + yield self.ld + yield self.mode + yield self.dplace + def __repr__(self): + return f"dep({self.ld!r}, {self.mode}, {self.dplace!r})" + def get_ld(self): + return self.ld + +def read(ld, dplace=None): return dep(ld, AccessMode.READ.value, dplace) +def write(ld, dplace=None): return dep(ld, AccessMode.WRITE.value, dplace) +def rw(ld, dplace=None): return dep(ld, AccessMode.RW.value, dplace) + +cdef class exec_place: + cdef stf_exec_place _c_place + + def __cinit__(self): + # empty default constructor; never directly used + pass + + @staticmethod + def device(int dev_id): + cdef exec_place p = exec_place.__new__(exec_place) + p._c_place = make_device_place(dev_id) + return p + + @staticmethod + def host(): + cdef exec_place p = exec_place.__new__(exec_place) + p._c_place = make_host_place() + return p + + @property + def kind(self) -> str: + return ("device" if self._c_place.kind == STF_EXEC_PLACE_DEVICE + else "host") + + @property + def device_id(self) -> int: + if self._c_place.kind != STF_EXEC_PLACE_DEVICE: + raise AttributeError("not a device execution place") + return self._c_place.u.device.dev_id + +cdef class data_place: + cdef stf_data_place _c_place + + def __cinit__(self): + # empty default constructor; never directly used + pass + + @staticmethod + def device(int dev_id): + cdef data_place p = data_place.__new__(data_place) + p._c_place = make_device_data_place(dev_id) + return p + + @staticmethod + def host(): + cdef data_place p = data_place.__new__(data_place) + p._c_place = make_host_data_place() + return p + + @staticmethod + def managed(): + cdef data_place p = data_place.__new__(data_place) + p._c_place = make_managed_data_place() + return p + + @staticmethod + def affine(): + cdef data_place p = data_place.__new__(data_place) + p._c_place = make_affine_data_place() + return p + + @property + def kind(self) -> str: + cdef stf_data_place_kind k = self._c_place.kind + if k == STF_DATA_PLACE_DEVICE: + return "device" + elif k == STF_DATA_PLACE_HOST: + return "host" + elif k == STF_DATA_PLACE_MANAGED: + return "managed" + elif k == STF_DATA_PLACE_AFFINE: + return "affine" + else: + raise ValueError(f"Unknown data place kind: {k}") + + @property + def device_id(self) -> int: + if self._c_place.kind != STF_DATA_PLACE_DEVICE: + raise AttributeError("not a device data place") + return self._c_place.u.device.dev_id + + + +cdef class task: + cdef stf_task_handle _t + + # list of logical data in deps: we need this because we can't exchange + # dtype/shape easily through the C API of STF + cdef list _lds_args + + def __cinit__(self, context ctx): + stf_task_create(ctx._ctx, &self._t) + self._lds_args = [] + + def __dealloc__(self): + if self._t != NULL: + stf_task_destroy(self._t) + + def start(self): + # This is ignored if this is not a graph task + stf_task_enable_capture(self._t) + + stf_task_start(self._t) + + def end(self): + stf_task_end(self._t) + + def add_dep(self, object d): + """ + Accept a `dep` instance created with read(ld), write(ld), or rw(ld). + """ + if not isinstance(d, dep): + raise TypeError("add_dep expects read(ld), write(ld) or rw(ld)") + + cdef logical_data ldata = d.ld + cdef int mode_int = int(d.mode) + cdef stf_access_mode mode_ce = mode_int + cdef data_place dp + + if d.dplace is None: + stf_task_add_dep(self._t, ldata._ld, mode_ce) + else: + dp = d.dplace + stf_task_add_dep_with_dplace(self._t, ldata._ld, mode_ce, &dp._c_place) + + self._lds_args.append(ldata) + + def set_exec_place(self, object exec_p): + if not isinstance(exec_p, exec_place): + raise TypeError("set_exec_place expects and exec_place argument") + + cdef exec_place ep = exec_p + stf_task_set_exec_place(self._t, &ep._c_place) + + def stream_ptr(self) -> int: + """ + Return the raw CUstream pointer as a Python int + (memory address). Suitable for ctypes or PyCUDA. + """ + cdef CUstream s = stf_task_get_custream(self._t) + return s # cast pointer -> Py int + + def get_arg(self, index) -> int: + if self._lds_args[index]._is_token: + raise RuntimeError("cannot materialize a token argument") + + cdef void *ptr = stf_task_get(self._t, index) + return ptr + + def get_arg_cai(self, index): + """Return the argument as an stf_cai object (has __cuda_array_interface__; supports obj['data'] etc.). + The underlying memory is owned by the task/context; keep the task (or context) alive while using the returned view.""" + ptr = self.get_arg(index) + return stf_cai(ptr, self._lds_args[index].shape, self._lds_args[index].dtype, stream=self.stream_ptr()) + + def args_cai(self): + """ + Return all non-token buffer arguments as stf_cai objects (have __cuda_array_interface__). + Returns None, a single object, or a tuple. Use from non-shipped code (e.g. tests) to + convert to numba/torch/cupy via from_cuda_array_interface or torch.as_tensor(obj). + Keep the task (or context) alive while any consumer uses the returned view(s). + """ + non_token_cais = [self.get_arg_cai(i) for i in range(len(self._lds_args)) + if not self._lds_args[i]._is_token] + + if len(non_token_cais) == 0: + return None + elif len(non_token_cais) == 1: + return non_token_cais[0] + return tuple(non_token_cais) + + # ---- context‑manager helpers ------------------------------- + def __enter__(self): + self.start() + return self + + def __exit__(self, object exc_type, object exc, object tb): + """ + Always called, even if an exception occurred inside the block. + """ + self.end() + return False + +cdef class context: + cdef stf_ctx_handle _ctx + # Is this a context that we have borrowed ? + cdef bint _borrowed + + def __cinit__(self, bint use_graph=False, bint borrowed=False): + self._ctx = NULL + self._borrowed = borrowed + if not borrowed: + if use_graph: + stf_ctx_create_graph(&self._ctx) + else: + stf_ctx_create(&self._ctx) + + cdef borrow_from_handle(self, stf_ctx_handle ctx_handle): + if self._ctx != NULL: + raise RuntimeError("context already initialized") + + if not self._borrowed: + raise RuntimeError("cannot call borrow_from_handle on this context") + + self._ctx = ctx_handle + + def __repr__(self): + return f"context(handle={self._ctx}, borrowed={self._borrowed})" + + def __dealloc__(self): + if not self._borrowed: + self.finalize() + + def finalize(self): + if self._borrowed: + raise RuntimeError("cannot finalize borrowed context") + + if self._ctx != NULL: + stf_ctx_finalize(self._ctx) + self._ctx = NULL + + def logical_data(self, object buf, data_place dplace=None, str name=None): + """ + Create and return a `logical_data` object bound to this context [PRIMARY API]. + + This is the primary function for creating logical data from existing buffers. + It supports both Python buffer protocol objects and CUDA Array Interface objects, + with explicit data_place specification for optimal STF data movement strategies. + + Parameters + ---------- + buf : any buffer‑supporting Python object or __cuda_array_interface__ object + (NumPy array, Warp array, CuPy array, bytes, bytearray, memoryview, …) + dplace : data_place, optional + Specifies where the buffer is located (host, device, managed, affine). + Defaults to data_place.host() for backward compatibility. + Essential for GPU arrays - use data_place.device() for optimal performance. + name : str, optional + Symbol name for debugging and DOT graph output. + + Examples + -------- + >>> # Host memory (explicit - recommended) + >>> host_place = data_place.host() + >>> ld = ctx.logical_data(numpy_array, host_place) + >>> + >>> # GPU device memory (recommended for CUDA arrays) + >>> device_place = data_place.device(0) + >>> ld = ctx.logical_data(warp_array, device_place) + >>> + >>> # With a symbol name for debugging + >>> ld = ctx.logical_data(numpy_array, name="X") + >>> + >>> # Backward compatibility (defaults to host) + >>> ld = ctx.logical_data(numpy_array) # Same as specifying host + + Note + ---- + For GPU arrays (Warp, CuPy, etc.), always specify data_place.device() + for zero-copy performance and correct memory management. + """ + return logical_data(self, buf, dplace, name=name) + + + def logical_data_empty(self, shape, dtype=None, str name=None): + """ + Create logical data with uninitialized values. + + Equivalent to numpy.empty() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. + name : str, optional + Symbol name for debugging and DOT graph output. + + Returns + ------- + logical_data + New logical data with uninitialized values + + Examples + -------- + >>> # Create uninitialized array (fast but contains garbage) + >>> ld = ctx.logical_data_empty((100, 100), dtype=np.float32) + + >>> # Fast allocation without initialization + >>> ld = ctx.logical_data_empty((50, 50, 50), name="tmp") + """ + if dtype is None: + dtype = np.float64 + return logical_data.init_by_shape(self, shape, dtype, name) + + def logical_data_full(self, shape, fill_value, dtype=None, where=None, exec_place=None, str name=None): + """ + Create logical data initialized with a constant value. + + Similar to numpy.full(), this creates a new logical data with the given + shape and fills it with fill_value. + + Parameters + ---------- + shape : tuple + Shape of the array + fill_value : scalar + Value to fill the array with + dtype : numpy.dtype, optional + Data type. If None, infer from fill_value. + where : data_place, optional + Data placement for initialization. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. + Note: exec_place.host() is not yet supported. + name : str, optional + Symbol name for debugging and DOT graph output. + + Returns + ------- + logical_data + New logical data initialized with fill_value + + Examples + -------- + >>> # Create array filled with epsilon0 on current device + >>> ld = ctx.logical_data_full((100, 100), 8.85e-12, dtype=np.float64) + + >>> # Create array on host memory + >>> ld = ctx.logical_data_full((50, 50), 1.0, where=data_place.host()) + + >>> # With a symbol name + >>> ld = ctx.logical_data_full((200, 200), 0.0, name="epsilon") + """ + # Infer dtype from fill_value if not provided + if dtype is None: + dtype = np.array(fill_value).dtype + else: + dtype = np.dtype(dtype) + + # Validate exec_place - host execution not yet supported + if exec_place is not None: + if hasattr(exec_place, 'kind') and exec_place.kind == "host": + raise NotImplementedError( + "exec_place.host() is not yet supported for logical_data_full. " + "Use exec_place.device() or omit exec_place parameter." + ) + + # Create empty logical data + ld = self.logical_data_empty(shape, dtype, name) + + # Initialize with the specified value (cuda.core.Buffer.fill; CuPy/Numba fallback for 8-byte) + try: + from cuda.stf.fill_utils import init_logical_data + init_logical_data(self, ld, fill_value, where, exec_place) + except ImportError as e: + raise RuntimeError("Fill support (cuda.core) is not available for logical_data_full") from e + + return ld + + def logical_data_zeros(self, shape, dtype=None, where=None, exec_place=None, str name=None): + """ + Create logical data filled with zeros. + + Equivalent to numpy.zeros() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. + where : data_place, optional + Data placement. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. + name : str, optional + Symbol name for debugging and DOT graph output. + + Returns + ------- + logical_data + New logical data filled with zeros + + Examples + -------- + >>> # Create zero-filled array + >>> ld = ctx.logical_data_zeros((100, 100), dtype=np.float32) + + >>> # Create on host memory with a name + >>> ld = ctx.logical_data_zeros((50, 50), where=data_place.host(), name="Z") + """ + if dtype is None: + dtype = np.float64 + return self.logical_data_full(shape, 0.0, dtype, where, exec_place, name) + + def logical_data_ones(self, shape, dtype=None, where=None, exec_place=None, str name=None): + """ + Create logical data filled with ones. + + Equivalent to numpy.ones() but for STF logical data. + + Parameters + ---------- + shape : tuple + Shape of the array + dtype : numpy.dtype, optional + Data type. Defaults to np.float64. + where : data_place, optional + Data placement. Defaults to current device. + exec_place : exec_place, optional + Execution place for the fill operation. Defaults to current device. + name : str, optional + Symbol name for debugging and DOT graph output. + + Returns + ------- + logical_data + New logical data filled with ones + + Examples + -------- + >>> # Create ones-filled array + >>> ld = ctx.logical_data_ones((100, 100), dtype=np.float32) + + >>> # Create on specific device with a name + >>> ld = ctx.logical_data_ones((50, 50), name="ones") + """ + if dtype is None: + dtype = np.float64 + return self.logical_data_full(shape, 1.0, dtype, where, exec_place, name) + + def token(self): + return logical_data.token(self) + + def task(self, *args): + """ + Create a `task` + + Example + ------- + >>> t = ctx.task(read(lX), rw(lY)) + >>> t.start() + >>> t.end() + """ + exec_place_set = False + t = task(self) # construct with this context + for d in args: + if isinstance(d, dep): + t.add_dep(d) + elif isinstance(d, exec_place): + if exec_place_set: + raise ValueError("Only one exec_place can be given") + t.set_exec_place(d) + exec_place_set = True + else: + raise TypeError( + "Arguments must be dependency objects or an exec_place" + ) + return t diff --git a/python/cuda_cccl/cuda/stf/fill_utils.py b/python/cuda_cccl/cuda/stf/fill_utils.py new file mode 100644 index 00000000000..9d27f01342b --- /dev/null +++ b/python/cuda_cccl/cuda/stf/fill_utils.py @@ -0,0 +1,83 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Fill / init for STF logical data. Uses cuda.core.Buffer.fill for 1/2/4-byte; +single fallback (CuPy) for 8-byte. +""" + +import numpy as np + +from cuda.core import Buffer, Stream + + +def init_logical_data(ctx, ld, value, data_place=None, exec_place=None): + """ + Initialize a logical data with a constant value. + + Uses cuda.core.Buffer.fill for 1/2/4-byte element types. For 8-byte types + (e.g. float64, int64) uses CuPy if available; otherwise raises. + + Parameters + ---------- + ctx : context + STF context + ld : logical_data + Logical data to initialize + value : scalar + Value to fill the array with + data_place : data_place, optional + Data place for the initialization task + exec_place : exec_place, optional + Execution place for the fill operation + + Raises + ------ + ImportError + If dtype is 8-byte and CuPy is not installed. + """ + dep_arg = ld.write(data_place) if data_place else ld.write() + + task_args = [] + if exec_place is not None: + task_args.append(exec_place) + task_args.append(dep_arg) + + with ctx.task(*task_args) as t: + ld_index = 1 if exec_place is not None else 0 + cai = t.get_arg_cai(ld_index) + ptr = cai["data"][0] + shape = tuple(cai["shape"]) + dtype = np.dtype(cai["typestr"]) + size = int(np.prod(shape)) * dtype.itemsize + + core_stream = Stream.from_handle(t.stream_ptr()) + buf = Buffer.from_handle(ptr, size, owner=None) + + if dtype.itemsize in (1, 2, 4): + if value == 0 or value == 0.0: + fill_val = 0 + else: + fill_val = np.array([value], dtype=dtype).tobytes() + buf.fill(fill_val, stream=core_stream) + else: + # 8-byte: single fallback via CuPy + _fill_8byte_cupy(shape, dtype, value, ptr, size, t.stream_ptr()) + + +def _fill_8byte_cupy(shape, dtype, value, ptr, size, stream_ptr): + """Fill 8-byte buffer using CuPy. Raises ImportError if CuPy not available.""" + try: + import cupy as cp + except ImportError: + raise ImportError( + "Fill for 8-byte dtypes (e.g. float64) requires CuPy. " + "Install CuPy or use a 1/2/4-byte dtype (e.g. np.float32)." + ) from None + + mem = cp.cuda.UnownedMemory(ptr, size, owner=None) + memptr = cp.cuda.MemoryPointer(mem, 0) + arr = cp.ndarray(shape, dtype=dtype, memptr=memptr) + with cp.cuda.ExternalStream(stream_ptr): + arr.fill(value) diff --git a/python/cuda_cccl/merge_cuda_wheels.py b/python/cuda_cccl/merge_cuda_wheels.py index 71ead1ebbb9..0246d83ac16 100755 --- a/python/cuda_cccl/merge_cuda_wheels.py +++ b/python/cuda_cccl/merge_cuda_wheels.py @@ -5,14 +5,13 @@ This script takes wheels built for different CUDA versions (cu12, cu13) and merges them into a single wheel that supports both CUDA versions. -In particular, each wheel contains a CUDA-specific build of the `cccl.c.parallel` library -and the associated bindings. These are present in the directory `compute/cu`. -For example, for a wheel built with CUDA 12, the directory is `compute/cu12`, -and for a wheel built with CUDA 13, the directory is `compute/cu13`. -This script merges these directories into a single wheel that supports both CUDA versions, i.e., -containing both `compute/cu12` and `compute/cu13`. -At runtime, a shim module `compute/_bindings.py` is used to import the appropriate -CUDA-specific bindings. See `compute/_bindings.py` for more details. +Each wheel contains CUDA-specific builds in versioned directories: +- `cuda/compute/cu` — cccl.c.parallel and compute bindings +- `cuda/stf/cu` — cccl.c.experimental.stf and STF bindings + +This script merges these directories so the final wheel contains both cu12 and cu13 +for compute and for stf. At runtime, shim modules choose the right extension from +the detected CUDA version. """ import argparse @@ -100,18 +99,19 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: # Use the first wheel as the base and merge binaries from others base_wheel = extracted_wheels[0] - # now copy the version-specific directory from other wheels - # into the appropriate place in the base wheel + # Copy version-specific directories (compute and stf) from other wheels into base for i, wheel_dir in enumerate(extracted_wheels): - cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] if i == 0: - # For base wheel, do nothing continue - else: - version_dir = Path("cuda") / "compute" / f"cu{cuda_version}" - # Copy from other wheels + cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] + for subpackage in ("compute", "stf"): + version_dir = Path("cuda") / subpackage / f"cu{cuda_version}" + src = wheel_dir / version_dir + if not src.is_dir(): + continue + dst = base_wheel / version_dir print(f" Copying {version_dir} to {base_wheel}") - shutil.copytree(wheel_dir / version_dir, base_wheel / version_dir) + shutil.copytree(src, dst) # Repack the merged wheel output_dir.mkdir(parents=True, exist_ok=True) diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml index b5fad006ba9..a85b3eae170 100644 --- a/python/cuda_cccl/pyproject.toml +++ b/python/cuda_cccl/pyproject.toml @@ -129,6 +129,7 @@ known-first-party = [ "cuda.cccl.headers", "cuda.compute", "cuda.coop", + "cuda.stf", ] [tool.pytest.ini_options] diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py new file mode 100755 index 00000000000..b89043ed577 --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_cholesky.py @@ -0,0 +1,711 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Python implementation of Cholesky decomposition using CUDA STF and CuPy (CUBLAS/CUSOLVER). + +This example demonstrates: +- Tiled matrix operations with STF logical data +- Integration of CuPy's CUBLAS and CUSOLVER functions with STF tasks +- Multi-device execution with automatic data placement +- Task-based parallelism for linear algebra operations + +Note: CUDASTF automatically manages device context within tasks via exec_place.device(). +There's no need to manually set the current device in task bodies - just use the STF stream. +""" + +import sys + +import cupy as cp +import numpy as np +from cupyx.scipy import linalg as cp_linalg + +import cuda.stf as stf + + +class CAIWrapper: + """Wrapper to expose CUDA Array Interface dict as a proper CAI object.""" + + def __init__(self, cai_dict): + self.__cuda_array_interface__ = cai_dict + + +def get_cupy_arrays(task): + """ + Get all CuPy arrays from STF task arguments. + + Usage: + d_a, d_b, d_c = get_cupy_arrays(t) + """ + arrays = [] + idx = 0 + while True: + try: + arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) + idx += 1 + except Exception: + break + return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None + + +def cai_to_numpy(cai_dict): + """Convert CUDA Array Interface dict to NumPy array (for host memory).""" + import ctypes + + # Extract CAI fields + data_ptr, readonly = cai_dict["data"] + shape = cai_dict["shape"] + typestr = cai_dict["typestr"] + + # Convert typestr to NumPy dtype + dtype = np.dtype(typestr) + + # Calculate total size in bytes + itemsize = dtype.itemsize + size = np.prod(shape) * itemsize + + # Create ctypes buffer from pointer + buffer = (ctypes.c_byte * size).from_address(data_ptr) + + # Create NumPy array from buffer + arr = np.frombuffer(buffer, dtype=dtype).reshape(shape) + + return arr + + +class BlockRef: + """Reference to a specific block in a tiled matrix.""" + + def __init__(self, matrix, row, col): + self.matrix = matrix + self.row = row + self.col = col + self._handle = matrix.handle(row, col) + self._devid = matrix.get_preferred_devid(row, col) + + def handle(self): + """Get the STF logical data handle for this block.""" + return self._handle + + def devid(self): + """Get the preferred device ID for this block.""" + return self._devid + + def __repr__(self): + return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])" + + +class TiledMatrix: + """ + Tiled matrix class that splits a matrix into blocks for parallel processing. + Each block is managed as an STF logical data object. + """ + + def __init__( + self, + ctx, + nrows, + ncols, + block_rows, + block_cols, + is_symmetric=False, + symbol="matrix", + dtype=np.float64, + ): + """ + Initialize a tiled matrix. + + Args: + ctx: STF context + nrows: Total number of rows + ncols: Total number of columns + block_rows: Block size (rows) + block_cols: Block size (columns) + is_symmetric: If True, only stores lower triangular blocks + symbol: Name/symbol for the matrix + dtype: Data type (default: np.float64) + """ + self.ctx = ctx + self.symbol = symbol + self.dtype = dtype + + self.m = nrows + self.n = ncols + self.mb = block_rows + self.nb = block_cols + self.sym_matrix = is_symmetric + + assert self.m % self.mb == 0, ( + f"nrows ({self.m}) must be divisible by block_rows ({self.mb})" + ) + assert self.n % self.nb == 0, ( + f"ncols ({self.n}) must be divisible by block_cols ({self.nb})" + ) + + # Number of blocks + self.mt = self.m // self.mb + self.nt = self.n // self.nb + + # Allocate host memory (pinned for faster transfers) + self.h_array = cp.cuda.alloc_pinned_memory( + self.m * self.n * np.dtype(dtype).itemsize + ) + self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape( + self.m, self.n + ) + + # Create logical data handles for each block + self.handles = {} + + # Get available devices for mapping + self.ndevs = cp.cuda.runtime.getDeviceCount() + self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) + + print( + f"[{symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}" + ) + print( + f"[{symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid" + ) + + # Note: We DON'T create logical data here yet - that happens in fill() + # after the host data is initialized + + def _compute_device_grid(self, ndevs): + """Compute 2D device grid dimensions (as close to square as possible)""" + grid_p = 1 + grid_q = ndevs + for a in range(1, int(np.sqrt(ndevs)) + 1): + if ndevs % a == 0: + grid_p = a + grid_q = ndevs // a + return grid_p, grid_q + + def get_preferred_devid(self, row, col): + """Get preferred device ID for a given block using cyclic distribution""" + return (row % self.grid_p) + (col % self.grid_q) * self.grid_p + + def handle(self, row, col): + """Get the logical data handle for block (row, col)""" + return self.handles[(row, col)] + + def block(self, row, col): + """Get a BlockRef for block (row, col)""" + return BlockRef(self, row, col) + + def _get_index(self, row, col): + """Convert (row, col) to linear index in tiled storage""" + # Find which tile contains this element + tile_row = row // self.mb + tile_col = col // self.nb + + tile_size = self.mb * self.nb + + # Index of the beginning of the tile + tile_start = (tile_row + self.mt * tile_col) * tile_size + + # Offset within the tile + offset = (row % self.mb) + (col % self.nb) * self.mb + + return tile_start + offset + + def _get_block_h(self, brow, bcol): + """Get a view of the host data for block (brow, bcol)""" + # For tiled storage, blocks are stored contiguously + start_idx = (brow + self.mt * bcol) * self.mb * self.nb + end_idx = start_idx + self.mb * self.nb + flat_view = self.h_array_np.ravel() + return flat_view[start_idx:end_idx].reshape(self.mb, self.nb) + + def fill(self, func): + """Fill matrix on host, then create STF logical data that will transfer automatically""" + print(f"[{self.symbol}] Filling matrix on host...") + + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + # Fill host block + h_block = self._get_block_h(rowb, colb) + for lrow in range(self.mb): + for lcol in range(self.nb): + row = lrow + rowb * self.mb + col = lcol + colb * self.nb + h_block[lrow, lcol] = func(row, col) + + handle = self.ctx.logical_data( + h_block, name=f"{self.symbol}_{rowb}_{colb}" + ) + self.handles[(rowb, colb)] = handle + + +# BLAS/LAPACK operations wrapped in STF tasks + + +def DPOTRF(ctx, a): + """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + d_block[:] = cp.linalg.cholesky(d_block) + + +def DTRSM(ctx, a, b, side="L", uplo="L", transa="T", diag="N", alpha=1.0): + """Triangular solve: B = alpha * op(A)^{-1} @ B or B = alpha * B @ op(A)^{-1}""" + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + if side == "L": + if transa == "N": + d_b[:] = cp_linalg.solve_triangular(d_a, d_b, lower=(uplo == "L")) + else: + d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b, lower=(uplo != "L")) + if alpha != 1.0: + d_b *= alpha + else: + if transa == "N": + d_b[:] = cp_linalg.solve_triangular( + d_a.T, d_b.T, lower=(uplo != "L") + ).T + else: + d_b[:] = cp_linalg.solve_triangular( + d_a, d_b.T, lower=(uplo == "L") + ).T + if alpha != 1.0: + d_b *= alpha + + +def DGEMM(ctx, a, b, c, transa="N", transb="N", alpha=1.0, beta=1.0): + """Matrix multiplication: C = alpha * op(A) @ op(B) + beta * C""" + with ctx.task( + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), + ) as t: + d_a, d_b, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if transa != "N" else d_a + op_b = d_b.T if transb != "N" else d_b + + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_b) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_b) + else: + d_c[:] = alpha * (op_a @ op_b) + beta * d_c + + +def DSYRK(ctx, a, c, uplo="L", trans="N", alpha=1.0, beta=1.0): + """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" + with ctx.task( + stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw() + ) as t: + d_a, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if trans != "N" else d_a + + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_a.T) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_a.T) + else: + d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c + + +# High-level algorithms + + +def PDPOTRF(ctx, A): + """Parallel tiled Cholesky factorization (blocked algorithm)""" + print("\n[PDPOTRF] Starting Cholesky factorization...") + + assert A.m == A.n, "Matrix must be square" + assert A.mt == A.nt, "Block grid must be square" + assert A.sym_matrix, "Matrix must be symmetric" + + nblocks = A.mt + + for k in range(nblocks): + # Factor diagonal block + DPOTRF(ctx, A.block(k, k)) + + # Solve triangular systems for blocks in column k + for row in range(k + 1, nblocks): + DTRSM( + ctx, + A.block(k, k), + A.block(row, k), + side="R", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + + # Update trailing matrix + for col in range(k + 1, row): + DGEMM( + ctx, + A.block(row, k), + A.block(col, k), + A.block(row, col), + transa="N", + transb="T", + alpha=-1.0, + beta=1.0, + ) + + # Symmetric rank-k update of diagonal block + DSYRK( + ctx, + A.block(row, k), + A.block(row, row), + uplo="L", + trans="N", + alpha=-1.0, + beta=1.0, + ) + + print("[PDPOTRF] Completed") + + +def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0): + """Parallel tiled triangular solve""" + print("\n[PDTRSM] Starting triangular solve...") + + if side == "L": + if uplo == "L": + if trans == "N": + # Forward substitution + for k in range(B.mt): + lalpha = alpha if k == 0 else 1.0 + for n in range(B.nt): + DTRSM( + ctx, + A.block(k, k), + B.block(k, n), + side="L", + uplo="L", + transa="N", + diag=diag, + alpha=lalpha, + ) + for m in range(k + 1, B.mt): + for n in range(B.nt): + DGEMM( + ctx, + A.block(m, k), + B.block(k, n), + B.block(m, n), + transa="N", + transb="N", + alpha=-1.0, + beta=lalpha, + ) + else: # trans == 'T' or 'C' + # Backward substitution + for k in range(B.mt): + lalpha = alpha if k == 0 else 1.0 + row_idx = B.mt - k - 1 + for n in range(B.nt): + DTRSM( + ctx, + A.block(row_idx, row_idx), + B.block(row_idx, n), + side="L", + uplo="L", + transa="T", + diag=diag, + alpha=lalpha, + ) + for m in range(k + 1, B.mt): + m_idx = B.mt - 1 - m + for n in range(B.nt): + DGEMM( + ctx, + A.block(row_idx, m_idx), + B.block(row_idx, n), + B.block(m_idx, n), + transa="T", + transb="N", + alpha=-1.0, + beta=lalpha, + ) + + print("[PDTRSM] Completed") + + +def PDPOTRS(ctx, A, B, uplo="L"): + """Solve A @ X = B where A is factored by Cholesky (A = L @ L.T)""" + print("\n[PDPOTRS] Solving linear system...") + + # First solve: L @ Y = B + PDTRSM( + ctx, + A, + B, + side="L", + uplo=uplo, + trans="N" if uplo == "L" else "T", + diag="N", + alpha=1.0, + ) + + # Second solve: L.T @ X = Y + PDTRSM( + ctx, + A, + B, + side="L", + uplo=uplo, + trans="T" if uplo == "L" else "N", + diag="N", + alpha=1.0, + ) + + print("[PDPOTRS] Completed") + + +def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): + """Parallel tiled matrix multiplication""" + print("\n[PDGEMM] Starting matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + inner_k = A.nt if transa == "N" else A.mt + + if alpha == 0.0 or inner_k == 0: + # Just scale C + DGEMM( + ctx, + A.block(0, 0), + B.block(0, 0), + C.block(m, n), + transa=transa, + transb=transb, + alpha=0.0, + beta=beta, + ) + elif transa == "N": + if transb == "N": + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM( + ctx, + A.block(m, k), + B.block(k, n), + C.block(m, n), + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) + else: + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM( + ctx, + A.block(m, k), + B.block(n, k), + C.block(m, n), + transa="N", + transb="T", + alpha=alpha, + beta=zbeta, + ) + else: + if transb == "N": + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM( + ctx, + A.block(k, m), + B.block(k, n), + C.block(m, n), + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) + else: + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM( + ctx, + A.block(k, m), + B.block(n, k), + C.block(m, n), + transa="T", + transb="T", + alpha=alpha, + beta=zbeta, + ) + + print("[PDGEMM] Completed") + + +def compute_norm(ctx, matrix): + """Compute Frobenius norm of matrix using host tasks""" + norm_sq = 0.0 + + for colb in range(matrix.nt): + low_rowb = colb if matrix.sym_matrix else 0 + for rowb in range(low_rowb, matrix.mt): + handle = matrix.handle(rowb, colb) + + # Host task to read the block and compute norm + def compute_block_norm(h_block): + nonlocal norm_sq + norm_sq += np.sum(h_block * h_block) + + with ctx.task(stf.exec_place.host(), handle.read()) as t: + # Synchronize the stream before reading data + cp.cuda.runtime.streamSynchronize(t.stream_ptr()) + + h_block = cai_to_numpy(t.get_arg_cai(0)) + compute_block_norm(h_block) + + return np.sqrt(norm_sq) + + +def main(): + import argparse + + parser = argparse.ArgumentParser( + description="Tiled Cholesky decomposition with CUDA STF" + ) + parser.add_argument( + "N", type=int, nargs="?", default=1024, help="Matrix size (default: 1024)" + ) + parser.add_argument( + "NB", type=int, nargs="?", default=128, help="Block size (default: 128)" + ) + parser.add_argument("--check", action="store_true", help="Check result (slower)") + args = parser.parse_args() + + N = args.N + NB = args.NB + check_result = args.check + + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" + + print("=" * 60) + print("Tiled Cholesky Decomposition with CUDA STF + CuPy") + print("=" * 60) + print(f"Matrix size: {N}x{N}") + print(f"Block size: {NB}x{NB}") + print(f"Number of blocks: {N // NB}x{N // NB}") + print(f"Check result: {check_result}") + print("=" * 60) + + # Create STF context + ctx = stf.context() + + # Create matrices + A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") + + if check_result: + Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") + + # Fill with Hilbert matrix + diagonal dominance + # H_{i,j} = 1/(i+j+1) + 2*N if i==j + def hilbert(row, col): + return 1.0 / (row + col + 1.0) + (2.0 * N if row == col else 0.0) + + print("\n" + "=" * 60) + print("Initializing matrices...") + print("=" * 60) + + A.fill(hilbert) + if check_result: + Aref.fill(hilbert) + + # Create right-hand side + if check_result: + B = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B") + Bref = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref") + + def rhs_vals(row, col): + return 1.0 * (row + 1) + + B.fill(rhs_vals) + Bref.fill(rhs_vals) + + # Compute ||B|| for residual calculation + Bref_norm = compute_norm(ctx, Bref) + + # Synchronize before timing + cp.cuda.runtime.deviceSynchronize() + + # Record start time + start_event = cp.cuda.Event() + stop_event = cp.cuda.Event() + start_event.record() + + # Perform Cholesky factorization + print("\n" + "=" * 60) + print("Performing Cholesky factorization...") + print("=" * 60) + PDPOTRF(ctx, A) + + # Record stop time + stop_event.record() + + # Solve system if checking + if check_result: + print("\n" + "=" * 60) + print("Solving linear system...") + print("=" * 60) + PDPOTRS(ctx, A, B, uplo="L") + + print("\n" + "=" * 60) + print("Computing residual...") + print("=" * 60) + # Compute residual: Bref = Aref @ B - Bref + PDGEMM(ctx, Aref, B, Bref, transa="N", transb="N", alpha=1.0, beta=-1.0) + + # Compute ||residual|| + res_norm = compute_norm(ctx, Bref) + + # Finalize STF context + print("\n" + "=" * 60) + print("Finalizing STF context...") + print("=" * 60) + ctx.finalize() + + # Wait for completion + stop_event.synchronize() + + # Compute timing + elapsed_ms = cp.cuda.get_elapsed_time(start_event, stop_event) + gflops = (1.0 / 3.0 * N * N * N) / 1e9 + gflops_per_sec = gflops / (elapsed_ms / 1000.0) + + print("\n" + "=" * 60) + print("Results") + print("=" * 60) + print(f"[PDPOTRF] Elapsed time: {elapsed_ms:.2f} ms") + print(f"[PDPOTRF] Performance: {gflops_per_sec:.2f} GFLOPS") + + if check_result: + residual = res_norm / Bref_norm + print(f"\n[POTRS] ||AX - B||: {res_norm:.6e}") + print(f"[POTRS] ||B||: {Bref_norm:.6e}") + print(f"[POTRS] Residual (||AX - B||/||B||): {residual:.6e}") + + if residual >= 0.01: + print("\n❌ Algorithm did not converge (residual >= 0.01)") + return 1 + else: + print("\n✅ Algorithm converged successfully!") + + print("=" * 60) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py new file mode 100644 index 00000000000..7a73351df11 --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py @@ -0,0 +1,405 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +########################################################################### +# Example Fluid +# +# Shows how to implement a simple 2D Stable Fluids solver using +# multidimensional arrays and launches. +# +########################################################################### + +import math + +import warp as wp +import warp.render + +import cuda.stf as stf + + +# Add a stf-specific decorator to the wp. namespace +def stf_kernel(pyfunc): + # let warp decorate normally + kernel = wp.kernel(pyfunc) + + # attach an STF-aware call operator + def _stf_call(*args, dim=None, stream=None, **kwargs): + return wp.stf.launch(kernel, dim=dim, inputs=args, stream=stream, **kwargs) + + # monkey-patch a method onto the kernel object + kernel.stf = _stf_call + + return kernel + + +def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs): + # just forward to warp for now + return wp.launch( + kernel, + dim=dim, + inputs=inputs, + stream=stream, + **kwargs, + ) + + +# put it under wp.stf +if not hasattr(wp, "stf"): + + class _stf: + pass + + wp.stf = _stf() + + +wp.stf.kernel = stf_kernel +wp.stf.launch = stf_launch + +grid_width = wp.constant(256) +grid_height = wp.constant(128) + + +@wp.func +def lookup_float(f: wp.array2d(dtype=float), x: int, y: int): + x = wp.clamp(x, 0, grid_width - 1) + y = wp.clamp(y, 0, grid_height - 1) + + return f[x, y] + + +@wp.func +def sample_float(f: wp.array2d(dtype=float), x: float, y: float): + lx = int(wp.floor(x)) + ly = int(wp.floor(y)) + + tx = x - float(lx) + ty = y - float(ly) + + s0 = wp.lerp(lookup_float(f, lx, ly), lookup_float(f, lx + 1, ly), tx) + s1 = wp.lerp(lookup_float(f, lx, ly + 1), lookup_float(f, lx + 1, ly + 1), tx) + + s = wp.lerp(s0, s1, ty) + return s + + +@wp.func +def lookup_vel(f: wp.array2d(dtype=wp.vec2), x: int, y: int): + if x < 0 or x >= grid_width: + return wp.vec2() + if y < 0 or y >= grid_height: + return wp.vec2() + + return f[x, y] + + +@wp.func +def sample_vel(f: wp.array2d(dtype=wp.vec2), x: float, y: float): + lx = int(wp.floor(x)) + ly = int(wp.floor(y)) + + tx = x - float(lx) + ty = y - float(ly) + + s0 = wp.lerp(lookup_vel(f, lx, ly), lookup_vel(f, lx + 1, ly), tx) + s1 = wp.lerp(lookup_vel(f, lx, ly + 1), lookup_vel(f, lx + 1, ly + 1), tx) + + s = wp.lerp(s0, s1, ty) + return s + + +@wp.stf.kernel +def advect( + u0: wp.array2d(dtype=wp.vec2), + u1: wp.array2d(dtype=wp.vec2), + rho0: wp.array2d(dtype=float), + rho1: wp.array2d(dtype=float), + dt: float, +): + i, j = wp.tid() + + u = u0[i, j] + + # trace backward + p = wp.vec2(float(i), float(j)) + p = p - u * dt + + # advect + u1[i, j] = sample_vel(u0, p[0], p[1]) + rho1[i, j] = sample_float(rho0, p[0], p[1]) + + +@wp.stf.kernel +def divergence(u: wp.array2d(dtype=wp.vec2), div: wp.array2d(dtype=float)): + i, j = wp.tid() + + if i == grid_width - 1: + return + if j == grid_height - 1: + return + + dx = (u[i + 1, j][0] - u[i, j][0]) * 0.5 + dy = (u[i, j + 1][1] - u[i, j][1]) * 0.5 + + div[i, j] = dx + dy + + +@wp.stf.kernel +def pressure_solve( + p0: wp.array2d(dtype=float), + p1: wp.array2d(dtype=float), + div: wp.array2d(dtype=float), +): + i, j = wp.tid() + + s1 = lookup_float(p0, i - 1, j) + s2 = lookup_float(p0, i + 1, j) + s3 = lookup_float(p0, i, j - 1) + s4 = lookup_float(p0, i, j + 1) + + # Jacobi update + err = s1 + s2 + s3 + s4 - div[i, j] + + p1[i, j] = err * 0.25 + + +@wp.stf.kernel +def pressure_apply(p: wp.array2d(dtype=float), u: wp.array2d(dtype=wp.vec2)): + i, j = wp.tid() + + if i == 0 or i == grid_width - 1: + return + if j == 0 or j == grid_height - 1: + return + + # pressure gradient + f_p = wp.vec2(p[i + 1, j] - p[i - 1, j], p[i, j + 1] - p[i, j - 1]) * 0.5 + + u[i, j] = u[i, j] - f_p + + +@wp.stf.kernel +def integrate(u: wp.array2d(dtype=wp.vec2), rho: wp.array2d(dtype=float), dt: float): + i, j = wp.tid() + + # gravity + f_g = wp.vec2(-90.8, 0.0) * rho[i, j] + + # integrate + u[i, j] = u[i, j] + dt * f_g + + # fade + rho[i, j] = rho[i, j] * (1.0 - 0.1 * dt) + + +@wp.stf.kernel +def init( + rho: wp.array2d(dtype=float), + u: wp.array2d(dtype=wp.vec2), + radius: int, + dir: wp.vec2, +): + i, j = wp.tid() + + d = wp.length(wp.vec2(float(i - grid_width / 2), float(j - grid_height / 2))) + + if d < radius: + rho[i, j] = 1.0 + u[i, j] = dir + + +class Example: + def __init__(self): + fps = 60 + self.frame_dt = 1.0 / fps + self.sim_substeps = 2 + self.iterations = 100 # Number of pressure iterations + self.sim_dt = self.frame_dt / self.sim_substeps + self.sim_time = 0.0 + + self._stf_ctx = stf.context() + + shape = (grid_width, grid_height) + + self.u0 = wp.zeros(shape, dtype=wp.vec2) + self.u1 = wp.zeros(shape, dtype=wp.vec2) + + self.rho0 = wp.zeros(shape, dtype=float) + self.rho1 = wp.zeros(shape, dtype=float) + + self.p0 = wp.zeros(shape, dtype=float) + self.p1 = wp.zeros(shape, dtype=float) + self.div = wp.zeros(shape, dtype=float) + + # Create STF logical data from Warp arrays with explicit data place + # Warp arrays are on GPU device memory, so specify data_place.device() + + # For regular float arrays, specify device data place + device_place = stf.data_place.device(0) + + self.rho0._stf_ld = self._stf_ctx.logical_data( + self.rho0, device_place, name="density_current" + ) + self.rho1._stf_ld = self._stf_ctx.logical_data( + self.rho1, device_place, name="density_next" + ) + self.p0._stf_ld = self._stf_ctx.logical_data( + self.p0, device_place, name="pressure_current" + ) + self.p1._stf_ld = self._stf_ctx.logical_data( + self.p1, device_place, name="pressure_next" + ) + self.div._stf_ld = self._stf_ctx.logical_data( + self.div, device_place, name="velocity_divergence" + ) + + # vec2 arrays - STF now automatically handles vector type flattening + # Store STF logical data consistently with other arrays + self.u0._stf_ld = self._stf_ctx.logical_data( + self.u0, device_place, name="velocity_current" + ) + self.u1._stf_ld = self._stf_ctx.logical_data( + self.u1, device_place, name="velocity_next" + ) + + # Set Warp array names (for Warp tracing) + self.u0._name = "u0" + self.u1._name = "u1" + self.rho0._name = "rho0" + self.rho1._name = "rho1" + self.p0._name = "p0" + self.p1._name = "p1" + self.div._name = "div" + + # capture pressure solve as a CUDA graph + self.use_cuda_graph = wp.get_device().is_cuda + if self.use_cuda_graph: + with wp.ScopedCapture() as capture: + self.pressure_iterations() + self.graph = capture.graph + + def step(self): + with wp.ScopedTimer("step"): + for _ in range(self.sim_substeps): + shape = (grid_width, grid_height) + dt = self.sim_dt + + speed = 400.0 + angle = math.sin(self.sim_time * 4.0) * 1.5 + vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed) + + # update emitters + wp.stf.launch(init, dim=shape, inputs=[self.rho0, self.u0, 5, vel]) + + # force integrate + wp.stf.launch(integrate, dim=shape, inputs=[self.u0, self.rho0, dt]) + wp.stf.launch(divergence, dim=shape, inputs=[self.u0, self.div]) + + # pressure solve + self.p0.zero_() + self.p1.zero_() + + # TODO experiment with explicit capture at Warp level + # if self.use_cuda_graph: + # wp.capture_launch(self.graph) + # else: + # self.pressure_iterations() + self.pressure_iterations() + + # velocity update + wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0]) + + # semi-Lagrangian advection + wp.stf.launch( + advect, + dim=shape, + inputs=[self.u0, self.u1, self.rho0, self.rho1, dt], + ) + + # swap buffers + (self.u0, self.u1) = (self.u1, self.u0) + (self.rho0, self.rho1) = (self.rho1, self.rho0) + + self.sim_time += dt + + def pressure_iterations(self): + for _ in range(self.iterations): + wp.stf.launch( + pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div] + ) + + # swap pressure fields + (self.p0, self.p1) = (self.p1, self.p0) + + def step_and_render_frame(self, frame_num=None, img=None): + self.step() + + with wp.ScopedTimer("render"): + if img: + img.set_array(self.rho0.numpy()) + + return (img,) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser( + formatter_class=argparse.ArgumentDefaultsHelpFormatter + ) + parser.add_argument( + "--device", type=str, default=None, help="Override the default Warp device." + ) + parser.add_argument( + "--num_frames", type=int, default=100000, help="Total number of frames." + ) + parser.add_argument( + "--headless", + action="store_true", + help="Run in headless mode, suppressing the opening of any graphical windows.", + ) + + args = parser.parse_known_args()[0] + + with wp.ScopedDevice(args.device): + example = Example() + + if args.headless: + for _ in range(args.num_frames): + example.step() + else: + import matplotlib + import matplotlib.animation as anim + import matplotlib.pyplot as plt + + fig = plt.figure() + + img = plt.imshow( + example.rho0.numpy(), + origin="lower", + animated=True, + interpolation="antialiased", + ) + img.set_norm(matplotlib.colors.Normalize(0.0, 1.0)) + seq = anim.FuncAnimation( + fig, + example.step_and_render_frame, + fargs=(img,), + frames=args.num_frames, + blit=True, + interval=8, + repeat=False, + ) + + plt.show() diff --git a/python/cuda_cccl/tests/stf/example_potri.py b/python/cuda_cccl/tests/stf/example_potri.py new file mode 100644 index 00000000000..ed2e8c1e91b --- /dev/null +++ b/python/cuda_cccl/tests/stf/example_potri.py @@ -0,0 +1,836 @@ +#!/usr/bin/env python3 +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Python implementation of POTRI (matrix inversion via Cholesky) using CUDA STF and CuPy. + +POTRI computes the inverse of a symmetric positive definite matrix using its Cholesky factorization: +1. Cholesky factorization: A = L*L^T +2. Triangular inversion: L^(-1) +3. Compute A^(-1) = L^(-T) * L^(-1) + +This example demonstrates: +- Tiled matrix operations with STF logical data +- Integration of CuPy's CUBLAS and CUSOLVER functions with STF tasks +- Multi-device execution with automatic data placement +- Task-based parallelism for linear algebra operations +""" + +import sys + +import cupy as cp +import numpy as np +from cupyx.scipy import linalg as cp_linalg + +import cuda.stf as stf + + +class CAIWrapper: + """Wrapper to expose CUDA Array Interface dict as a proper CAI object.""" + + def __init__(self, cai_dict): + self.__cuda_array_interface__ = cai_dict + + +def get_cupy_arrays(task): + """ + Get all CuPy arrays from STF task arguments. + + Usage: + d_a, d_b, d_c = get_cupy_arrays(t) + """ + arrays = [] + idx = 0 + while True: + try: + arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx)))) + idx += 1 + except Exception: + break + return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None + + +def cai_to_numpy(cai_dict): + """Convert CUDA Array Interface dict to NumPy array (for host memory).""" + import ctypes + + # Extract CAI fields + data_ptr, readonly = cai_dict["data"] + shape = cai_dict["shape"] + typestr = cai_dict["typestr"] + + # Convert typestr to NumPy dtype + dtype = np.dtype(typestr) + + # Calculate total size in bytes + itemsize = dtype.itemsize + size = np.prod(shape) * itemsize + + # Create ctypes buffer from pointer + buffer = (ctypes.c_byte * size).from_address(data_ptr) + + # Create NumPy array from buffer + arr = np.frombuffer(buffer, dtype=dtype).reshape(shape) + + return arr + + +class BlockRef: + """Reference to a specific block in a tiled matrix.""" + + def __init__(self, matrix, row, col): + self.matrix = matrix + self.row = row + self.col = col + self._handle = matrix.handle(row, col) + self._devid = matrix.get_preferred_devid(row, col) + + def handle(self): + """Get the STF logical data handle for this block.""" + return self._handle + + def devid(self): + """Get the preferred device ID for this block.""" + return self._devid + + def __repr__(self): + return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])" + + +class TiledMatrix: + """ + Tiled matrix class that splits a matrix into blocks for parallel processing. + Each block is managed as an STF logical data object. + Uses tiled storage format for contiguous blocks. + """ + + def __init__( + self, + ctx, + nrows, + ncols, + blocksize_rows, + blocksize_cols, + is_symmetric=False, + symbol="matrix", + dtype=np.float64, + ): + self.ctx = ctx + self.symbol = symbol + self.dtype = dtype + self.sym_matrix = is_symmetric + + self.m = nrows + self.n = ncols + self.mb = blocksize_rows + self.nb = blocksize_cols + + assert self.m % self.mb == 0, ( + f"nrows {nrows} must be divisible by blocksize_rows {blocksize_rows}" + ) + assert self.n % self.nb == 0, ( + f"ncols {ncols} must be divisible by blocksize_cols {blocksize_cols}" + ) + + # Number of blocks + self.mt = self.m // self.mb + self.nt = self.n // self.nb + + # Allocate pinned host memory for faster transfers (in tiled format) + self.h_array = cp.cuda.alloc_pinned_memory( + self.m * self.n * np.dtype(dtype).itemsize + ) + self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape( + self.m, self.n + ) + + # Dictionary to store logical data handles for each block + self.handles = {} + + # Determine device layout + self.ndevs = cp.cuda.runtime.getDeviceCount() + self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs) + + print( + f"[{self.symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}" + ) + print( + f"[{self.symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid" + ) + + def _compute_device_grid(self, ndevs): + """Compute 2D device grid dimensions (as close to square as possible)""" + grid_p = 1 + grid_q = ndevs + for a in range(1, int(np.sqrt(ndevs)) + 1): + if ndevs % a == 0: + grid_p = a + grid_q = ndevs // a + return grid_p, grid_q + + def get_preferred_devid(self, row, col): + """Get preferred device ID for a given block using cyclic distribution""" + return (row % self.grid_p) + (col % self.grid_q) * self.grid_p + + def handle(self, row, col): + """Get the logical data handle for a block.""" + return self.handles[(row, col)] + + def block(self, row, col): + """Get a BlockRef for block (row, col)""" + return BlockRef(self, row, col) + + def _get_index(self, row, col): + """Convert (row, col) to linear index in tiled storage""" + tile_row = row // self.mb + tile_col = col // self.nb + tile_size = self.mb * self.nb + tile_start = (tile_row + self.mt * tile_col) * tile_size + offset = (row % self.mb) + (col % self.nb) * self.mb + return tile_start + offset + + def _get_block_h(self, brow, bcol): + """Get a view of the host data for block (brow, bcol)""" + # For tiled storage, blocks are stored contiguously + start_idx = (brow + self.mt * bcol) * self.mb * self.nb + end_idx = start_idx + self.mb * self.nb + flat_view = self.h_array_np.ravel() + return flat_view[start_idx:end_idx].reshape(self.mb, self.nb) + + def fill(self, func): + """ + Fill the matrix blocks using a function func(row, col) -> value. + Creates STF logical data from host arrays and lets STF handle transfers. + """ + print(f"[{self.symbol}] Filling matrix on host...") + for colb in range(self.nt): + low_rowb = colb if self.sym_matrix else 0 + for rowb in range(low_rowb, self.mt): + # Fill host block + h_block = self._get_block_h(rowb, colb) + for lrow in range(self.mb): + for lcol in range(self.nb): + row = lrow + rowb * self.mb + col = lcol + colb * self.nb + h_block[lrow, lcol] = func(row, col) + + handle = self.ctx.logical_data( + h_block, name=f"{self.symbol}_{rowb}_{colb}" + ) + self.handles[(rowb, colb)] = handle + + +# ============================================================================ +# Block-level operations (BLAS/LAPACK) +# ============================================================================ + + +def DPOTRF(ctx, a): + """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + d_block[:] = cp.linalg.cholesky(d_block) + + +def DTRSM(ctx, a, b, side="L", uplo="L", transa="N", diag="N", alpha=1.0): + """Triangular solve: B = alpha * op(A)^(-1) * B""" + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = uplo == "L" + trans = transa != "N" + result = cp_linalg.solve_triangular(d_a, d_b, lower=lower, trans=trans) + if alpha != 1.0: + d_b[:] = alpha * result + else: + d_b[:] = result + + +def DTRTRI(ctx, a, uplo="L", diag="N"): + """Triangular matrix inversion: A = A^(-1)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = uplo == "L" + unit_diagonal = diag == "U" + # CuPy doesn't have trtri directly, use solve with identity + n = d_block.shape[0] + identity = cp.eye(n, dtype=d_block.dtype) + d_block[:] = cp_linalg.solve_triangular( + d_block, identity, lower=lower, unit_diagonal=unit_diagonal + ) + + +def DGEMM(ctx, a, b, c, transa="N", transb="N", alpha=1.0, beta=1.0): + """General matrix multiplication: C = alpha * op(A) * op(B) + beta * C""" + with ctx.task( + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), + ) as t: + d_a, d_b, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if transa != "N" else d_a + op_b = d_b.T if transb != "N" else d_b + + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_b) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_b) + else: + d_c[:] = alpha * (op_a @ op_b) + beta * d_c + + +def DSYRK(ctx, a, c, uplo="L", trans="N", alpha=1.0, beta=1.0): + """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C""" + with ctx.task( + stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw() + ) as t: + d_a, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + op_a = d_a.T if trans != "N" else d_a + + if beta == 0.0: + d_c[:] = alpha * (op_a @ op_a.T) + elif beta == 1.0: + d_c[:] += alpha * (op_a @ op_a.T) + else: + d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c + + +def DTRMM(ctx, a, b, side="L", uplo="L", transa="N", diag="N", alpha=1.0): + """Triangular matrix multiplication: B = alpha * op(A) * B (side='L') or B = alpha * B * op(A) (side='R')""" + with ctx.task( + stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw() + ) as t: + d_a, d_b = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + lower = uplo == "L" + trans = transa != "N" + + # Extract triangle from A + if lower: + tri_a = cp.tril(d_a) + else: + tri_a = cp.triu(d_a) + + if trans: + tri_a = tri_a.T + + if side == "L": + d_b[:] = alpha * (tri_a @ d_b) + else: # side == 'R' + d_b[:] = alpha * (d_b @ tri_a) + + +def DSYMM(ctx, a, b, c, side="L", uplo="L", alpha=1.0, beta=1.0): + """Symmetric matrix multiplication: C = alpha * A * B + beta * C (side='L') or C = alpha * B * A + beta * C (side='R') + where A is symmetric.""" + with ctx.task( + stf.exec_place.device(c.devid()), + a.handle().read(), + b.handle().read(), + c.handle().rw(), + ) as t: + d_a, d_b, d_c = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + # Reconstruct full symmetric matrix from lower/upper triangle + if uplo == "L": + # Lower triangle is stored + sym_a = cp.tril(d_a) + cp.tril(d_a, -1).T + else: + # Upper triangle is stored + sym_a = cp.triu(d_a) + cp.triu(d_a, 1).T + + if side == "L": + result = alpha * (sym_a @ d_b) + else: # side == 'R' + result = alpha * (d_b @ sym_a) + + if beta == 0.0: + d_c[:] = result + elif beta == 1.0: + d_c[:] += result + else: + d_c[:] = result + beta * d_c + + +# ============================================================================ +# Tiled operations +# ============================================================================ + + +def PDPOTRF(ctx, A, uplo="L"): + """Parallel tiled Cholesky factorization""" + print("\n[PDPOTRF] Starting Cholesky factorization...") + assert uplo == "L", "Only lower triangular factorization supported" + + for k in range(A.nt): + # Factorize diagonal block + DPOTRF(ctx, A.block(k, k)) + + # Update column below diagonal + for m in range(k + 1, A.mt): + DTRSM( + ctx, + A.block(k, k), + A.block(m, k), + side="R", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + + # Update trailing submatrix + for n in range(k + 1, A.nt): + DSYRK( + ctx, + A.block(n, k), + A.block(n, n), + uplo="L", + trans="N", + alpha=-1.0, + beta=1.0, + ) + + for m in range(n + 1, A.mt): + DGEMM( + ctx, + A.block(m, k), + A.block(n, k), + A.block(m, n), + transa="N", + transb="T", + alpha=-1.0, + beta=1.0, + ) + + print("[PDPOTRF] Completed") + + +def PDTRTRI(ctx, A, uplo="L", diag="N"): + """Parallel tiled triangular matrix inversion""" + print("\n[PDTRTRI] Starting triangular inversion...") + assert uplo == "L", "Only lower triangular inversion supported" + + for k in range(A.nt): + # Step 1: Update A[m,k] for m > k + for m in range(k + 1, A.mt): + DTRSM( + ctx, + A.block(k, k), + A.block(m, k), + side="R", + uplo="L", + transa="N", + diag=diag, + alpha=-1.0, + ) + + # Step 2: Update A[m,n] for m > k, n < k + for m in range(k + 1, A.mt): + for n in range(k): + DGEMM( + ctx, + A.block(m, k), + A.block(k, n), + A.block(m, n), + transa="N", + transb="N", + alpha=1.0, + beta=1.0, + ) + + # Step 3: Update A[k,n] for n < k + for n in range(k): + DTRSM( + ctx, + A.block(k, k), + A.block(k, n), + side="L", + uplo="L", + transa="N", + diag=diag, + alpha=1.0, + ) + + # Step 4: Invert diagonal block A[k,k] + DTRTRI(ctx, A.block(k, k), uplo=uplo, diag=diag) + + print("[PDTRTRI] Completed") + + +def DLAAUM(ctx, a, uplo="L"): + """Compute A^T * A for a triangular block (lauum operation)""" + with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t: + d_block = get_cupy_arrays(t) + with cp.cuda.ExternalStream(t.stream_ptr()): + # lauum: compute L * L^T for lower triangular L + if uplo == "L": + L = cp.tril(d_block) + d_block[:] = L @ L.T + else: + U = cp.triu(d_block) + d_block[:] = U.T @ U + + +def PDLAUUM(ctx, A, uplo="L"): + """Parallel tiled computation of A^T * A for lower triangular A""" + print("\n[PDLAUUM] Starting LAUUM (A^T * A)...") + assert uplo == "L", "Only lower triangular LAUUM supported" + + for k in range(A.mt): + # Step 1: Update off-diagonal blocks + for n in range(k): + # Update A[n,n] with A[k,n]^T * A[k,n] + DSYRK( + ctx, + A.block(k, n), + A.block(n, n), + uplo="L", + trans="T", + alpha=1.0, + beta=1.0, + ) + + # Update A[m,n] with A[k,m]^T * A[k,n] + for m in range(n + 1, k): + DGEMM( + ctx, + A.block(k, m), + A.block(k, n), + A.block(m, n), + transa="T", + transb="N", + alpha=1.0, + beta=1.0, + ) + + # Step 2: Update A[k,n] = A[k,k]^T * A[k,n] + for n in range(k): + DTRMM( + ctx, + A.block(k, k), + A.block(k, n), + side="L", + uplo="L", + transa="T", + diag="N", + alpha=1.0, + ) + + # Step 3: Update diagonal block A[k,k] = A[k,k]^T * A[k,k] + DLAAUM(ctx, A.block(k, k), uplo=uplo) + + print("[PDLAUUM] Completed") + + +def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0): + """Parallel tiled matrix multiplication""" + print("\n[PDGEMM] Starting matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + inner_k = A.nt if transa == "N" else A.mt + + if alpha == 0.0 or inner_k == 0: + # Just scale C + DGEMM( + ctx, + A.block(0, 0), + B.block(0, 0), + C.block(m, n), + transa=transa, + transb=transb, + alpha=0.0, + beta=beta, + ) + elif transa == "N": + if transb == "N": + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM( + ctx, + A.block(m, k), + B.block(k, n), + C.block(m, n), + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) + else: + for k in range(A.nt): + zbeta = beta if k == 0 else 1.0 + DGEMM( + ctx, + A.block(m, k), + B.block(n, k), + C.block(m, n), + transa="N", + transb="T", + alpha=alpha, + beta=zbeta, + ) + else: # transa in ['T', 'C'] + if transb == "N": + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM( + ctx, + A.block(k, m), + B.block(k, n), + C.block(m, n), + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) + else: + for k in range(A.mt): + zbeta = beta if k == 0 else 1.0 + DGEMM( + ctx, + A.block(k, m), + B.block(n, k), + C.block(m, n), + transa="T", + transb="T", + alpha=alpha, + beta=zbeta, + ) + + print("[PDGEMM] Completed") + + +def PDSYMM(ctx, A, B, C, side="L", uplo="L", alpha=1.0, beta=1.0): + """Parallel tiled symmetric matrix multiplication""" + print("\n[PDSYMM] Starting symmetric matrix multiplication...") + + for m in range(C.mt): + for n in range(C.nt): + if side == "L": + if uplo == "L": + for k in range(C.mt): + zbeta = beta if k == 0 else 1.0 + if k < m: + DGEMM( + ctx, + A.block(m, k), + B.block(k, n), + C.block(m, n), + transa="N", + transb="N", + alpha=alpha, + beta=zbeta, + ) + else: + if k == m: + DSYMM( + ctx, + A.block(k, k), + B.block(k, n), + C.block(m, n), + side=side, + uplo=uplo, + alpha=alpha, + beta=zbeta, + ) + else: + DGEMM( + ctx, + A.block(k, m), + B.block(k, n), + C.block(m, n), + transa="T", + transb="N", + alpha=alpha, + beta=zbeta, + ) + else: # side == 'R' + # Similar logic for right multiplication + pass + + print("[PDSYMM] Completed") + + +def compute_norm(ctx, matrix): + """Compute Frobenius norm of matrix using host tasks""" + norm_sq = 0.0 + + for colb in range(matrix.nt): + low_rowb = colb if matrix.sym_matrix else 0 + for rowb in range(low_rowb, matrix.mt): + handle = matrix.handle(rowb, colb) + + # Host task to read the block and compute norm + def compute_block_norm(h_block): + nonlocal norm_sq + norm_sq += np.sum(h_block * h_block) + + with ctx.task(stf.exec_place.host(), handle.read()) as t: + # Synchronize the stream before reading data + cp.cuda.runtime.streamSynchronize(t.stream_ptr()) + + h_block = cai_to_numpy(t.get_arg_cai(0)) + compute_block_norm(h_block) + + return np.sqrt(norm_sq) + + +def main(): + import argparse + + parser = argparse.ArgumentParser( + description="Tiled POTRI (matrix inversion via Cholesky) with CUDA STF" + ) + parser.add_argument( + "N", type=int, nargs="?", default=512, help="Matrix size (default: 512)" + ) + parser.add_argument( + "NB", type=int, nargs="?", default=128, help="Block size (default: 128)" + ) + parser.add_argument("--check", action="store_true", help="Check result (slower)") + args = parser.parse_args() + + N = args.N + NB = args.NB + check_result = args.check + + assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}" + + print("=" * 60) + print("Tiled POTRI (Matrix Inversion) with CUDA STF + CuPy") + print("=" * 60) + print(f"Matrix size: {N}x{N}") + print(f"Block size: {NB}x{NB}") + print(f"Number of blocks: {N // NB}x{N // NB}") + print(f"Check result: {check_result}") + print("=" * 60) + + # Create STF context + ctx = stf.context() + + # Create matrices + A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A") + + if check_result: + Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref") + + print("\n" + "=" * 60) + print("Initializing matrices...") + print("=" * 60) + + # Hilbert matrix + diagonal dominance for numerical stability + def hilbert(row, col): + return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row) + + A.fill(hilbert) + if check_result: + Aref.fill(hilbert) + + # Measure performance + import time + + start_time = time.time() + + print("\n" + "=" * 60) + print("Performing POTRI (inversion via Cholesky)...") + print("=" * 60) + + # Step 1: Cholesky factorization A = L*L^T + PDPOTRF(ctx, A, uplo="L") + + # Step 2: Triangular inversion L^(-1) + PDTRTRI(ctx, A, uplo="L", diag="N") + + # Step 3: Compute A^(-1) = L^(-T) * L^(-1) + PDLAUUM(ctx, A, uplo="L") + + if check_result: + print("\n" + "=" * 60) + print("Verifying result...") + print("=" * 60) + + # Create test vector B + B_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_potri") + Bref_potri = TiledMatrix( + ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref_potri" + ) + + def rhs_vals(row, col): + return 1.0 * (row + 1) + + B_potri.fill(rhs_vals) + Bref_potri.fill(rhs_vals) + + # Compute norm of B + b_norm = compute_norm(ctx, Bref_potri) + + # Create temporary matrix for result + B_tmp = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_tmp") + + def zero_vals(row, col): + return 0.0 + + B_tmp.fill(zero_vals) + + # Compute B_tmp = A^(-1) * B + PDSYMM(ctx, A, B_potri, B_tmp, side="L", uplo="L", alpha=1.0, beta=0.0) + + # Compute residual: Bref = Aref * B_tmp - Bref + PDGEMM( + ctx, Aref, B_tmp, Bref_potri, transa="N", transb="N", alpha=1.0, beta=-1.0 + ) + + # Compute residual norm + res_norm = compute_norm(ctx, Bref_potri) + + print("\n" + "=" * 60) + print("Finalizing STF context...") + print("=" * 60) + ctx.finalize() + + end_time = time.time() + elapsed_ms = (end_time - start_time) * 1000.0 + + # Compute FLOPS for POTRI + # POTRF: (1/3) * N^3 + # TRTRI: (1/3) * N^3 + # LAUUM: (1/3) * N^3 + # Total: N^3 + flops = float(N) ** 3 + gflops = flops / (elapsed_ms / 1000.0) / 1e9 + + print("\n" + "=" * 60) + print("Results") + print("=" * 60) + print(f"[POTRI] Elapsed time: {elapsed_ms:.2f} ms") + print(f"[POTRI] Performance: {gflops:.2f} GFLOPS") + + if check_result: + residual = res_norm / b_norm + print(f"\n[POTRI] ||A * (A^(-1) * B) - B||: {res_norm:.6e}") + print(f"[POTRI] ||B||: {b_norm:.6e}") + print(f"[POTRI] Residual (||A * (A^(-1) * B) - B||/||B||): {residual:.6e}") + + if residual < 0.01: + print("\n✅ Algorithm converged successfully!") + return 0 + else: + print(f"\n❌ Algorithm did not converge (residual {residual:.6e} >= 0.01)") + return 1 + + print("=" * 60) + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/python/cuda_cccl/tests/stf/numba_decorator.py b/python/cuda_cccl/tests/stf/numba_decorator.py new file mode 100644 index 00000000000..42c14899789 --- /dev/null +++ b/python/cuda_cccl/tests/stf/numba_decorator.py @@ -0,0 +1,107 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Numba-based @jit decorator for STF. Not shipped in the wheel. +Requires numba-cuda. Import from tests.stf when running from source. +""" + +from numba import cuda + +from cuda.stf import context, dep, exec_place + + +class stf_kernel_decorator: + def __init__(self, pyfunc, jit_args, jit_kwargs): + self._pyfunc = pyfunc + self._jit_args = jit_args + self._jit_kwargs = jit_kwargs + self._compiled_kernel = None + self._launch_cfg = None + + def __getitem__(self, cfg): + if not (isinstance(cfg, tuple) or isinstance(cfg, list)): + raise TypeError("use kernel[grid, block ([, exec_place, ctx])]") + n = len(cfg) + if n not in (2, 3, 4): + raise TypeError( + "use kernel[grid, block], kernel[grid, block, exec_place], or kernel[grid, block, exec_place, ctx]" + ) + + grid_dim = cfg[0] + block_dim = cfg[1] + ctx = None + exec_pl = None + + if n >= 3: + exec_pl = cfg[2] + + if n == 4: + ctx = cfg[3] + + if exec_pl is not None and not isinstance(exec_pl, exec_place): + raise TypeError("3rd item must be an exec_place") + + if ctx is not None and not isinstance(ctx, context): + raise TypeError("4th item must be an STF context (or None to infer)") + + self._launch_cfg = (grid_dim, block_dim, ctx, exec_pl) + return self + + def __call__(self, *args, **kwargs): + if self._launch_cfg is None: + raise RuntimeError( + "launch configuration missing – use kernel[grid, block, ctx](...)" + ) + + gridDim, blockDim, ctx, exec_pl = self._launch_cfg + + dep_items = [] + for i, a in enumerate(args): + if isinstance(a, dep): + if ctx is None: + ld = a.get_ld() + ctx = ld.borrow_ctx_handle() + dep_items.append((i, a)) + + task_args = [exec_pl] if exec_pl else [] + task_args.extend(a for _, a in dep_items) + + with ctx.task(*task_args) as t: + dev_args = list(args) + for dep_index, (pos, _) in enumerate(dep_items): + cai = t.get_arg_cai(dep_index) + dev_args[pos] = cuda.from_cuda_array_interface( + cai.__cuda_array_interface__, owner=None, sync=False + ) + + if self._compiled_kernel is None: + self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)( + self._pyfunc + ) + + nb_stream = cuda.external_stream(t.stream_ptr()) + self._compiled_kernel[gridDim, blockDim, nb_stream](*dev_args, **kwargs) + + return None + + +def jit(*jit_args, **jit_kwargs): + if jit_args and callable(jit_args[0]): + pyfunc = jit_args[0] + return _build_kernel(pyfunc, (), jit_kwargs) + + def _decorator(fn): + return _build_kernel(fn, jit_args, jit_kwargs) + + return _decorator + + +def _build_kernel(pyfunc, jit_args, jit_kwargs=None): + if jit_kwargs is None: + jit_kwargs = {} + return stf_kernel_decorator(pyfunc, jit_args, jit_kwargs) + + +__all__ = ["jit", "stf_kernel_decorator"] diff --git a/python/cuda_cccl/tests/stf/numba_helpers.py b/python/cuda_cccl/tests/stf/numba_helpers.py new file mode 100644 index 00000000000..9156f73c095 --- /dev/null +++ b/python/cuda_cccl/tests/stf/numba_helpers.py @@ -0,0 +1,40 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +Numba helpers for cuda.stf tests. Not shipped in the wheel. +Convert task.get_arg_cai() / task.args_cai() (stf_cai) to Numba device arrays. +Requires numba-cuda. Import from tests.stf when running from source. +""" + +from __future__ import annotations + + +def get_arg_numba(task, index): + """Return one task argument as a Numba device array. task.get_arg_cai(index) returns stf_cai.""" + from numba import cuda + + return cuda.from_cuda_array_interface( + task.get_arg_cai(index), owner=None, sync=False + ) + + +def numba_arguments(task): + """ + Return all task buffer arguments as Numba device arrays. Same shape as task.args_cai(): + None, a single array, or a tuple of arrays. + """ + from numba import cuda + + out = task.args_cai() + if out is None: + return None + if isinstance(out, tuple): + return tuple( + cuda.from_cuda_array_interface(o, owner=None, sync=False) for o in out + ) + return cuda.from_cuda_array_interface(out, owner=None, sync=False) + + +__all__ = ["get_arg_numba", "numba_arguments"] diff --git a/python/cuda_cccl/tests/stf/pytorch_task.py b/python/cuda_cccl/tests/stf/pytorch_task.py new file mode 100644 index 00000000000..27651bc12b1 --- /dev/null +++ b/python/cuda_cccl/tests/stf/pytorch_task.py @@ -0,0 +1,88 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +""" +PyTorch-integrated task context manager for STF tests/examples. +Not shipped in the wheel. Uses task.args_cai() (CAI from cuda.stf), converts to +torch.Tensor here so cuda.stf has no PyTorch dependency. Requires PyTorch. +""" + +from __future__ import annotations + + +def tensor_arg(task, index): + """Return one task argument as a torch.Tensor. task.get_arg_cai() returns an stf_cai (has __cuda_array_interface__).""" + import torch + + return torch.as_tensor(task.get_arg_cai(index)) + + +def tensor_arguments(task): + """ + Return all task buffer arguments as torch.Tensors. Same shape as task.args_cai(): + None, a single tensor, or a tuple of tensors. task.args_cai() returns stf_cai object(s). + """ + import torch + + out = task.args_cai() + if out is None: + return None + if isinstance(out, tuple): + return tuple(torch.as_tensor(o) for o in out) + return torch.as_tensor(out) + + +def pytorch_task(ctx, *args): + """ + Context manager: ctx.task(*args) with PyTorch stream and tensor conversion. + Yields tensor(s) from task.args_cai() converted to torch, as a tuple. + + Example + ------- + >>> from tests.stf.pytorch_task import pytorch_task + >>> with pytorch_task(ctx, lX.read(), lY.rw()) as (x_tensor, y_tensor): + ... y_tensor[:] = x_tensor * 2 + """ + try: + import torch.cuda as tc + except ImportError: + raise RuntimeError( + "pytorch_task requires PyTorch to be installed. " + "Install PyTorch or use ctx.task() for a raw task." + ) from None + + t = ctx.task(*args) + + class _PyTorchTaskContext: + _stream_ctx = None + + def __enter__(self): + t.start() + self._stream_ctx = None + try: + stream = tc.ExternalStream(t.stream_ptr()) + self._stream_ctx = tc.stream(stream) + self._stream_ctx.__enter__() + except Exception: + t.end() + raise + tensors = tensor_arguments(t) + if tensors is None: + return None + if isinstance(tensors, tuple): + return tensors + return (tensors,) + + def __exit__(self, exc_type, exc_val, exc_tb): + try: + if self._stream_ctx is not None: + self._stream_ctx.__exit__(exc_type, exc_val, exc_tb) + finally: + t.end() + return False + + return _PyTorchTaskContext() + + +__all__ = ["pytorch_task", "tensor_arg", "tensor_arguments"] diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py new file mode 100644 index 00000000000..f815d267c5a --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_context.py @@ -0,0 +1,75 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import numpy as np + +import cuda.stf as stf + + +def test_ctx(): + ctx = stf.context() + del ctx + + +def test_graph_ctx(): + ctx = stf.context(use_graph=True) + ctx.finalize() + + +def test_ctx2(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + t = ctx.task(lX.rw()) + t.start() + t.end() + + t2 = ctx.task(lX.read(), lY.rw()) + t2.start() + t2.end() + + t3 = ctx.task(lX.read(), lZ.rw()) + t3.start() + t3.end() + + t4 = ctx.task(lY.read(), lZ.rw()) + t4.start() + t4.end() + + del ctx + + +def test_ctx3(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(lX.rw()): + pass + + with ctx.task(lX.read(), lY.rw()): + pass + + with ctx.task(lX.read(), lZ.rw()): + pass + + with ctx.task(lY.read(), lZ.rw()): + pass + + del ctx + + +if __name__ == "__main__": + test_ctx3() diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py new file mode 100644 index 00000000000..076cf26700d --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_decorator.py @@ -0,0 +1,50 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import numba +import numpy as np +import pytest +from numba import cuda +from numba_decorator import jit + +import cuda.stf as stf + +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +@jit +def axpy(a, x, y): + i = cuda.grid(1) + if i < x.size: + y[i] = a * x[i] + y[i] + + +@jit +def scale(a, x): + i = cuda.grid(1) + if i < x.size: + x[i] = a * x[i] + + +@pytest.mark.parametrize("use_graph", [True, False]) +def test_decorator(use_graph): + X, Y, Z = (np.ones(16, np.float32) for _ in range(3)) + + ctx = stf.context(use_graph=use_graph) + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + scale[32, 64](2.0, lX.rw()) + axpy[32, 64](2.0, lX.read(), lY.rw()) + axpy[32, 64, stf.exec_place.device(0)]( + 2.0, lX.read(), lZ.rw() + ) # explicit exec place + axpy[32, 64]( + 2.0, lY.read(), lZ.rw(stf.data_place.device(0)) + ) # per-dep placement override + + +if __name__ == "__main__": + test_decorator(False) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py new file mode 100644 index 00000000000..fc4f3f590e2 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py @@ -0,0 +1,232 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import math +from typing import Literal, Optional, Tuple + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") +import torch.cuda as tc # noqa: E402 +from pytorch_task import tensor_arguments # noqa: E402 + +import cuda.stf as stf # noqa: E402 + +try: + import matplotlib.pyplot as plt + + has_matplotlib = True +except ImportError: + has_matplotlib = False + +Plane = Literal["xy", "xz", "yz"] + + +def show_slice(t3d, plane="xy", index=None): + """Display a 2D slice of a 3D tensor (requires matplotlib).""" + if not has_matplotlib: + return + + # grab a 2D view + if plane == "xy": + idx = t3d.shape[2] // 2 if index is None else index + slice2d = t3d[:, :, idx] + elif plane == "xz": + idx = t3d.shape[1] // 2 if index is None else index + slice2d = t3d[:, idx, :] + elif plane == "yz": + idx = t3d.shape[0] // 2 if index is None else index + slice2d = t3d[idx, :, :] + else: + raise ValueError("plane must be 'xy', 'xz' or 'yz'") + + # move to cpu numpy array + arr = slice2d.detach().cpu().numpy() + + # imshow = "imshow" not "imread" + plt.imshow( + arr, + origin="lower", + cmap="seismic", + vmin=-1e-2, + vmax=1e-2, + # norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) + # norm=LogNorm(vmin=1e-12, vmax=1e-6) + ) + # plt.colorbar() + plt.show(block=False) + plt.pause(0.01) + + +def test_fdtd_3d_pytorch( + size_x: int = 150, + size_y: int = 150, + size_z: int = 150, + timesteps: int = 10, + output_freq: int = 0, + dx: float = 0.01, + dy: float = 0.01, + dz: float = 0.01, + epsilon0: float = 8.85e-12, + mu0: float = 1.256e-6, + device: Optional[torch.device] = None, + dtype: torch.dtype = torch.float64, +) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor +]: + ctx = stf.context() + + # allocate and initialize fields + shape = (size_x, size_y, size_z) + + # Electric field components (initialized to zero) + lex = ctx.logical_data_zeros(shape, dtype=np.float64) + ley = ctx.logical_data_zeros(shape, dtype=np.float64) + lez = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Magnetic field components (initialized to zero) + lhx = ctx.logical_data_zeros(shape, dtype=np.float64) + lhy = ctx.logical_data_zeros(shape, dtype=np.float64) + lhz = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Material properties + lepsilon = ctx.logical_data_full(shape, float(epsilon0), dtype=np.float64) + lmu = ctx.logical_data_full(shape, float(mu0), dtype=np.float64) + + # CFL (same formula as example) + dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) + + # Es (interior) = [1..N-2] along all dims -> enables i-1, j-1, k-1 + i_es, j_es, k_es = slice(1, -1), slice(1, -1), slice(1, -1) + i_es_m, j_es_m, k_es_m = slice(0, -2), slice(0, -2), slice(0, -2) + + # Hs (base) = [0..N-2] along all dims -> enables i+1, j+1, k+1 + i_hs, j_hs, k_hs = slice(0, -1), slice(0, -1), slice(0, -1) + i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None) + + # source location (single cell at center) + cx, cy, cz = size_x // 2, size_y // 10, size_z // 2 + + def source(t: float, x: float, y: float, z: float) -> float: + # sin(k*x - omega*t) with f = 1e9 Hz + pi = math.pi + freq = 1.0e9 + omega = 2.0 * pi * freq + wavelength = 3.0e8 / freq + k = 2.0 * pi / wavelength + return math.sin(k * x - omega * t) + + for n in range(int(timesteps)): + # ------------------------- + # update electric fields (Es) + # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] + with ( + ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t, + tc.stream(tc.ExternalStream(t.stream_ptr())), + ): + ex, hy, hz, epsilon = tensor_arguments(t) + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dx) + ) * ( + (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) + - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) + ) + + # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] + with ( + ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t, + tc.stream(tc.ExternalStream(t.stream_ptr())), + ): + ey, hx, hz, epsilon = tensor_arguments(t) + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dy) + ) * ( + (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) + - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) + ) + + # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] + with ( + ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t, + tc.stream(tc.ExternalStream(t.stream_ptr())), + ): + ez, hx, hy, epsilon = tensor_arguments(t) + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dz) + ) * ( + (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) + - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) + ) + + # source at center cell + with ( + ctx.task(lez.rw()) as t, + tc.stream(tc.ExternalStream(t.stream_ptr())), + ): + ez = tensor_arguments(t) + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + + # ------------------------- + # update magnetic fields (Hs) + # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] + with ( + ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t, + tc.stream(tc.ExternalStream(t.stream_ptr())), + ): + hx, ey, ez, mu = tensor_arguments(t) + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dy) + ) * ( + (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) + - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) + ) + + # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] + with ( + ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t, + tc.stream(tc.ExternalStream(t.stream_ptr())), + ): + hy, ex, ez, mu = tensor_arguments(t) + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dz) + ) * ( + (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) + - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) + ) + + # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] + with ( + ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t, + tc.stream(tc.ExternalStream(t.stream_ptr())), + ): + hz, ex, ey, mu = tensor_arguments(t) + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dx) + ) * ( + (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) + - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) + ) + + if output_freq > 0 and (n % output_freq) == 0: + with ( + ctx.task(lez.read()) as t, + tc.stream(tc.ExternalStream(t.stream_ptr())), + ): + ez = tensor_arguments(t) + print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + if has_matplotlib: + show_slice(ez, plane="xy") + pass + + ctx.finalize() + + +if __name__ == "__main__": + # Run FDTD simulation + output_freq = 5 if has_matplotlib else 0 + if not has_matplotlib and output_freq > 0: + print("Warning: matplotlib not available, running without visualization") + output_freq = 0 + test_fdtd_3d_pytorch(timesteps=1000, output_freq=output_freq) diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py new file mode 100644 index 00000000000..85a9f7f9c51 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py @@ -0,0 +1,233 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import math +from typing import Literal, Optional, Tuple + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") + +from pytorch_task import pytorch_task # noqa: E402 + +import cuda.stf as stf # noqa: E402 + +try: + import matplotlib.pyplot as plt + + has_matplotlib = True +except ImportError: + has_matplotlib = False + +Plane = Literal["xy", "xz", "yz"] + + +def show_slice(t3d, plane="xy", index=None): + """Display a 2D slice of a 3D tensor (requires matplotlib).""" + if not has_matplotlib: + return + + # grab a 2D view + if plane == "xy": + idx = t3d.shape[2] // 2 if index is None else index + slice2d = t3d[:, :, idx] + elif plane == "xz": + idx = t3d.shape[1] // 2 if index is None else index + slice2d = t3d[:, idx, :] + elif plane == "yz": + idx = t3d.shape[0] // 2 if index is None else index + slice2d = t3d[idx, :, :] + else: + raise ValueError("plane must be 'xy', 'xz' or 'yz'") + + # move to cpu numpy array + arr = slice2d.detach().cpu().numpy() + + # imshow = "imshow" not "imread" + plt.imshow( + arr, + origin="lower", + cmap="seismic", + vmin=-1e-2, + vmax=1e-2, + # norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0) + # norm=LogNorm(vmin=1e-12, vmax=1e-6) + ) + # plt.colorbar() + plt.show(block=False) + plt.pause(0.01) + + +def test_fdtd_3d_pytorch_simplified( + size_x: int = 150, + size_y: int = 150, + size_z: int = 150, + timesteps: int = 10, + output_freq: int = 0, + dx: float = 0.01, + dy: float = 0.01, + dz: float = 0.01, + epsilon0: float = 8.85e-12, + mu0: float = 1.256e-6, + device: Optional[torch.device] = None, + dtype: torch.dtype = torch.float64, +) -> Tuple[ + torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor +]: + """ + FDTD 3D implementation using pytorch_task for simplified syntax. + Demonstrates automatic stream and tensor management. + """ + ctx = stf.context() + + # allocate and initialize fields + shape = (size_x, size_y, size_z) + + # Electric field components (initialized to zero) + lex = ctx.logical_data_zeros(shape, dtype=np.float64) + ley = ctx.logical_data_zeros(shape, dtype=np.float64) + lez = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Magnetic field components (initialized to zero) + lhx = ctx.logical_data_zeros(shape, dtype=np.float64) + lhy = ctx.logical_data_zeros(shape, dtype=np.float64) + lhz = ctx.logical_data_zeros(shape, dtype=np.float64) + + # Material properties + lepsilon = ctx.logical_data_full(shape, float(epsilon0), dtype=np.float64) + lmu = ctx.logical_data_full(shape, float(mu0), dtype=np.float64) + + # CFL (same formula as example) + dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0) + + # Es (interior) = [1..N-2] along all dims -> enables i-1, j-1, k-1 + i_es, j_es, k_es = slice(1, -1), slice(1, -1), slice(1, -1) + i_es_m, j_es_m, k_es_m = slice(0, -2), slice(0, -2), slice(0, -2) + + # Hs (base) = [0..N-2] along all dims -> enables i+1, j+1, k+1 + i_hs, j_hs, k_hs = slice(0, -1), slice(0, -1), slice(0, -1) + i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None) + + # source location (single cell at center) + cx, cy, cz = size_x // 2, size_y // 10, size_z // 2 + + def source(t: float, x: float, y: float, z: float) -> float: + # sin(k*x - omega*t) with f = 1e9 Hz + pi = math.pi + freq = 1.0e9 + omega = 2.0 * pi * freq + wavelength = 3.0e8 / freq + k = 2.0 * pi / wavelength + return math.sin(k * x - omega * t) + + for n in range(int(timesteps)): + # ------------------------- + # update electric fields (Es) + # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))] + with pytorch_task(ctx, lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as ( + ex, + hy, + hz, + epsilon, + ): + ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dx) + ) * ( + (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es]) + - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m]) + ) + + # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))] + with pytorch_task(ctx, ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as ( + ey, + hx, + hz, + epsilon, + ): + ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dy) + ) * ( + (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m]) + - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es]) + ) + + # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))] + with pytorch_task(ctx, lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as ( + ez, + hx, + hy, + epsilon, + ): + ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + ( + dt / (epsilon[i_es, j_es, k_es] * dz) + ) * ( + (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es]) + - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es]) + ) + + # source at center cell + with pytorch_task(ctx, lez.rw()) as (ez,): + ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz) + + # ------------------------- + # update magnetic fields (Hs) + # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))] + with pytorch_task(ctx, lhx.rw(), ley.read(), lez.read(), lmu.read()) as ( + hx, + ey, + ez, + mu, + ): + hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dy) + ) * ( + (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs]) + - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs]) + ) + + # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))] + with pytorch_task(ctx, lhy.rw(), lex.read(), lez.read(), lmu.read()) as ( + hy, + ex, + ez, + mu, + ): + hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dz) + ) * ( + (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs]) + - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs]) + ) + + # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))] + with pytorch_task(ctx, lhz.rw(), lex.read(), ley.read(), lmu.read()) as ( + hz, + ex, + ey, + mu, + ): + hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - ( + dt / (mu[i_hs, j_hs, k_hs] * dx) + ) * ( + (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs]) + - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs]) + ) + + if output_freq > 0 and (n % output_freq) == 0: + with pytorch_task(ctx, lez.read()) as (ez,): + print(f"{n}\t{ez[cx, cy, cz].item():.6e}") + if has_matplotlib: + show_slice(ez, plane="xy") + + ctx.finalize() + + +if __name__ == "__main__": + # Run simplified FDTD simulation using pytorch_task + output_freq = 5 if has_matplotlib else 0 + if not has_matplotlib and output_freq > 0: + print("Warning: matplotlib not available, running without visualization") + output_freq = 0 + test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=output_freq) diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py new file mode 100644 index 00000000000..7d58c02fbeb --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fhe.py @@ -0,0 +1,143 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Toy Fully Homomorphic Encryption (FHE) example with addition encryption + +import numba +from numba import cuda +from numba_helpers import numba_arguments + +import cuda.stf as stf + +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +class Plaintext: + def __init__(self, ctx, values=None, ld=None, key=0x42, name=None): + self.ctx = ctx + self.key = key + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values, name=name) + + def encrypt(self) -> "Ciphertext": + encrypted = bytearray([(c + self.key) & 0xFF for c in self.values]) + return Ciphertext(self.ctx, values=encrypted, key=self.key) + + def print_values(self): + with self.ctx.task( + stf.exec_place.host(), self.l.read(stf.data_place.managed()) + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() + hvalues = numba_arguments(t) + print([v for v in hvalues]) + + +@cuda.jit +def add_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = (a[i] + b[i]) & 0xFF + + +@cuda.jit +def sub_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = (a[i] - b[i]) & 0xFF + + +@cuda.jit +def sub_scalar_kernel(a, out, v): + i = cuda.grid(1) + if i < out.size: + out[i] = (a[i] - v) & 0xFF + + +class Ciphertext: + def __init__(self, ctx, values=None, ld=None, key=0x42, name=None): + self.ctx = ctx + self.key = key + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values, name=name) + + def __add__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + result = self.empty_like() + with self.ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da, db, dresult = numba_arguments(t) + add_kernel[32, 16, nb_stream](da, db, dresult) + return result + + def __sub__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + result = self.empty_like() + with self.ctx.task(self.l.read(), other.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da, db, dresult = numba_arguments(t) + sub_kernel[32, 16, nb_stream](da, db, dresult) + return result + + def decrypt(self, num_operands=2): + """Decrypt by subtracting num_operands * key""" + result = self.empty_like() + total_key = (num_operands * self.key) & 0xFF + with self.ctx.task(self.l.read(), result.l.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + da, dresult = numba_arguments(t) + sub_scalar_kernel[32, 16, nb_stream](da, dresult, total_key) + return Plaintext(self.ctx, ld=result.l, key=self.key) + + def empty_like(self): + return Ciphertext(self.ctx, ld=self.l.empty_like()) + + +def circuit(a, b): + """Circuit: (A + B) + (B - A) = 2*B""" + return (a + b) + (b - a) + + +def test_fhe(): + """Test FHE using manual task creation with addition encryption.""" + ctx = stf.context(use_graph=False) + + vA = [3, 3, 2, 2, 17] + pA = Plaintext(ctx, vA, name="A") + + vB = [1, 7, 7, 7, 49] + pB = Plaintext(ctx, vB, name="B") + + expected = [circuit(a, b) & 0xFF for a, b in zip(vA, vB)] + + eA = pA.encrypt() + eB = pB.encrypt() + encrypted_out = circuit(eA, eB) + decrypted_out = encrypted_out.decrypt(num_operands=2) + + with ctx.task( + stf.exec_place.host(), decrypted_out.l.read(stf.data_place.managed()) + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() + hvalues = numba_arguments(t) + actual = [int(v) for v in hvalues] + + ctx.finalize() + + assert actual == expected, ( + f"Decrypted result {actual} doesn't match expected {expected}" + ) + + +if __name__ == "__main__": + test_fhe() diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py new file mode 100644 index 00000000000..0d1eb64d8bb --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py @@ -0,0 +1,135 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +# Toy Fully Homomorphic Encryption (FHE) example with addition encryption + +import numba +from numba import cuda +from numba_decorator import jit +from numba_helpers import numba_arguments + +import cuda.stf as stf + +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +class Plaintext: + def __init__(self, ctx, values=None, ld=None, key=0x42, name=None): + self.ctx = ctx + self.key = key + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values, name=name) + + def encrypt(self) -> "Ciphertext": + encrypted = bytearray([(c + self.key) & 0xFF for c in self.values]) + return Ciphertext(self.ctx, values=encrypted, key=self.key) + + def print_values(self): + with self.ctx.task( + stf.exec_place.host(), self.l.read(stf.data_place.managed()) + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() + hvalues = numba_arguments(t) + print([v for v in hvalues]) + + +@jit +def add_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = (a[i] + b[i]) & 0xFF + + +@jit +def sub_kernel(a, b, out): + i = cuda.grid(1) + if i < out.size: + out[i] = (a[i] - b[i]) & 0xFF + + +@jit +def sub_scalar_kernel(a, out, v): + i = cuda.grid(1) + if i < out.size: + out[i] = (a[i] - v) & 0xFF + + +class Ciphertext: + def __init__(self, ctx, values=None, ld=None, key=0x42, name=None): + self.ctx = ctx + self.key = key + if ld is not None: + self.l = ld + if values is not None: + self.values = bytearray(values) + self.l = ctx.logical_data(self.values, name=name) + + def __add__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + result = self.empty_like() + add_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) + return result + + def __sub__(self, other): + if not isinstance(other, Ciphertext): + return NotImplemented + result = self.empty_like() + sub_kernel[32, 16](self.l.read(), other.l.read(), result.l.write()) + return result + + def decrypt(self, num_operands=2): + """Decrypt by subtracting num_operands * key""" + result = self.empty_like() + total_key = (num_operands * self.key) & 0xFF + sub_scalar_kernel[32, 16](self.l.read(), result.l.write(), total_key) + return Plaintext(self.ctx, ld=result.l, key=self.key) + + def empty_like(self): + return Ciphertext(self.ctx, ld=self.l.empty_like()) + + +def circuit(a, b): + """Circuit: (A + B) + (B - A) = 2*B""" + return (a + b) + (b - a) + + +def test_fhe_decorator(): + """Test FHE using @jit (numba_decorator) with addition encryption.""" + ctx = stf.context(use_graph=False) + + vA = [3, 3, 2, 2, 17] + pA = Plaintext(ctx, vA, name="A") + + vB = [1, 7, 7, 7, 49] + pB = Plaintext(ctx, vB, name="B") + + expected = [circuit(a, b) & 0xFF for a, b in zip(vA, vB)] + + eA = pA.encrypt() + eB = pB.encrypt() + encrypted_out = circuit(eA, eB) + decrypted_out = encrypted_out.decrypt(num_operands=2) + + with ctx.task( + stf.exec_place.host(), decrypted_out.l.read(stf.data_place.managed()) + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + nb_stream.synchronize() + hvalues = numba_arguments(t) + actual = [int(v) for v in hvalues] + + ctx.finalize() + + assert actual == expected, ( + f"Decrypted result {actual} doesn't match expected {expected}" + ) + + +if __name__ == "__main__": + test_fhe_decorator() diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py new file mode 100644 index 00000000000..daac131c54b --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_numba.py @@ -0,0 +1,254 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +import numba +import numpy as np +import pytest +from numba import cuda +from numba_helpers import get_arg_numba, numba_arguments + +import cuda.stf as stf + +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +@cuda.jit +def axpy(a, x, y): + i = cuda.grid(1) + if i < x.size: + y[i] = a * x[i] + y[i] + + +@cuda.jit +def scale(a, x): + i = cuda.grid(1) + if i < x.size: + x[i] = a * x[i] + + +# One test with a single kernel in a CUDA graph +def test_numba_graph(): + X = np.ones(16, dtype=np.float32) + ctx = stf.context(use_graph=True) + lX = ctx.logical_data(X) + with ctx.task(lX.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = numba_arguments(t) + scale[32, 64, nb_stream](2.0, dX) + + ctx.finalize() + + # Verify results after finalize (data written back to host) + # Expected: scale(2.0, 1.0) = 2.0 + assert np.allclose(X, 2.0) + + +def test_numba(): + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + threads_per_block = 256 + blocks = (n + threads_per_block - 1) // threads_per_block + + with ctx.task(lX.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = numba_arguments(t) + scale[blocks, threads_per_block, nb_stream](2.0, dX) + + with ctx.task(lX.read(), lY.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = get_arg_numba(t, 0) + dY = get_arg_numba(t, 1) + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) + + with ctx.task(lX.read(), lZ.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX, dZ = numba_arguments(t) + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dZ) + + with ctx.task(lY.read(), lZ.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY, dZ = numba_arguments(t) + axpy[blocks, threads_per_block, nb_stream](2.0, dY, dZ) + + ctx.finalize() + + # Verify results after finalize (data written back to host) + # Expected values: + # X: scale(2.0, 1.0) = 2.0 + # Y: axpy(2.0, X=2.0, Y=1.0) = 2.0*2.0 + 1.0 = 5.0 + # Z: axpy(2.0, X=2.0, Z=1.0) = 5.0, then axpy(2.0, Y=5.0, Z=5.0) = 15.0 + assert np.allclose(X, 2.0) + assert np.allclose(Y, 5.0) + assert np.allclose(Z, 15.0) + + +@cuda.jit +def laplacian_5pt_kernel(u_in, u_out, dx, dy): + """ + Compute a 5-point Laplacian on u_in and write the result to u_out. + + Grid-stride 2-D kernel. Assumes C-contiguous (row-major) inputs. + Boundary cells are copied unchanged. + """ + coef_x = 1.0 / (dx * dx) + coef_y = 1.0 / (dy * dy) + + i, j = cuda.grid(2) # i <-> row (x-index), j <-> col (y-index) + nx, ny = u_in.shape + + if i >= nx or j >= ny: + return # out-of-bounds threads do nothing + + if 0 < i < nx - 1 and 0 < j < ny - 1: + u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + ( + u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1] + ) * coef_y + else: + # simple Dirichlet/Neumann placeholder: copy input to output + u_out[i, j] = u_in[i, j] + + +def test_numba2d(): + nx, ny = 1024, 1024 + dx = 2.0 * np.pi / (nx - 1) + dy = 2.0 * np.pi / (ny - 1) + + # a smooth test field: f(x,y) = sin(x) * cos(y) + x = np.linspace(0, 2 * np.pi, nx, dtype=np.float64) + y = np.linspace(0, 2 * np.pi, ny, dtype=np.float64) + + u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) + u_out = np.zeros_like(u) + + ctx = stf.context() + lu = ctx.logical_data(u) + lu_out = ctx.logical_data(u_out) + + with ctx.task(lu.read(), lu_out.write()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + du = get_arg_numba(t, 0) + du_out = get_arg_numba(t, 1) + threads_per_block = (16, 16) # 256 threads per block is a solid starting point + blocks_per_grid = ( + (nx + threads_per_block[0] - 1) // threads_per_block[0], + (ny + threads_per_block[1] - 1) // threads_per_block[1], + ) + laplacian_5pt_kernel[blocks_per_grid, threads_per_block, nb_stream]( + du, du_out, dx, dy + ) + + ctx.finalize() + + u_out_ref = np.zeros_like(u) + + for i in range(1, nx - 1): # skip boundaries + for j in range(1, ny - 1): + u_out_ref[i, j] = (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + ( + u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1] + ) / dy**2 + + # copy boundaries + u_out_ref[0, :] = u[0, :] + u_out_ref[-1, :] = u[-1, :] + u_out_ref[:, 0] = u[:, 0] + u_out_ref[:, -1] = u[:, -1] + + # compare with the GPU result + assert np.allclose(u_out, u_out_ref, rtol=1e-6, atol=1e-6) + + +def test_numba_exec_place(): + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(stf.exec_place.device(0), lX.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = get_arg_numba(t, 0) + scale[32, 64, nb_stream](2.0, dX) + + with ctx.task(stf.exec_place.device(0), lX.read(), lY.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = get_arg_numba(t, 0) + dY = get_arg_numba(t, 1) + axpy[32, 64, nb_stream](2.0, dX, dY) + + with ctx.task( + stf.exec_place.device(0), + lX.read(stf.data_place.managed()), + lZ.rw(stf.data_place.managed()), + ) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = get_arg_numba(t, 0) + dZ = get_arg_numba(t, 1) + axpy[32, 64, nb_stream](2.0, dX, dZ) + + with ctx.task(stf.exec_place.device(0), lY.read(), lZ.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY = get_arg_numba(t, 0) + dZ = get_arg_numba(t, 1) + axpy[32, 64, nb_stream](2.0, dY, dZ) + + +def test_numba_places(): + if len(list(cuda.gpus)) < 2: + pytest.skip("Need at least 2 GPUs") + return + + X = np.ones(16, dtype=np.float32) + Y = np.ones(16, dtype=np.float32) + Z = np.ones(16, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(lX.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = numba_arguments(t) + scale[32, 64, nb_stream](2.0, dX) + + with ctx.task(lX.read(), lY.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = get_arg_numba(t, 0) + dY = get_arg_numba(t, 1) + axpy[32, 64, nb_stream](2.0, dX, dY) + + with ctx.task(stf.exec_place.device(1), lX.read(), lZ.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = get_arg_numba(t, 0) + dZ = get_arg_numba(t, 1) + axpy[32, 64, nb_stream](2.0, dX, dZ) + + with ctx.task(lY.read(), lZ.rw(stf.data_place.device(1))) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dY = get_arg_numba(t, 0) + dZ = get_arg_numba(t, 1) + axpy[32, 64, nb_stream](2.0, dY, dZ) + + ctx.finalize() + # Same expected values as test_numba (X=2, Y=5, Z=15) with multi-GPU placement + assert np.allclose(X, 2.0) + assert np.allclose(Y, 5.0) + assert np.allclose(Z, 15.0) + + +if __name__ == "__main__": + test_numba_graph() + # test_numba() diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py new file mode 100644 index 00000000000..ac03c27e00d --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_pytorch.py @@ -0,0 +1,118 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + + +import numpy as np +import pytest + +torch = pytest.importorskip("torch") + +from pytorch_task import ( # noqa: E402 + pytorch_task, + tensor_arg, + tensor_arguments, +) + +import cuda.stf as stf # noqa: E402 + + +def test_pytorch(): + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + with ctx.task(lX.rw()) as t: + torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) + with torch.cuda.stream(torch_stream): + tX = tensor_arguments(t) + tX[:] = tX * 2 # In-place multiplication + + with ctx.task(lX.read(), lY.write()) as t: + torch_stream = torch.cuda.ExternalStream(t.stream_ptr()) + with torch.cuda.stream(torch_stream): + tX = tensor_arg(t, 0) + tY = tensor_arg(t, 1) + tY[:] = tX * 2 # Copy result into tY tensor + + with ( + ctx.task(lX.read(), lZ.write()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): + tX, tZ = tensor_arguments(t) + tZ[:] = tX * 4 + 1 # Copy result into tZ tensor + + with ( + ctx.task(lY.read(), lZ.rw()) as t, + torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())), + ): + tY, tZ = tensor_arguments(t) + tZ[:] = tY * 2 - 3 # Copy result into tZ tensor + + ctx.finalize() + + # Verify results on host after finalize + # Expected values: + # X: 1.0 -> 2.0 (multiplied by 2) + # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) + # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) + assert np.allclose(X, 2.0) + assert np.allclose(Y, 4.0) + assert np.allclose(Z, 5.0) + + +def test_pytorch_task(): + """Test the pytorch_task functionality with simplified syntax""" + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + Z = np.ones(n, dtype=np.float32) + + ctx = stf.context() + + # Note: We could use ctx.logical_data_full instead of creating NumPy arrays first + # For example: lX = ctx.logical_data_full((n,), 1.0, dtype=np.float32) + # However, this would create logical data without underlying NumPy arrays, + # so we wouldn't be able to check results after ctx.finalize() in this test + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + lZ = ctx.logical_data(Z) + + # Equivalent operations to test_pytorch() but using pytorch_task syntax + + # In-place multiplication using pytorch_task (single tensor) + with pytorch_task(ctx, lX.rw()) as (tX,): + tX[:] = tX * 2 + + # Copy and multiply using pytorch_task (multiple tensors) + with pytorch_task(ctx, lX.read(), lY.write()) as (tX, tY): + tY[:] = tX * 2 + + # Another operation combining tensors + with pytorch_task(ctx, lX.read(), lZ.write()) as (tX, tZ): + tZ[:] = tX * 4 + 1 + + # Final operation with read-write access + with pytorch_task(ctx, lY.read(), lZ.rw()) as (tY, tZ): + tZ[:] = tY * 2 - 3 + + ctx.finalize() + + # Verify results on host after finalize (same as original test) + # Expected values: + # X: 1.0 -> 2.0 (multiplied by 2) + # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0) + # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0) + assert np.allclose(X, 2.0) + assert np.allclose(Y, 4.0) + assert np.allclose(Z, 5.0) + + +if __name__ == "__main__": + test_pytorch() diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py new file mode 100644 index 00000000000..d43c4e99e6e --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py @@ -0,0 +1,88 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import numba +import numpy as np +from numba import cuda +from numba_decorator import jit + +import cuda.stf as stf + +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +@jit +def laplacian_5pt_kernel(u_in, u_out, dx, dy): + """ + Compute a 5-point Laplacian on u_in and write the result to u_out. + + Grid-stride 2-D kernel. Assumes C-contiguous (row-major) inputs. + Boundary cells are copied unchanged. + """ + coef_x = 1.0 / (dx * dx) + coef_y = 1.0 / (dy * dy) + + i, j = cuda.grid(2) # i <-> row (x-index), j <-> col (y-index) + nx, ny = u_in.shape + + if i >= nx or j >= ny: + return # out-of-bounds threads do nothing + + if 0 < i < nx - 1 and 0 < j < ny - 1: + u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + ( + u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1] + ) * coef_y + else: + # simple Dirichlet/Neumann placeholder: copy input to output + u_out[i, j] = u_in[i, j] + + +def test_numba2d(): + nx, ny = 1024, 1024 + dx = 2.0 * np.pi / (nx - 1) + dy = 2.0 * np.pi / (ny - 1) + + # a smooth test field: f(x,y) = sin(x) * cos(y) + x = np.linspace(0, 2 * np.pi, nx, dtype=np.float64) + y = np.linspace(0, 2 * np.pi, ny, dtype=np.float64) + + u = np.sin(x)[:, None] * np.cos(y)[None, :] # shape = (nx, ny) + u_out = np.zeros_like(u) + + ctx = stf.context() + lu = ctx.logical_data(u) + lu_out = ctx.logical_data(u_out) + + threads_per_block = (16, 16) # 256 threads per block is a solid starting point + blocks_per_grid = ( + (nx + threads_per_block[0] - 1) // threads_per_block[0], + (ny + threads_per_block[1] - 1) // threads_per_block[1], + ) + + laplacian_5pt_kernel[blocks_per_grid, threads_per_block]( + lu.read(), lu_out.write(), dx, dy + ) + + ctx.finalize() + + u_out_ref = np.zeros_like(u) + + for i in range(1, nx - 1): # skip boundaries + for j in range(1, ny - 1): + u_out_ref[i, j] = (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + ( + u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1] + ) / dy**2 + + # copy boundaries + u_out_ref[0, :] = u[0, :] + u_out_ref[-1, :] = u[-1, :] + u_out_ref[:, 0] = u[:, 0] + u_out_ref[:, -1] = u[:, -1] + + # compare with the GPU result + assert np.allclose(u_out, u_out_ref, rtol=1e-6, atol=1e-6) + + +if __name__ == "__main__": + test_numba2d() diff --git a/python/cuda_cccl/tests/stf/test_token.py b/python/cuda_cccl/tests/stf/test_token.py new file mode 100644 index 00000000000..e203c0357a7 --- /dev/null +++ b/python/cuda_cccl/tests/stf/test_token.py @@ -0,0 +1,81 @@ +# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED. +# +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import numba +import numpy as np +from numba import cuda +from numba_helpers import get_arg_numba, numba_arguments + +import cuda.stf as stf + +numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0 + + +def test_token(): + ctx = stf.context() + lX = ctx.token() + lY = ctx.token() + lZ = ctx.token() + + with ctx.task(lX.rw()): + pass + + with ctx.task(lX.read(), lY.rw()): + pass + + with ctx.task(lX.read(), lZ.rw()): + pass + + with ctx.task(lY.read(), lZ.rw()): + pass + + ctx.finalize() + + +@cuda.jit +def axpy(a, x, y): + start = cuda.grid(1) + stride = cuda.gridsize(1) + for i in range(start, x.size, stride): + y[i] = a * x[i] + y[i] + + +def test_numba_token(): + n = 1024 * 1024 + X = np.ones(n, dtype=np.float32) + Y = np.ones(n, dtype=np.float32) + + ctx = stf.context() + lX = ctx.logical_data(X) + lY = ctx.logical_data(Y) + token = ctx.token() + + # Use a reasonable grid size - kernel loop will handle all elements + blocks = 32 + threads_per_block = 256 + + with ctx.task(lX.read(), lY.rw(), token.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX = get_arg_numba(t, 0) + dY = get_arg_numba(t, 1) + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) + + with ctx.task(lX.read(), lY.rw(), token.rw()) as t: + nb_stream = cuda.external_stream(t.stream_ptr()) + dX, dY = numba_arguments(t) + axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY) + + ctx.finalize() + + # Sanity checks: verify the results after finalize + # First task: Y = 2.0 * X + Y = 2.0 * 1.0 + 1.0 = 3.0 + # Second task: Y = 2.0 * X + Y = 2.0 * 1.0 + 3.0 = 5.0 + assert np.allclose(X, 1.0), f"X should still be 1.0 (read-only), but got {X[0]}" + assert np.allclose(Y, 5.0), ( + f"Y should be 5.0 after two axpy operations, but got {Y[0]}" + ) + + +if __name__ == "__main__": + test_token()