Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
368 commits
Select commit Hold shift + click to select a range
dde406d
test with graphs
caugonnet Aug 28, 2025
7563014
parametrized tests
caugonnet Aug 28, 2025
ba4e9c3
Merge branch 'main' into stf_c_api
caugonnet Aug 28, 2025
b094c27
test that we get a stream in graph_task when capturing
caugonnet Aug 28, 2025
222c216
Save WIP: add a mockup of FHE example, which needs a like_empty method
caugonnet Aug 28, 2025
b04cebf
Implement like_empty
caugonnet Aug 28, 2025
9ed5ace
More comprehensive FHE test
caugonnet Aug 28, 2025
e27ef5b
test fhe with stf decorator
caugonnet Aug 28, 2025
d0f915e
Merge branch 'main' into stf_c_api
caugonnet Aug 28, 2025
6963ec0
fix merge error
caugonnet Aug 28, 2025
06fab11
Appropriate checks
caugonnet Aug 29, 2025
2fc802e
Add missing ;
caugonnet Aug 29, 2025
a43db62
- Make it possible to create a borrowed context from a handle
caugonnet Aug 29, 2025
9c07679
invert ctx and exec place in the decorator
caugonnet Aug 29, 2025
947bbcc
fix decorator api
caugonnet Aug 29, 2025
22b2d19
Add ciphertext.like_empty()
caugonnet Aug 29, 2025
66bcde3
Removing prints
caugonnet Aug 29, 2025
84534c8
do not import specific methods
caugonnet Aug 29, 2025
acf0cce
fix decorator api
caugonnet Aug 29, 2025
6a6e84f
Add a pytorch experiment
Aug 29, 2025
297a69b
more pytorch test
Aug 29, 2025
533ca5a
better interop with pytorch
Aug 29, 2025
9aa749f
remove useless pass
Aug 29, 2025
b11aa4b
tensor_arguments
Aug 29, 2025
0af151f
simpler code
Aug 29, 2025
746d308
pre-commit hooks
caugonnet Aug 29, 2025
d9195f5
try to remove dependency on torch and have adapters (WIP)
caugonnet Aug 31, 2025
f5ac828
remove unused code
caugonnet Aug 31, 2025
454a5da
cleanups
caugonnet Aug 31, 2025
ccfbb6b
fix numba adapter
caugonnet Aug 31, 2025
c6e7c07
skip torch test if torch is not available
caugonnet Aug 31, 2025
842a651
add dot vertex even in the low level api
caugonnet Aug 31, 2025
00c649c
fix types
caugonnet Aug 31, 2025
b0fc18d
pre-commit hooks
caugonnet Aug 31, 2025
3b257df
Merge branch 'main' into stf_c_api
caugonnet Aug 31, 2025
04cc07a
dot add_vertex is done in start() now
caugonnet Aug 31, 2025
bce25b8
Start to implement the FDTD example in pytorch
caugonnet Sep 1, 2025
d9c5f11
Start to port in STF version of pytorch
caugonnet Sep 1, 2025
70fa5d8
Adapt the FDTD example to use STF constructs and add methods to initi…
caugonnet Sep 1, 2025
5587a8d
format issue
caugonnet Sep 1, 2025
5ea5243
charset issue
caugonnet Sep 1, 2025
f7fbd34
rank agnostic method to init
caugonnet Sep 1, 2025
aec2d71
use .zero_() to blank fields
caugonnet Sep 1, 2025
eb71880
print values
caugonnet Sep 1, 2025
aaf6ec6
Experiment to display output as an image
caugonnet Sep 1, 2025
ae4c6d6
Use non blocking API
caugonnet Sep 2, 2025
9029fda
remove dead code
caugonnet Sep 2, 2025
ce7a33b
remove dead code
caugonnet Sep 2, 2025
cbde742
minor cleanup
caugonnet Sep 2, 2025
1936db6
Merge branch 'main' into stf_c_api
caugonnet Sep 2, 2025
c91e814
clang-format
caugonnet Sep 2, 2025
3fe6178
Add a C library for CUDASTF (to be used in the python bindings)
caugonnet Sep 2, 2025
666bd07
Merge branch 'main' into stf_c_lib
caugonnet Sep 2, 2025
522b630
remove dead code
caugonnet Sep 2, 2025
4315314
do define and use CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING
caugonnet Sep 2, 2025
48627aa
Add CUDASTF C lib to tests
caugonnet Sep 2, 2025
410aadd
Merge branch 'main' into stf_c_lib
caugonnet Sep 2, 2025
c87cdaa
Add missing headers
caugonnet Sep 2, 2025
02a9eb6
use snake_case
caugonnet Sep 2, 2025
232133b
Do define CCCL_C_EXPERIMENTAL=1
caugonnet Sep 2, 2025
b60eb6b
Do not do redundant tests
caugonnet Sep 2, 2025
c4c99f0
Add a project to ci/inspect_changes.sh
caugonnet Sep 2, 2025
2f5925b
missing changes in previous commit
caugonnet Sep 2, 2025
3417075
add presets
caugonnet Sep 2, 2025
8c05034
Add override matrix
alliepiper Sep 2, 2025
20faa8f
Properly define structs with a typedef and remove superfluous struct …
caugonnet Sep 3, 2025
d378f5a
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
8c5e760
fix previous merge
caugonnet Sep 3, 2025
78dc197
Change tensor_arguments to return an element instead of a tuple of on…
caugonnet Sep 3, 2025
2eb2ace
Remove intermediate structures and use opaque pointers instead
caugonnet Sep 3, 2025
6557067
Automatically generated documentation
caugonnet Sep 3, 2025
60266ff
Better implementation of the help to convert C places to the C++ API,…
caugonnet Sep 3, 2025
59f1983
Tell where to find cudax, and remove unnecessary libs
caugonnet Sep 3, 2025
c7fa9e6
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
97dd6f7
CCCL_ENABLE_C enables c/parallel, CCCL_ENABLE_C_EXPERIMENTAL_STF enab…
caugonnet Sep 3, 2025
1610f0b
Remove unnecessary definitions
caugonnet Sep 3, 2025
4383eaf
Merge branch 'main' into stf_c_lib
caugonnet Sep 3, 2025
101fd0b
Merge branch 'main' into stf_c_lib
caugonnet Sep 4, 2025
4db210b
Merge branch 'main' into stf_c_lib
caugonnet Sep 5, 2025
90a8d20
use more consistent option names
caugonnet Sep 5, 2025
f2d7528
Merge branch 'main' into stf_c_lib
caugonnet Sep 9, 2025
ac667ca
Do not use [[maybe_unused]] for the C lib header because this is only…
caugonnet Sep 9, 2025
5bf62b3
Return an error code in stf_cuda_kernel_add_desc rather than use asse…
caugonnet Sep 9, 2025
c0a54f1
clang-format
caugonnet Sep 9, 2025
4573f9f
Merge branch 'main' into stf_c_lib
caugonnet Sep 9, 2025
abc58d8
Merge branch 'main' into stf_c_api
caugonnet Sep 9, 2025
af43da5
Merge stf_c_lib: Update c/ directory with complete C library implemen…
caugonnet Sep 9, 2025
c00c915
Revert Python linting changes
caugonnet Sep 9, 2025
cdd0d85
Fix Python CMakeLists.txt: Update C library feature flags
caugonnet Sep 9, 2025
afda29f
Fix Python build: Add missing CCCL_ENABLE_C master flag
caugonnet Sep 9, 2025
4f1f079
Complete STF C library configuration: Enable all C library features a…
caugonnet Sep 9, 2025
ccfc41d
Remove obsolete CCCL_ENABLE_C flag
caugonnet Sep 9, 2025
e4b8277
Update CMake configuration to match stf_c_lib structure
caugonnet Sep 9, 2025
6931fa8
Optimize Python build: Remove unnecessary C parallel library
caugonnet Sep 9, 2025
a1a1139
clang-format
caugonnet Sep 9, 2025
a3071f7
Merge branch 'stf_c_lib' into stf_c_api
caugonnet Sep 9, 2025
ecd9f4e
fix pytorch example
caugonnet Sep 9, 2025
4b2ae75
use ascii symbols
caugonnet Sep 9, 2025
5881081
Merge branch 'main' into stf_c_api
caugonnet Sep 9, 2025
4eef870
Merge branch 'main' into stf_c_api
caugonnet Sep 10, 2025
dcb3d39
Cleanup some changes in the infra from a previous merge
caugonnet Sep 10, 2025
1284eb2
Implement logical_data_empty logical_data_zeros, and logical_data_full
caugonnet Sep 10, 2025
0514f29
short names for torch.cuda
caugonnet Sep 10, 2025
5e9b4d5
Introduce pytorch_task
caugonnet Sep 10, 2025
53a4542
clang-format and some minor comment
caugonnet Sep 10, 2025
989f58b
Merge branch 'main' into stf_c_api
caugonnet Sep 17, 2025
93055c0
Merge branch 'main' into stf_c_api
caugonnet Sep 23, 2025
218fda2
make sure stf python tests are wrapped into functions so that pytest …
caugonnet Sep 25, 2025
1f97482
fix the return values of pytests
caugonnet Sep 25, 2025
1e482a4
Merge branch 'main' into stf_c_api
caugonnet Sep 25, 2025
7a58d68
Start to experiment with Warp
caugonnet Sep 25, 2025
9fb1c26
logical_data in python are now initialized with a data place, and the…
caugonnet Sep 25, 2025
5c1d50e
Save WIP: add access modes
caugonnet Sep 25, 2025
9f31b1e
cleanups
caugonnet Sep 25, 2025
c0bb070
Save WIP
caugonnet Sep 25, 2025
7094dd5
Merge branch 'main' into stf_c_api
caugonnet Oct 7, 2025
76d78b4
Adapt to new python hierarchy
caugonnet Oct 8, 2025
e03b062
Merge branch 'main' into stf_c_api
caugonnet Oct 8, 2025
0c11b6a
fix errors in a previous merge
caugonnet Oct 8, 2025
f6c50e1
cuda.cccl.experimental.stf => cuda.stf
caugonnet Oct 8, 2025
efea184
Misc stf python tests improvements
caugonnet Oct 8, 2025
c0d3592
Save WIP on this warp example
caugonnet Oct 8, 2025
eba61eb
Add sanity checks to test the is_void_interface() API
caugonnet Oct 8, 2025
e17c261
support tokens in python
caugonnet Oct 8, 2025
ec9c955
remove debug print
caugonnet Oct 8, 2025
52f4823
python cholesky with cupy
caugonnet Oct 8, 2025
5a32881
improve cholesky example
caugonnet Oct 8, 2025
abd5778
POTRI and Cholesky
caugonnet Oct 9, 2025
80e1085
clang-format
caugonnet Oct 9, 2025
865cf7b
Merge branch 'main' into stf_c_api
caugonnet Oct 9, 2025
4c1551a
how changes to numba-cuda have been merged
caugonnet Oct 9, 2025
77d6af1
Merge branch 'main' into stf_c_api
caugonnet Nov 14, 2025
acc8f49
Merge branch 'main' into stf_c_api
andralex Nov 14, 2025
de333b2
Fix CI precommit
andralex Nov 14, 2025
3834c8f
Merge branch 'main' into stf_c_api
andralex Nov 15, 2025
9a5c265
no need for numba.cuda.config.CUDA_ENABLE_PYNVJITLINK = 1 anymore
caugonnet Nov 24, 2025
9932a24
Merge origin/main into stf_c_api
caugonnet Nov 24, 2025
e7e2adb
Our numba-cuda fix is part of 0.21.0
caugonnet Nov 24, 2025
39040a9
Minor doc fix
caugonnet Nov 25, 2025
8f27fa2
Ensure matplotlib is only used if available
caugonnet Nov 25, 2025
73ac963
Cleanup examples
caugonnet Nov 25, 2025
d90ed64
cmake fix
caugonnet Nov 25, 2025
eb77519
Cmake fixes (need extra cleanup)
caugonnet Nov 25, 2025
b38ff80
Work-around for lazy resource init during graph capture in cuda core
caugonnet Nov 25, 2025
0a3e667
Use a relaxed capture mode
caugonnet Nov 25, 2025
8642fdd
This work-around is not needed anymore with a relaxed capture mode
caugonnet Nov 25, 2025
2a75766
Merge branch 'main' into stf_c_api
caugonnet Nov 25, 2025
0f9865d
cleanup warp example
caugonnet Nov 25, 2025
6466347
Cleanups in the cython code for STF
caugonnet Nov 25, 2025
cfb2930
no need for math.prod for such a simple thing
caugonnet Nov 26, 2025
130ee2a
Simpler code to handle vector types
caugonnet Nov 26, 2025
4bb4d23
fix grid dimension
caugonnet Nov 26, 2025
b8c745e
Use from_dlpack
caugonnet Nov 26, 2025
fb2a3ba
Change the mock-up FHE toy example to have operations that are homomo…
caugonnet Nov 26, 2025
6c2f850
Merge branch 'main' into stf_c_api
caugonnet Nov 26, 2025
da2e1aa
Add some explanation for the use of a relaxed capture mode
caugonnet Nov 26, 2025
852b400
cleaner pytorch adapter
caugonnet Nov 26, 2025
9308af5
Merge branch 'main' into stf_c_api
caugonnet Nov 27, 2025
09913dc
Code simplification
caugonnet Nov 26, 2025
237b2c1
minor fixes
caugonnet Dec 16, 2025
dd6cc26
Merge branch 'main' into stf_c_api
caugonnet Feb 3, 2026
ac148e8
Merge branch 'main' into stf_c_api
caugonnet Feb 8, 2026
5fedcfb
remove a change from main
caugonnet Feb 9, 2026
1fa449f
Merge branch 'main' into stf_c_api
caugonnet Feb 9, 2026
9839495
avoid a pre-commit fail
caugonnet Feb 9, 2026
65155d1
Include STF python bindings in CI
caugonnet Feb 9, 2026
1cce4d4
Make the script executable
caugonnet Feb 9, 2026
1dbfd64
Disable CUFILE in the python build
caugonnet Feb 9, 2026
5545ffb
Attempt to fix compilation on aarch64
caugonnet Feb 9, 2026
291e00c
fix a type conversion issue
caugonnet Feb 9, 2026
3a12081
Merge branch 'main' into stf_c_api
caugonnet Feb 9, 2026
4b54abc
try to fix python packages
caugonnet Feb 9, 2026
97d9b8b
Merge branch 'main' into stf_c_api
caugonnet Feb 9, 2026
5f950c4
gersemi pre-commit hook
caugonnet Feb 10, 2026
8727b24
Conditionally provide the jit decorator if numba-cuda is available
caugonnet Feb 10, 2026
f4c8800
clang-format
caugonnet Feb 10, 2026
ac980ec
Skip STF with MSVC in CI
caugonnet Feb 11, 2026
4821ebd
Merge branch 'main' into stf_c_api
caugonnet Feb 11, 2026
8559e8c
More consistent examples
caugonnet Feb 12, 2026
698739e
pre-commit hooks
caugonnet Feb 12, 2026
4d73287
Add missing copyrights
caugonnet Feb 12, 2026
97ae928
add missing file
caugonnet Feb 12, 2026
6903af7
like_empty -> empty_like
caugonnet Feb 12, 2026
99655d3
Report if the STF bindings cannot be loaded
caugonnet Feb 12, 2026
096ea44
Avoid a global context variable in fhe tests
caugonnet Feb 12, 2026
08fa67d
support an optional name= field in logical_data init methods to have …
caugonnet Feb 12, 2026
5145dff
more consistent aliases in example
caugonnet Feb 12, 2026
cd51231
Fix cmake message
caugonnet Feb 12, 2026
5245067
Remove commented debug leftovers
caugonnet Feb 12, 2026
c055d52
Merge branch 'main' into stf_c_api
caugonnet Feb 12, 2026
79be7ec
fix string format
caugonnet Feb 12, 2026
606896d
Do not tamper HOST_COMPILER
caugonnet Feb 12, 2026
da0487e
Use the existing mechanism to cleanly exclude the test_py_stf job fro…
caugonnet Feb 12, 2026
0fd485a
Merge branch 'main' into stf_c_api
caugonnet Feb 12, 2026
9d8ed4b
Merge branch 'main' into stf_c_api
caugonnet Feb 13, 2026
d9b1ca5
Merge branch 'main' into stf_c_api
caugonnet Feb 13, 2026
f78290a
Merge branch 'main' into stf_c_api
caugonnet Feb 13, 2026
da27328
Restore C in STF's C lib
caugonnet Feb 13, 2026
fa4cc26
Experiment with composite data places and vmm allocations
caugonnet Feb 13, 2026
b576c35
Ensure a CUDA context exists when creating a localized_array
caugonnet Feb 14, 2026
f5ea347
Merge branch 'main' into stf_composite_places
caugonnet Feb 14, 2026
9b21b71
Some comment to explain why we have get_composite_alloc_registry
caugonnet Feb 14, 2026
8adb830
Simplify how we create and use a grid of green contexts
caugonnet Feb 14, 2026
a40d691
no need for a stream and add a check
caugonnet Feb 14, 2026
8624528
Add a test to use a data_place to create an allocator for thrust_device
caugonnet Feb 14, 2026
4599dcb
clang-format
caugonnet Feb 14, 2026
6493be1
Remove some useless comment
caugonnet Feb 14, 2026
1599b07
fix compilation
caugonnet Feb 14, 2026
f22aa6b
Merge branch 'main' into stf_c_api
caugonnet Feb 23, 2026
e4bafa9
Use cuda.core.Buffer.fill (except for 8 bytes values) instead of cupy…
caugonnet Feb 25, 2026
a0c8227
Move fill utilities
caugonnet Feb 25, 2026
2e63918
Make pytorch_task a free function and move it to the test directory
caugonnet Feb 25, 2026
b21d930
wrappers to build pytorch tensors outside of cuda.stf
caugonnet Feb 25, 2026
de6f2f8
Remove dead code
caugonnet Feb 25, 2026
2f89cc1
Move numba utilities outside of the core cuda.stf
caugonnet Feb 25, 2026
1df9b9b
clang-format
caugonnet Feb 25, 2026
97ef675
Move the jit numba decorator in tests too
caugonnet Feb 25, 2026
16e8eec
Add missing file
caugonnet Feb 25, 2026
9fc3250
Merge branch 'main' into stf_c_api
caugonnet Feb 25, 2026
a516a2e
Only keep a cupy fallback to fill 8bytes values, not both cupy and numba
caugonnet Feb 25, 2026
ae43907
Some details about the stf_cai for CAI v3
caugonnet Feb 25, 2026
64cf200
Use relative paths to fix tests in CI
caugonnet Feb 25, 2026
b7abbac
pre-commit hooks
caugonnet Feb 25, 2026
daae555
Add a doc for cuda.stf
caugonnet Feb 25, 2026
0089958
Ensure cuda.stf is usable
caugonnet Feb 26, 2026
201e198
Merge branch 'main' into stf_c_api
caugonnet Feb 26, 2026
96c2421
Try to fix cuda.stf CI
caugonnet Feb 26, 2026
f6d5c2a
Merge branch 'main' into stf_c_api
caugonnet Feb 26, 2026
664f61d
remove __init__.py from test/stf to avoid confusion between libs
caugonnet Feb 26, 2026
5d735d7
Merge branch 'main' into stf_c_api
caugonnet Feb 26, 2026
5ec3769
Experiments with thrust and memory resources
caugonnet Feb 27, 2026
e267ad0
Merge branch 'main' into stf_composite_places
caugonnet Feb 27, 2026
aab32d3
Add some comments to clarify intents
caugonnet Feb 27, 2026
5f0b044
Merge branch 'main' into stf_c_api
caugonnet Feb 28, 2026
cf7c11e
pre-commit hooks
caugonnet Feb 28, 2026
2570bae
Merge branch 'main' into stf_c_api
caugonnet Mar 2, 2026
62404a6
Merge branch 'main' into stf_c_api
caugonnet Mar 3, 2026
c7b6aab
Merge branch 'main' into stf_c_api
caugonnet Mar 9, 2026
b0df198
Merge branch 'main' into stf_c_api
caugonnet Mar 10, 2026
1f9b89a
There should be no __init__.py file here, otherwise tests becomes a p…
caugonnet Mar 10, 2026
94bdd96
Merge branch 'main' into stf_c_api
caugonnet Mar 10, 2026
832fd76
Disable SASS verification for tests which might generate LDL instruct…
caugonnet Mar 10, 2026
52fb401
Merge branch 'main' into stf_c_api
caugonnet Mar 10, 2026
5156a03
Merge branch 'main' into stf_c_api
caugonnet Mar 10, 2026
83a409e
Merge branch 'stf_c_api' into stf_composite_places
caugonnet Mar 10, 2026
a1aba8e
Update copyright
caugonnet Mar 11, 2026
fea6295
Use the simplified green_context_helper API
caugonnet Mar 11, 2026
a315a13
fix copyright year
caugonnet Mar 11, 2026
f1eabb7
Use the simplified green_context_helper API
caugonnet Mar 11, 2026
80fe8f2
Merge branch 'main' into stf_composite_places
caugonnet Mar 11, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion ci/matrix.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -325,6 +325,8 @@ workflows:
exclude:
# GPU runners are not available on Windows.
- {jobs: ['test', 'test_gpu', 'test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'], cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}
# STF C API and Python bindings are not built for MSVC:
- {jobs: ['test_py_stf'], cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}
# cudax doesn't support C++17 on msvc:
- {project: 'cudax', std: 17, cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}

Expand Down Expand Up @@ -478,6 +480,7 @@ jobs:
test_py_coop: { name: "Test cuda.coop", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
test_py_par: { name: "Test cuda.compute", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
test_py_examples: { name: "Test cuda.cccl.examples", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }
test_py_stf: { name: "Test cuda.stf", gpu: true, needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_stf'} }

# Run jobs for 'target' project (ci/util/build_and_test_targets.sh):
run_cpu: { gpu: false }
Expand Down Expand Up @@ -535,7 +538,7 @@ projects:
name: "Python"
job_map:
build: ['build_py_wheel']
test: ['test_py_headers', 'test_py_coop', 'test_py_par', 'test_py_examples']
test: ['test_py_headers', 'test_py_coop', 'test_py_par', 'test_py_examples', 'test_py_stf']
cccl_c_parallel:
name: 'CCCL C Parallel'
stds: [20]
Expand Down
30 changes: 30 additions & 0 deletions ci/test_cuda_stf_python.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#!/bin/bash

# Build (or fetch) the cuda_cccl wheel and run the cuda.stf Python test suite.
# Intended to be invoked by CI; also works locally (falls back to building the
# wheel when not running under GitHub Actions).

set -euo pipefail

ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
source "$ci_dir/pyenv_helper.sh"

# Parse common arguments (provides ${py_version})
source "$ci_dir/util/python/common_arg_parser.sh"
parse_python_args "$@"

# Extract the CUDA major version from `nvcc --version` (e.g. "release 12.4, V12.4.131" -> "12")
cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2)

# Setup Python environment
setup_python_env "${py_version}"

# Fetch or build the cuda_cccl wheel:
if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
  wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
  # Quote the artifact name to avoid word splitting / globbing (SC2086).
  "$ci_dir/util/artifacts/download.sh" "${wheel_artifact_name}" /home/coder/cccl/
else
  "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
fi

# Install cuda_cccl with the test extras matching the detected CUDA major version
CUDA_CCCL_WHEEL_PATH="$(ls /home/coder/cccl/wheelhouse/cuda_cccl-*.whl)"
python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-cu${cuda_major_version}]"

# Run tests for STF module
cd "/home/coder/cccl/python/cuda_cccl/tests/"
python -m pytest -n auto -v stf/
1 change: 1 addition & 0 deletions cudax/examples/stf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ set(
08-cub-reduce.cu
axpy-annotated.cu
void_data_interface.cu
thrust_device_data_place_allocator.cu
explicit_data_places.cu
thrust_zip_iterator.cu
1f1b.cu
Expand Down
130 changes: 130 additions & 0 deletions cudax/examples/stf/thrust_device_data_place_allocator.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
//===----------------------------------------------------------------------===//
//
// Part of CUDASTF in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//

/**
* @file
*
* @brief Example: Thrust device_vector with an allocator backed by a data_place.
* Uses thrust::mr::memory_resource to wrap data_place, then
* thrust::mr::allocator to create a compatible allocator.
* Storage is allocated via data_place::allocate (device, composite/VMM,
* or other place types). The same Thrust code works unchanged for
* single-device, multi-device (VMM), or green-context placement.
*/

#include <thrust/copy.h>
#include <thrust/device_ptr.h>
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/host_vector.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/mr/allocator.h>
#include <thrust/mr/memory_resource.h>
#include <thrust/transform.h>

#include <cuda/experimental/__stf/places/blocked_partition.cuh>
#include <cuda/experimental/__stf/places/exec/green_context.cuh>
#include <cuda/experimental/stf.cuh>

#include <iostream>

using namespace cuda::experimental::stf;

// Adapter between STF's data_place abstraction and the thrust::mr::memory_resource
// interface expected by Thrust allocators. Note that the resource must outlive any
// vectors/allocators that refer to it.
class data_place_memory_resource final : public thrust::mr::memory_resource<thrust::device_ptr<void>>
{
public:
  explicit data_place_memory_resource(const data_place& place)
      : place_(place)
  {}

  // Allocate `bytes` bytes of storage from the underlying data_place.
  // The alignment hint is ignored: data_place::allocate decides placement.
  pointer do_allocate(std::size_t bytes, std::size_t /*alignment*/) override
  {
    auto* storage = place_.allocate(static_cast<std::ptrdiff_t>(bytes));
    return thrust::device_ptr<void>(storage);
  }

  // Return storage previously obtained from do_allocate back to the data_place.
  void do_deallocate(pointer p, std::size_t bytes, std::size_t /*alignment*/) override
  {
    place_.deallocate(p.get(), bytes);
  }

  // Two resources compare equal iff they wrap the same data_place, so memory
  // allocated from one may be deallocated through the other.
  bool do_is_equal(const memory_resource& other) const noexcept override
  {
    if (auto* same_kind = dynamic_cast<const data_place_memory_resource*>(&other))
    {
      return place_ == same_kind->place_;
    }
    return false;
  }

private:
  data_place place_;
};

template <typename T>
using data_place_allocator = thrust::mr::allocator<T, data_place_memory_resource>;

// Run the Thrust example with the given data_place; returns true if the check passed.
bool run_with_place(const data_place& place, const char* label)
{
const size_t n = 1024 * 1024;

data_place_memory_resource memres(place);
data_place_allocator<double> alloc(&memres);
thrust::device_vector<double, data_place_allocator<double>> d_vec(n, 0.0, alloc);

thrust::transform(
thrust::device,
thrust::counting_iterator<size_t>(0),
thrust::counting_iterator<size_t>(n),
d_vec.begin(),
[] _CCCL_DEVICE(size_t i) {
return 2.0 * static_cast<double>(i);
});

thrust::host_vector<double> h_sample(4);
thrust::copy(d_vec.begin(), d_vec.begin() + 4, h_sample.begin());

bool ok = (h_sample[0] == 0.0 && h_sample[1] == 2.0 && h_sample[2] == 4.0 && h_sample[3] == 6.0);
if (!ok)
{
std::cerr << "thrust_device_data_place_allocator: " << label << " (" << place.to_string() << "): FAILED\n";
}
return ok;
}

int main()
{
  bool success = true;

  // Plain device placement on device 0.
  success &= run_with_place(data_place::device(0), "device(0)");

  // Composite placement across all devices (uses the VMM path when several
  // devices are present).
  success &= run_with_place(data_place::composite(blocked_partition(), exec_place::all_devices()),
                            "composite(blocked_partition, all_devices)");

#if _CCCL_CTK_AT_LEAST(12, 4)
  // Composite placement over a grid of green contexts, one data place per
  // green context.
  {
    const int num_sms = 8;
    const int dev_id  = 0;
    green_context_helper gc_helper(num_sms, dev_id);
    if (gc_helper.get_count() >= 1)
    {
      auto where          = gc_helper.get_grid(true);
      data_place grid_place = data_place::composite(blocked_partition(), where);
      success &= run_with_place(grid_place, "composite(blocked_partition, green_context_grid)");
    }
  }
#endif

  return success ? 0 : 1;
}
4 changes: 4 additions & 0 deletions cudax/examples/stf/void_data_interface.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,5 +49,9 @@ int main()
return cuda_kernel_desc{dummy_kernel, 16, 128, 0};
};

EXPECT(token.is_void_interface());
EXPECT(token2.is_void_interface());
EXPECT(token3.is_void_interface());

ctx.finalize();
}
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <cuda/experimental/__stf/utility/memory.cuh>
#include <cuda/experimental/__stf/utility/traits.cuh>

#include <array>
#include <list>
#include <random>
#include <unordered_map>
Expand Down Expand Up @@ -87,23 +88,24 @@ public:
, data_dims(data_dims)
, elemsize(elemsize)
{
// Ensure a current CUDA context exists so cuCtxGetDevice() and other driver
// APIs succeed (e.g. when no stream_ctx was used or after primary ctx release).
cuda_safe_call(cudaFree(nullptr));

// Regardless of the grid, we allow all devices to access that localized array
const int ndevs = cuda_try<cudaGetDeviceCount>();
CUdevice dev = cuda_try<cuCtxGetDevice>();

/* Check whether the current device supports UVA */
int supportsVMM = cuda_try<cuDeviceGetAttribute>(CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev);
// fprintf(stderr, "VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED ? %d\n", supportsVMM);
EXPECT(supportsVMM == 1, "Cannot create a localized_array object on this machine because it does not support VMM.");

/* Get allocation granularity */

CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = dev};

size_t alloc_granularity_bytes = cuda_try<cuMemGetAllocationGranularity>(&prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
// fprintf(stderr, "GRANULARITY = %ld KB\n", alloc_granularity_bytes / 1024);

// To make our life simpler for now: we assume that we only allocate full blocks
block_size_bytes = alloc_granularity_bytes;
Expand All @@ -117,9 +119,6 @@ public:
// Reserve a range of virtual addresses, round up size to accommodate granularity requirements
cuda_try(cuMemAddressReserve(&base_ptr, vm_total_size_bytes, 0ULL, 0ULL, 0ULL));

// fprintf(stderr, "cuMemAddressReserve => %p + %ld (%ld KB)\n", (void *)base_ptr, vm_total_size_bytes,
// vm_total_size_bytes / 1024);

::std::vector<CUmemAccessDesc> accessDesc(ndevs);
for (int d = 0; d < ndevs; d++)
{
Expand Down Expand Up @@ -219,7 +218,6 @@ public:

// Print visual block map (compact representation)
fprintf(stderr, "\nBlock ownership map (each char = 1 block, 0-9/a-z = place index):\n ");
// Build a map of place names to single-char indices
::std::unordered_map<::std::string, char> place_to_char;
char next_char = '0';
for (size_t i = 0; i < nblocks; i++)
Expand All @@ -245,7 +243,6 @@ public:
}
fprintf(stderr, "\n");

// Print legend
fprintf(stderr, "\n Legend:\n");
for (const auto& entry : place_to_char)
{
Expand All @@ -255,10 +252,7 @@ public:
fprintf(stderr, "==============================================\n\n");
}

// fprintf(stderr, "GOT %ld effective blocks (%ld blocks)\n", nblocks_effective, nblocks);

// Create a physical allocation per block, this is not mapped in
// virtual memory yet.
// Create a physical allocation per block, this is not mapped in virtual memory yet.
for (auto& item : meta)
{
int item_dev = device_ordinal(item.place);
Expand Down Expand Up @@ -289,7 +283,6 @@ public:
}
}
}
// fprintf(stderr, "localized_array (this = %p) : nblocks_effective %ld\n", this, nblocks_effective);
}

localized_array() = delete;
Expand All @@ -300,8 +293,6 @@ public:

~localized_array()
{
// fprintf(stderr, "~localized_array (this = %p) ... base ptr %p vm_total_size_bytes %ld - nblocks_effective
// %ld\n", this, (void *)base_ptr, vm_total_size_bytes, nblocks_effective);
for (auto& item : meta)
{
size_t offset = item.offset;
Expand Down Expand Up @@ -404,8 +395,6 @@ private:
stats.total_samples += nsamples;
stats.matching_samples += max_cnt;

// ::std::cout << "GOT BEST POS for offset " << linearized_index << " -> " << max_pos.string() << ::std::endl;

return max_pos;
#endif
}
Expand Down Expand Up @@ -486,4 +475,34 @@ public:
private:
reserved::linear_pool<localized_array> cache;
};

// Registry for composite data_place::allocate/deallocate (ownership of localized_array by base pointer)
// This is how we can retrieve the localized_array descriptor when calling
// deallocate with the device address returned by allocate.
// NOTE(review): function-local static, so construction is thread-safe, but
// concurrent insert/erase on the map itself is not synchronized — confirm
// callers only allocate/deallocate composite places from one thread at a time.
inline ::std::unordered_map<void*, ::std::unique_ptr<localized_array>>& get_composite_alloc_registry()
{
  static ::std::unordered_map<void*, ::std::unique_ptr<localized_array>> reg;
  return reg;
}

inline void* allocate_composite_data_place(const data_place& p, ::std::ptrdiff_t size)
{
EXPECT(p.is_composite());
const size_t size_u = static_cast<size_t>(size);
const exec_place_grid& grid = p.get_grid();
const get_executor_func_t& mapper = p.get_partitioner();
auto delinearize_1d = [](size_t i) {
return pos4(static_cast<ssize_t>(i), 0, 0, 0);
};
auto arr = ::std::make_unique<localized_array>(grid, mapper, delinearize_1d, size_u, 1, dim4(size_u));
void* ptr = arr->get_base_ptr();
get_composite_alloc_registry()[ptr] = ::std::move(arr);
return ptr;
}

// Release an allocation previously returned by allocate_composite_data_place().
// Erasing the registry entry destroys the owned localized_array, whose
// destructor releases the underlying (VMM) resources.
inline void deallocate_composite_data_place(void* ptr)
{
  auto& registry = get_composite_alloc_registry();
  registry.erase(ptr);
}
} // end namespace cuda::experimental::stf::reserved
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ public:
return ctxs[partition];
}

green_ctx_view get_view(size_t id)
/** @brief Return a non-owning view of the id-th green context (context, pool, device id). */
green_ctx_view get_view(size_t id) const
{
return green_ctx_view(ctxs[id], pools[id], devid);
}
Expand All @@ -269,6 +269,26 @@ public:
return ctxs.size();
}

/** @brief Build a grid of exec places from this helper's green contexts.
 *
 * The green contexts themselves were created by the helper; this method only
 * decides how each exec place's affine data place is represented.
 *
 * @param use_green_ctx_data_place If true, each place's affine data place is the
 * green context extension; if false, the default device data place.
 * @return exec_place_grid of green context places.
 */
exec_place_grid get_grid(bool use_green_ctx_data_place = false) const
{
  const size_t count = ctxs.size();
  ::std::vector<exec_place> grid_places;
  grid_places.reserve(count);
  for (size_t idx = 0; idx < count; ++idx)
  {
    grid_places.emplace_back(exec_place::green_ctx(get_view(idx), use_green_ctx_data_place));
  }
  return make_grid(mv(grid_places));
}

private:
friend class exec_place;

Expand Down
Loading
Loading