diff --git a/ci/matrix.yaml b/ci/matrix.yaml
index 237f6a2f681..4acbb308f90 100644
--- a/ci/matrix.yaml
+++ b/ci/matrix.yaml
@@ -325,6 +325,8 @@ workflows:
   exclude:
     # GPU runners are not available on Windows.
     - {jobs: ['test', 'test_gpu', 'test_nolid', 'test_lid0', 'test_lid1', 'test_lid2'], cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}
+     # STF C API and Python bindings are not built for MSVC:
+    - {jobs: ['test_py_stf'], cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}
     # cudax doesn't support C++17 on msvc:
     - {project: 'cudax', std: 17, cxx: ['msvc2019', 'msvc14.39', 'msvc2022']}
 
@@ -478,6 +480,7 @@ jobs:
   test_py_coop:     { name: "Test cuda.coop",  gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_coop'} }
   test_py_par:      { name: "Test cuda.compute",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_compute'} }
   test_py_examples: { name: "Test cuda.cccl.examples",     gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_cccl_examples'} }
+  test_py_stf:      { name: "Test cuda.stf",               gpu: true,  needs: 'build_py_wheel', force_producer_ctk: "pybuild", invoke: { prefix: 'test_cuda_stf'} }
 
   # Run jobs for 'target' project (ci/util/build_and_test_targets.sh):
   run_cpu: { gpu: false }
@@ -535,7 +538,7 @@ projects:
     name: "Python"
     job_map:
       build: ['build_py_wheel']
-      test:  ['test_py_headers', 'test_py_coop', 'test_py_par', 'test_py_examples']
+      test:  ['test_py_headers', 'test_py_coop', 'test_py_par', 'test_py_examples', 'test_py_stf']
   cccl_c_parallel:
     name: 'CCCL C Parallel'
     stds: [20]
diff --git a/ci/test_cuda_stf_python.sh b/ci/test_cuda_stf_python.sh
new file mode 100755
index 00000000000..c686c3859aa
--- /dev/null
+++ b/ci/test_cuda_stf_python.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+set -euo pipefail
+
+ci_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+source "$ci_dir/pyenv_helper.sh"
+
+# Parse common arguments
+source "$ci_dir/util/python/common_arg_parser.sh"
+parse_python_args "$@"
+cuda_major_version=$(nvcc --version | grep release | awk '{print $6}' | tr -d ',' | cut -d '.' -f 1 | cut -d 'V' -f 2)
+
+# Setup Python environment
+setup_python_env "${py_version}"
+
+# Fetch or build the cuda_cccl wheel:
+if [[ -n "${GITHUB_ACTIONS:-}" ]]; then
+  wheel_artifact_name=$("$ci_dir/util/workflow/get_wheel_artifact_name.sh")
+  "$ci_dir/util/artifacts/download.sh" ${wheel_artifact_name} /home/coder/cccl/
+else
+  "$ci_dir/build_cuda_cccl_python.sh" -py-version "${py_version}"
+fi
+
+# Install cuda_cccl
+CUDA_CCCL_WHEEL_PATH="$(ls /home/coder/cccl/wheelhouse/cuda_cccl-*.whl)"
+python -m pip install "${CUDA_CCCL_WHEEL_PATH}[test-cu${cuda_major_version}]"
+
+# Run tests for STF module
+cd "/home/coder/cccl/python/cuda_cccl/tests/"
+python -m pytest -n auto -v stf/
diff --git a/cudax/examples/stf/CMakeLists.txt b/cudax/examples/stf/CMakeLists.txt
index 48c52ca69bc..be43e9129ab 100644
--- a/cudax/examples/stf/CMakeLists.txt
+++ b/cudax/examples/stf/CMakeLists.txt
@@ -10,6 +10,7 @@ set(
   08-cub-reduce.cu
   axpy-annotated.cu
   void_data_interface.cu
+  thrust_device_data_place_allocator.cu
   explicit_data_places.cu
   thrust_zip_iterator.cu
   1f1b.cu
diff --git a/cudax/examples/stf/thrust_device_data_place_allocator.cu b/cudax/examples/stf/thrust_device_data_place_allocator.cu
new file mode 100644
index 00000000000..34f8cea13a9
--- /dev/null
+++ b/cudax/examples/stf/thrust_device_data_place_allocator.cu
@@ -0,0 +1,130 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ *
+ * @brief Example: Thrust device_vector with an allocator backed by a data_place.
+ *        Uses thrust::mr::memory_resource to wrap data_place, then
+ *        thrust::mr::allocator to create a compatible allocator.
+ *        Storage is allocated via data_place::allocate (device, composite/VMM,
+ *        or other place types). The same Thrust code works unchanged for
+ *        single-device, multi-device (VMM), or green-context placement.
+ */
+
+#include <thrust/copy.h>
+#include <thrust/device_ptr.h>
+#include <thrust/device_vector.h>
+#include <thrust/execution_policy.h>
+#include <thrust/host_vector.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/mr/allocator.h>
+#include <thrust/mr/memory_resource.h>
+#include <thrust/transform.h>
+
+#include <cuda/experimental/__stf/places/blocked_partition.cuh>
+#include <cuda/experimental/__stf/places/exec/green_context.cuh>
+#include <cuda/experimental/stf.cuh>
+
+#include <iostream>
+
+using namespace cuda::experimental::stf;
+
+// Minimal adapter: data_place is STF's abstraction; Thrust expects a
+// memory_resource. This class bridges the two. The resource must outlive
+// any vectors/allocators that use it.
+class data_place_memory_resource final : public thrust::mr::memory_resource<thrust::device_ptr<void>>
+{
+public:
+  explicit data_place_memory_resource(const data_place& place)
+      : place_(place)
+  {}
+
+  pointer do_allocate(std::size_t bytes, std::size_t /*alignment*/) override
+  {
+    void* raw = place_.allocate(static_cast<std::ptrdiff_t>(bytes));
+    return thrust::device_ptr<void>(raw);
+  }
+
+  void do_deallocate(pointer p, std::size_t bytes, std::size_t /*alignment*/) override
+  {
+    place_.deallocate(p.get(), bytes);
+  }
+
+  bool do_is_equal(const memory_resource& other) const noexcept override
+  {
+    auto* o = dynamic_cast<const data_place_memory_resource*>(&other);
+    return o && place_ == o->place_;
+  }
+
+private:
+  data_place place_;
+};
+
+template <typename T>
+using data_place_allocator = thrust::mr::allocator<T, data_place_memory_resource>;
+
+// Run the Thrust example with the given data_place; returns true if the check passed.
+bool run_with_place(const data_place& place, const char* label)
+{
+  const size_t n = 1024 * 1024;
+
+  data_place_memory_resource memres(place);
+  data_place_allocator<double> alloc(&memres);
+  thrust::device_vector<double, data_place_allocator<double>> d_vec(n, 0.0, alloc);
+
+  thrust::transform(
+    thrust::device,
+    thrust::counting_iterator<size_t>(0),
+    thrust::counting_iterator<size_t>(n),
+    d_vec.begin(),
+    [] _CCCL_DEVICE(size_t i) {
+      return 2.0 * static_cast<double>(i);
+    });
+
+  thrust::host_vector<double> h_sample(4);
+  thrust::copy(d_vec.begin(), d_vec.begin() + 4, h_sample.begin());
+
+  bool ok = (h_sample[0] == 0.0 && h_sample[1] == 2.0 && h_sample[2] == 4.0 && h_sample[3] == 6.0);
+  if (!ok)
+  {
+    std::cerr << "thrust_device_data_place_allocator: " << label << " (" << place.to_string() << "): FAILED\n";
+  }
+  return ok;
+}
+
+int main()
+{
+  bool all_ok = true;
+
+  // Device 0
+  all_ok &= run_with_place(data_place::device(0), "device(0)");
+
+  // All devices (composite, VMM path when multiple devices)
+  all_ok &= run_with_place(data_place::composite(blocked_partition(), exec_place::all_devices()),
+                           "composite(blocked_partition, all_devices)");
+
+#if _CCCL_CTK_AT_LEAST(12, 4)
+  // Example based on a grid of green contexts where we use a data place per green context
+  {
+    const int num_sms = 8;
+    const int dev_id  = 0;
+    green_context_helper gc_helper(num_sms, dev_id);
+    if (gc_helper.get_count() >= 1)
+    {
+      auto where     = gc_helper.get_grid(true);
+      data_place cdp = data_place::composite(blocked_partition(), where);
+      all_ok &= run_with_place(cdp, "composite(blocked_partition, green_context_grid)");
+    }
+  }
+#endif
+
+  return all_ok ? 0 : 1;
+}
diff --git a/cudax/examples/stf/void_data_interface.cu b/cudax/examples/stf/void_data_interface.cu
index 0340b16bf4a..bf429f23dfe 100644
--- a/cudax/examples/stf/void_data_interface.cu
+++ b/cudax/examples/stf/void_data_interface.cu
@@ -49,5 +49,9 @@ int main()
     return cuda_kernel_desc{dummy_kernel, 16, 128, 0};
   };
 
+  EXPECT(token.is_void_interface());
+  EXPECT(token2.is_void_interface());
+  EXPECT(token3.is_void_interface());
+
   ctx.finalize();
 }
diff --git a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
index 86275b4e8ce..0bf6c8bd9f9 100644
--- a/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
+++ b/cudax/include/cuda/experimental/__stf/localization/composite_slice.cuh
@@ -31,6 +31,7 @@
 #include <cuda/experimental/__stf/utility/memory.cuh>
 #include <cuda/experimental/__stf/utility/traits.cuh>
 
+#include <array>
 #include <list>
 #include <random>
 #include <unordered_map>
@@ -87,23 +88,24 @@ public:
       , data_dims(data_dims)
       , elemsize(elemsize)
   {
+    // Ensure a current CUDA context exists so cuCtxGetDevice() and other driver
+    // APIs succeed (e.g. when no stream_ctx was used or after primary ctx release).
+    cuda_safe_call(cudaFree(nullptr));
+
     // Regardless of the grid, we allow all devices to access that localized array
     const int ndevs = cuda_try<cudaGetDeviceCount>();
     CUdevice dev    = cuda_try<cuCtxGetDevice>();
 
     /* Check whether the current device supports UVA */
     int supportsVMM = cuda_try<cuDeviceGetAttribute>(CU_DEVICE_ATTRIBUTE_VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED, dev);
-    //        fprintf(stderr, "VIRTUAL_ADDRESS_MANAGEMENT_SUPPORTED ? %d\n", supportsVMM);
     EXPECT(supportsVMM == 1, "Cannot create a localized_array object on this machine because it does not support VMM.");
 
     /* Get allocation granularity */
-
     CUmemAllocationProp prop = {};
     prop.type                = CU_MEM_ALLOCATION_TYPE_PINNED;
     prop.location            = {.type = CU_MEM_LOCATION_TYPE_DEVICE, .id = dev};
 
     size_t alloc_granularity_bytes = cuda_try<cuMemGetAllocationGranularity>(&prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM);
-    //        fprintf(stderr, "GRANULARITY = %ld KB\n", alloc_granularity_bytes / 1024);
 
     // To make our life simpler for now: we assume that we only allocate full blocks
     block_size_bytes = alloc_granularity_bytes;
@@ -117,9 +119,6 @@ public:
     // Reserve a range of virtual addresses, round up size to accommodate granularity requirements
     cuda_try(cuMemAddressReserve(&base_ptr, vm_total_size_bytes, 0ULL, 0ULL, 0ULL));
 
-    // fprintf(stderr, "cuMemAddressReserve => %p + %ld (%ld KB)\n", (void *)base_ptr, vm_total_size_bytes,
-    //                 vm_total_size_bytes / 1024);
-
     ::std::vector<CUmemAccessDesc> accessDesc(ndevs);
     for (int d = 0; d < ndevs; d++)
     {
@@ -219,7 +218,6 @@ public:
 
       // Print visual block map (compact representation)
       fprintf(stderr, "\nBlock ownership map (each char = 1 block, 0-9/a-z = place index):\n  ");
-      // Build a map of place names to single-char indices
       ::std::unordered_map<::std::string, char> place_to_char;
       char next_char = '0';
       for (size_t i = 0; i < nblocks; i++)
@@ -245,7 +243,6 @@ public:
       }
       fprintf(stderr, "\n");
 
-      // Print legend
       fprintf(stderr, "\n  Legend:\n");
       for (const auto& entry : place_to_char)
       {
@@ -255,10 +252,7 @@ public:
       fprintf(stderr, "==============================================\n\n");
     }
 
-    // fprintf(stderr, "GOT %ld effective blocks (%ld blocks)\n", nblocks_effective, nblocks);
-
-    // Create a physical allocation per block, this is not mapped in
-    // virtual memory yet.
+    // Create a physical allocation per block, this is not mapped in virtual memory yet.
     for (auto& item : meta)
     {
       int item_dev = device_ordinal(item.place);
@@ -289,7 +283,6 @@ public:
         }
       }
     }
-    // fprintf(stderr, "localized_array (this = %p) : nblocks_effective %ld\n", this, nblocks_effective);
   }
 
   localized_array()                                  = delete;
@@ -300,8 +293,6 @@ public:
 
   ~localized_array()
   {
-    // fprintf(stderr, "~localized_array (this = %p) ... base ptr %p vm_total_size_bytes %ld - nblocks_effective
-    // %ld\n", this, (void *)base_ptr, vm_total_size_bytes, nblocks_effective);
     for (auto& item : meta)
     {
       size_t offset = item.offset;
@@ -404,8 +395,6 @@ private:
     stats.total_samples += nsamples;
     stats.matching_samples += max_cnt;
 
-    // ::std::cout << "GOT BEST POS for offset " << linearized_index << " -> " << max_pos.string() << ::std::endl;
-
     return max_pos;
 #endif
   }
@@ -486,4 +475,34 @@ public:
 private:
   reserved::linear_pool<localized_array> cache;
 };
+
+// Registry for composite data_place::allocate/deallocate (ownership of localized_array by base pointer)
+// This is how we can retrieve the localized_array descriptor when calling
+// deallocate with the device address returned by allocate.
+inline ::std::unordered_map<void*, ::std::unique_ptr<localized_array>>& get_composite_alloc_registry()
+{
+  static ::std::unordered_map<void*, ::std::unique_ptr<localized_array>> reg;
+  return reg;
+}
+
+inline void* allocate_composite_data_place(const data_place& p, ::std::ptrdiff_t size)
+{
+  EXPECT(p.is_composite());
+  const size_t size_u               = static_cast<size_t>(size);
+  const exec_place_grid& grid       = p.get_grid();
+  const get_executor_func_t& mapper = p.get_partitioner();
+  auto delinearize_1d               = [](size_t i) {
+    return pos4(static_cast<ssize_t>(i), 0, 0, 0);
+  };
+  auto arr  = ::std::make_unique<localized_array>(grid, mapper, delinearize_1d, size_u, 1, dim4(size_u));
+  void* ptr = arr->get_base_ptr();
+  get_composite_alloc_registry()[ptr] = ::std::move(arr);
+  return ptr;
+}
+
+inline void deallocate_composite_data_place(void* ptr)
+{
+  // Cleanup of the actual array resources (VMM resources) is handled in the destructor of localized_array.
+  get_composite_alloc_registry().erase(ptr);
+}
 } // end namespace cuda::experimental::stf::reserved
diff --git a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
index 2ce9691a0ba..8e55bd287a6 100644
--- a/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/exec/green_context.cuh
@@ -253,7 +253,7 @@ public:
     return ctxs[partition];
   }
 
-  green_ctx_view get_view(size_t id)
+  green_ctx_view get_view(size_t id) const
   {
     return green_ctx_view(ctxs[id], pools[id], devid);
   }
@@ -269,6 +269,26 @@ public:
     return ctxs.size();
   }
 
+  /** @brief Build a grid of exec places from this helper's green contexts.
+   *
+   * The green contexts are already created by the helper; this only chooses how
+   * each exec place's affine data place is represented.
+   *
+   * @param use_green_ctx_data_place If true, each place's affine data place is the
+   *        green context extension; if false, the default device data place.
+   * @return exec_place_grid of green context places.
+   */
+  exec_place_grid get_grid(bool use_green_ctx_data_place = false) const
+  {
+    ::std::vector<exec_place> places;
+    places.reserve(ctxs.size());
+    for (size_t i = 0; i < ctxs.size(); i++)
+    {
+      places.push_back(exec_place::green_ctx(get_view(i), use_green_ctx_data_place));
+    }
+    return make_grid(mv(places));
+  }
+
 private:
   friend class exec_place;
 
diff --git a/cudax/include/cuda/experimental/__stf/places/places.cuh b/cudax/include/cuda/experimental/__stf/places/places.cuh
index d2de042452e..5f75d5b109d 100644
--- a/cudax/include/cuda/experimental/__stf/places/places.cuh
+++ b/cudax/include/cuda/experimental/__stf/places/places.cuh
@@ -60,6 +60,14 @@ class exec_place_green_ctx;
 //! Function type for computing executor placement from data coordinates
 using get_executor_func_t = pos4 (*)(pos4, dim4, dim4);
 
+class data_place;
+
+namespace reserved
+{
+void* allocate_composite_data_place(const data_place& p, ::std::ptrdiff_t size);
+void deallocate_composite_data_place(void* ptr);
+} // namespace reserved
+
 /**
  * @brief Designates where data will be stored (CPU memory vs. on device 0 (first GPU), device 1 (second GPU), ...)
  *
@@ -478,10 +486,13 @@ public:
     {
       cuda_safe_call(cudaMallocManaged(&result, size));
     }
+    else if (is_composite())
+    {
+      return reserved::allocate_composite_data_place(*this, size);
+    }
     else
     {
       // Device allocation
-      EXPECT(!is_composite(), "Composite places don't support direct allocation");
       const int prev_dev   = cuda_try<cudaGetDevice>();
       const int target_dev = devid;
 
@@ -524,6 +535,12 @@ public:
       return;
     }
 
+    if (is_composite())
+    {
+      reserved::deallocate_composite_data_place(ptr);
+      return;
+    }
+
     if (is_host())
     {
       cuda_safe_call(cudaFreeHost(ptr));
@@ -572,6 +589,11 @@ public:
     {
       return extension->allocation_is_stream_ordered();
     }
+    // Composite uses immediate VMM path, not stream-ordered APIs
+    if (is_composite())
+    {
+      return false;
+    }
     // Host and managed are immediate (stream ignored), device is stream-ordered
     return !is_host() && !is_managed();
   }
@@ -2194,3 +2216,5 @@ UNITTEST("Exec place as std::map key")
 };
 #endif // UNITTESTED_FILE
 } // end namespace cuda::experimental::stf
+
+#include <cuda/experimental/__stf/localization/composite_slice.cuh>
diff --git a/cudax/test/stf/CMakeLists.txt b/cudax/test/stf/CMakeLists.txt
index dd78f1c5f15..51df41b13d3 100644
--- a/cudax/test/stf/CMakeLists.txt
+++ b/cudax/test/stf/CMakeLists.txt
@@ -122,6 +122,7 @@ set(
   green_context/axpy_gc.cu
   green_context/cuda_graph.cu
   green_context/gc_grid.cu
+  green_context/gc_grid_vmm.cu
   hash/ctx_hash.cu
   hash/logical_data.cu
   hashtable/fusion.cu
diff --git a/cudax/test/stf/cpp/task_get_stream.cu b/cudax/test/stf/cpp/task_get_stream.cu
index 89fa74e7490..2d6509e5a73 100644
--- a/cudax/test/stf/cpp/task_get_stream.cu
+++ b/cudax/test/stf/cpp/task_get_stream.cu
@@ -24,7 +24,8 @@ void test_stream()
   context ctx;
 
   auto token = ctx.token();
-  auto t     = ctx.task(token.write());
+  EXPECT(token.is_void_interface());
+  auto t = ctx.task(token.write());
   t.start();
   cudaStream_t s = t.get_stream();
   EXPECT(s != nullptr);
diff --git a/cudax/test/stf/green_context/gc_grid_vmm.cu b/cudax/test/stf/green_context/gc_grid_vmm.cu
new file mode 100644
index 00000000000..313701ee3b7
--- /dev/null
+++ b/cudax/test/stf/green_context/gc_grid_vmm.cu
@@ -0,0 +1,100 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of CUDASTF in CUDA C++ Core Libraries,
+// under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+// SPDX-FileCopyrightText: Copyright (c) 2022-2026 NVIDIA CORPORATION & AFFILIATES.
+//
+//===----------------------------------------------------------------------===//
+
+/**
+ * @file
+ *
+ * @brief Test that a grid of green contexts with explicit affine data places
+ *        (green context data place extension) and a blocked policy forces
+ *        the VMM path for composite allocation.
+ *
+ * When the grid is made of exec_place::green_ctx(..., true), each place's
+ * affine data place is the green context extension. Forming a composite
+ * data place from that grid and using the modular API (cdp.allocate(n),
+ * cdp.deallocate(ptr, n)) triggers the localized_array (VMM) path.
+ */
+
+#include <cuda/experimental/__stf/places/blocked_partition.cuh>
+#include <cuda/experimental/__stf/places/exec/green_context.cuh>
+#include <cuda/experimental/stf.cuh>
+
+#include <vector>
+
+using namespace cuda::experimental::stf;
+
+#if _CCCL_CTK_AT_LEAST(12, 4)
+__global__ void init_kernel(double* ptr, size_t n)
+{
+  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < n)
+  {
+    ptr[i] = static_cast<double>(i);
+  }
+}
+
+void run_gc_grid_vmm_test()
+{
+  const int num_sms = 8;
+  const int dev_id  = 0;
+  green_context_helper gc_helper(num_sms, dev_id);
+
+  if (gc_helper.get_count() < 1)
+  {
+    fprintf(stderr, "No green contexts available, skipping VMM path test.\n");
+    return;
+  }
+
+  // Grid of green context places with explicit affine data places (VMM path).
+  auto where = gc_helper.get_grid(true);
+
+  // Composite data place with blocked partitioner: allocation on this place
+  // uses the VMM path (localized_array).
+  data_place cdp = data_place::composite(blocked_partition(), where);
+
+  EXPECT(cdp.is_composite());
+  EXPECT(!cdp.allocation_is_stream_ordered());
+
+  // 8MB allocation to have enough blocks to test the VMM path.
+  const size_t n          = 1024 * 1024;
+  const size_t size_bytes = n * sizeof(double);
+
+  // Allocate directly on the composite place (VMM path)
+  void* ptr = cdp.allocate(size_bytes);
+  EXPECT(ptr != nullptr);
+
+  double* d_ptr = static_cast<double*>(ptr);
+  // Launch on default stream
+  int block = 256;
+  int grid  = static_cast<int>((n + block - 1) / block);
+  init_kernel<<<grid, block>>>(d_ptr, n);
+  cuda_safe_call(cudaDeviceSynchronize());
+
+  // Check initialized values: load entire buffer and verify every entry
+  ::std::vector<double> h_buf(n);
+  cuda_safe_call(cudaMemcpy(h_buf.data(), d_ptr, size_bytes, cudaMemcpyDeviceToHost));
+  for (size_t i = 0; i < n; i++)
+  {
+    EXPECT(h_buf[i] == static_cast<double>(i));
+  }
+
+  cdp.deallocate(ptr, size_bytes);
+}
+#endif // _CCCL_CTK_AT_LEAST(12, 4)
+
+int main()
+{
+#if _CCCL_CTK_BELOW(12, 4)
+  fprintf(stderr, "Green contexts are not supported by this version of CUDA: skipping test.\n");
+  return 0;
+#else
+  run_gc_grid_vmm_test();
+  return 0;
+#endif
+}
diff --git a/docs/python/index.rst b/docs/python/index.rst
index 24995ab1b3d..57e0afde188 100644
--- a/docs/python/index.rst
+++ b/docs/python/index.rst
@@ -14,6 +14,9 @@ abstractions for CUDA Python developers.
 * :doc:`cuda.coop <coop>` — Cooperative block- and warp-level algorithms for
   writing highly efficient CUDA kernels with `Numba CUDA <https://nvidia.github.io/numba-cuda/>`_.
 
+* :doc:`cuda.stf <stf>` — Sequential Task Flow for CUDA: define logical data and
+  tasks with read/write annotations; STF orchestrates execution and data movement.
+
 These libraries expose the generic, highly-optimized algorithms from the
 `CCCL C++ libraries <https://nvidia.github.io/cccl/cpp.html>`_,
 which have been tuned to provide optimal performance across GPU architectures.
@@ -34,5 +37,6 @@ Who is this for?
    setup
    compute
    coop
+   stf
    resources
    api_reference
diff --git a/docs/python/stf.rst b/docs/python/stf.rst
new file mode 100644
index 00000000000..3144976cdee
--- /dev/null
+++ b/docs/python/stf.rst
@@ -0,0 +1,97 @@
+.. _cccl-python-stf:
+
+``cuda.stf``: Sequential Task Flow
+==================================
+
+The ``cuda.stf`` module provides a Python binding to the **Sequential Task Flow (STF)**
+model for CUDA: you define logical data and submit tasks that read or write that data;
+STF infers dependencies and orchestrates execution and data movement. For the full
+description of the model, see the :ref:`C++ CUDASTF documentation <stf>`.
+
+Example
+-------
+
+The following example creates a context, registers three arrays as logical data, and
+submits four tasks with different read/write annotations. STF orders the tasks so that
+dependencies are respected (e.g. the task that writes ``Y`` runs after the one that
+reads ``X`` and writes ``Y``).
+
+.. literalinclude:: ../../python/cuda_cccl/tests/stf/test_context.py
+   :language: python
+   :pyobject: test_ctx3
+   :caption: Context, logical data, and tasks. `View complete source on GitHub <https://github.com/NVIDIA/cccl/blob/main/python/cuda_cccl/tests/stf/test_context.py>`__
+
+Context and logical data
+-------------------------
+
+Create a **context** with :func:`context() <cuda.stf.context>` (optionally
+``use_graph=True`` for CUDA graph execution). All logical data and tasks belong to
+one context. When you are done submitting tasks, call :meth:`finalize() <context.finalize>`
+to run the graph and synchronize (or let the context be destroyed; ``finalize`` is
+called automatically).
+
+**Logical data** represents a buffer that tasks access. Create it from existing
+buffers or allocate new ones:
+
+* :meth:`logical_data(buf, ...) <context.logical_data>` — from a NumPy array or any
+  object implementing the **CUDA Array Interface** (CuPy, PyTorch, Numba device arrays)
+  or the Python buffer protocol. For GPU arrays, pass :ref:`data_place <stf-data-place>`
+  (e.g. ``data_place.device(0)``).
+* :meth:`logical_data_empty(shape, dtype, ...) <context.logical_data_empty>` —
+  uninitialized allocation.
+* :meth:`logical_data_full(shape, fill_value, ...) <context.logical_data_full>` —
+  allocated and filled with a constant (like ``numpy.full()``).
+* :meth:`logical_data_zeros` / :meth:`logical_data_ones` — convenience wrappers.
+
+Pass each logical data into a task with an access mode: :meth:`read()`, :meth:`write()`,
+or :meth:`rw()`. Example: ``ctx.task(lX.read(), lY.rw())``.
+
+Tasks and interop
+-----------------
+
+Use ``with ctx.task(...) as t:`` to get a task handle. Inside the block:
+
+* **Stream** — :meth:`t.stream_ptr() <task.stream_ptr>` returns the task’s CUDA stream
+  (as an integer). Wrap it for your framework (e.g. ``numba.cuda.external_stream(t.stream_ptr())``
+  or ``torch.cuda.ExternalStream(t.stream_ptr())``).
+* **Buffer views** — :meth:`t.get_arg_cai(index) <task.get_arg_cai>` and
+  :meth:`t.args_cai() <task.args_cai>` return object(s) that implement the
+  **CUDA Array Interface**, so you can pass them to Numba (``cuda.from_cuda_array_interface(...)``),
+  PyTorch (``torch.as_tensor(...)``), or CuPy (``cp.asarray(...)``).
+
+The ``cuda.stf`` package does not ship Numba/PyTorch helpers; see
+`tests/stf/numba_helpers.py <https://github.com/NVIDIA/cccl/blob/main/python/cuda_cccl/tests/stf/numba_helpers.py>`_,
+`numba_decorator.py <https://github.com/NVIDIA/cccl/blob/main/python/cuda_cccl/tests/stf/numba_decorator.py>`_,
+and `pytorch_task.py <https://github.com/NVIDIA/cccl/blob/main/python/cuda_cccl/tests/stf/pytorch_task.py>`_
+for examples.
+
+Places
+------
+
+.. _stf-exec-place:
+
+* **Execution place** (``exec_place``) — where the task runs. Pass as the first
+  argument to ``ctx.task(...)``: ``exec_place.device(device_id)`` or ``exec_place.host()``.
+  Example: ``ctx.task(exec_place.device(0), lX.read(), lY.rw())``.
+
+.. _stf-data-place:
+
+* **Data place** (``data_place``) — where logical data lives: ``data_place.host()``,
+  ``data_place.device(device_id)``, ``data_place.managed()``. Use when creating
+  logical data or in a dependency, e.g. ``lZ.rw(data_place.device(1))``.
+
+Tokens
+------
+
+:meth:`ctx.token() <context.token>` creates a **token** (logical data with no buffer)
+for ordering tasks without data transfer. Use ``token.read()`` or ``token.rw()`` in
+task dependencies.
+
+Example collections
+-------------------
+
+For runnable examples (Numba kernels, PyTorch, tokens, multi-GPU, FDTD), see the
+`STF tests and examples <https://github.com/NVIDIA/cccl/tree/main/python/cuda_cccl/tests/stf>`_.
+
+For the full STF programming model, graph visualization, and C++ API, see
+:ref:`CUDASTF (C++) <stf>`.
diff --git a/python/cuda_cccl/CMakeLists.txt b/python/cuda_cccl/CMakeLists.txt
index 922458c02e2..5816853915e 100644
--- a/python/cuda_cccl/CMakeLists.txt
+++ b/python/cuda_cccl/CMakeLists.txt
@@ -17,9 +17,34 @@ message(
 
 # Build cccl.c.parallel and add CCCL's install rules
 set(_cccl_root ../..)
+
+# Build and install C++ library first
 set(CCCL_TOPLEVEL_PROJECT ON) # Enable the developer builds
 set(CCCL_ENABLE_C_PARALLEL ON) # Build the cccl.c.parallel library
+# STF C interface and Python bindings are not compatible with MSVC; disable on Windows.
+if (CMAKE_CXX_COMPILER_ID STREQUAL "MSVC")
+  set(CCCL_ENABLE_C_EXPERIMENTAL_STF OFF)
+  message(STATUS "STF disabled: not compatible with MSVC")
+else()
+  set(CCCL_ENABLE_C_EXPERIMENTAL_STF ON)
+endif()
+set(CCCL_ENABLE_UNSTABLE ON) # Enable unstable features
+
+# Disable all testing, examples, and benchmarks - we only want the libraries
+set(CCCL_ENABLE_TESTING OFF)
+set(CCCL_ENABLE_EXAMPLES OFF)
+set(CCCL_ENABLE_BENCHMARKS OFF)
+set(CCCL_C_PARALLEL_ENABLE_TESTING OFF)
+set(CCCL_C_EXPERIMENTAL_STF_ENABLE_TESTING OFF)
+# Note: CCCL_ENABLE_CUDAX must be ON because STF depends on it (via CCCL_ENABLE_UNSTABLE)
+# But disable cudax tests, examples, and header testing
+set(cudax_ENABLE_TESTING OFF)
+set(cudax_ENABLE_EXAMPLES OFF)
+set(cudax_ENABLE_HEADER_TESTING OFF)
+set(cudax_ENABLE_CUFILE OFF)
 set(CCCL_C_PARALLEL_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME})
+set(CCCL_C_EXPERIMENTAL_STF_LIBRARY_OUTPUT_DIRECTORY ${SKBUILD_PROJECT_NAME})
+
 # Just install the rest:
 set(libcudacxx_ENABLE_INSTALL_RULES ON)
 set(CUB_ENABLE_INSTALL_RULES ON)
@@ -34,10 +59,29 @@ add_subdirectory(${_cccl_root} _parent_cccl)
 set(CMAKE_INSTALL_LIBDIR "${old_libdir}") # pop
 set(CMAKE_INSTALL_INCLUDEDIR "${old_includedir}") # pop
 
-# ensure the destination directory exists
+# Create CCCL::cudax alias for STF (normally created by cccl-config.cmake)
+if (
+  CCCL_ENABLE_C_EXPERIMENTAL_STF
+  AND TARGET cudax::cudax
+  AND NOT TARGET CCCL::cudax
+)
+  add_library(CCCL::cudax ALIAS cudax::cudax)
+endif()
+
+# Ensure the destination directory exists
 file(MAKE_DIRECTORY "cuda/compute/${CUDA_VERSION_DIR}/cccl")
+if (CCCL_ENABLE_C_EXPERIMENTAL_STF)
+  file(MAKE_DIRECTORY "cuda/stf/${CUDA_VERSION_DIR}/cccl")
+endif()
 
 # Install version-specific binaries
+if (CCCL_ENABLE_C_EXPERIMENTAL_STF)
+  install(
+    TARGETS cccl.c.experimental.stf
+    DESTINATION cuda/stf/${CUDA_VERSION_DIR}/cccl
+  )
+endif()
+
 install(
   TARGETS cccl.c.parallel
   DESTINATION cuda/compute/${CUDA_VERSION_DIR}/cccl
@@ -116,6 +160,7 @@ python3_add_library(
   WITH_SOABI
   "${_generated_extension_src}"
 )
+
 add_dependencies(_bindings_impl cythonize_bindings_impl)
 target_link_libraries(
   _bindings_impl
@@ -126,3 +171,53 @@ target_link_libraries(
 set_target_properties(_bindings_impl PROPERTIES INSTALL_RPATH "$ORIGIN/cccl")
 
 install(TARGETS _bindings_impl DESTINATION cuda/compute/${CUDA_VERSION_DIR})
+
+if (CCCL_ENABLE_C_EXPERIMENTAL_STF)
+  message(STATUS "STF Using Cython ${CYTHON_VERSION}")
+  set(
+    stf_pyx_source_file
+    "${cuda_cccl_SOURCE_DIR}/cuda/stf/_stf_bindings_impl.pyx"
+  )
+  set(
+    _stf_generated_extension_src
+    "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c"
+  )
+  set(_stf_depfile "${cuda_cccl_BINARY_DIR}/_stf_bindings_impl.c.dep")
+  add_custom_command(
+    OUTPUT "${_stf_generated_extension_src}"
+    COMMAND "${Python3_EXECUTABLE}" -m cython
+    ARGS
+      ${CYTHON_FLAGS_LIST} "${stf_pyx_source_file}" --output-file
+      ${_stf_generated_extension_src}
+    DEPENDS "${stf_pyx_source_file}"
+    DEPFILE "${_stf_depfile}"
+    COMMENT "Cythonizing ${stf_pyx_source_file} for CUDA ${CUDA_VERSION_MAJOR}"
+  )
+  set_source_files_properties(
+    "${_stf_generated_extension_src}"
+    PROPERTIES GENERATED TRUE
+  )
+  add_custom_target(
+    cythonize_stf_bindings_impl
+    ALL
+    DEPENDS "${_stf_generated_extension_src}"
+  )
+
+  python3_add_library(
+    _stf_bindings_impl
+    MODULE
+    WITH_SOABI
+    "${_stf_generated_extension_src}"
+  )
+  add_dependencies(_stf_bindings_impl cythonize_stf_bindings_impl)
+  target_link_libraries(
+    _stf_bindings_impl
+    PRIVATE cccl.c.experimental.stf CUDA::cuda_driver
+  )
+  set_target_properties(
+    _stf_bindings_impl
+    PROPERTIES INSTALL_RPATH "$ORIGIN/cccl"
+  )
+
+  install(TARGETS _stf_bindings_impl DESTINATION cuda/stf/${CUDA_VERSION_DIR})
+endif()
diff --git a/python/cuda_cccl/cuda/stf/__init__.py b/python/cuda_cccl/cuda/stf/__init__.py
new file mode 100644
index 00000000000..1e230c66699
--- /dev/null
+++ b/python/cuda_cccl/cuda/stf/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+from __future__ import annotations
+
+from ._stf_bindings import _BINDINGS_AVAILABLE  # type: ignore[attr-defined]
+
+if not _BINDINGS_AVAILABLE:
+    __all__ = ["_BINDINGS_AVAILABLE"]
+
+    def __getattr__(name: str):
+        raise AttributeError(
+            f"Cannot access 'cuda.stf.{name}' because CUDASTF bindings are not available. "
+            "This typically means you're running on a CPU-only machine without CUDA drivers installed, "
+            "or that cuda-cccl was not built with STF support."
+        )
+else:
+    from ._stf_bindings import (
+        context,
+        data_place,
+        dep,
+        exec_place,
+    )
+
+    __all__ = [
+        "_BINDINGS_AVAILABLE",
+        "context",
+        "dep",
+        "exec_place",
+        "data_place",
+    ]
diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings.py b/python/cuda_cccl/cuda/stf/_stf_bindings.py
new file mode 100644
index 00000000000..3fb42ca3d2c
--- /dev/null
+++ b/python/cuda_cccl/cuda/stf/_stf_bindings.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# _stf_bindings.py is a shim module that imports symbols from a
+# _stf_bindings_impl extension module. The shim serves the same purposes as
+# cuda.compute._bindings:
+#
+# 1. Import a CUDA-specific extension. The wheel ships cuda/stf/cu12/ and
+#   cuda/stf/cu13/; at runtime this shim chooses based on the detected CUDA
+#   version and imports all symbols from the matching extension.
+#
+# 2. Preload `nvrtc` and `nvJitLink` before importing the extension (indirect
+#   dependencies via cccl.c.parallel / cccl.c.experimental.stf).
+
+from __future__ import annotations
+
+import importlib
+
+from cuda.cccl._cuda_version_utils import detect_cuda_version, get_recommended_extra
+from cuda.pathfinder import (  # type: ignore[import-not-found]
+    load_nvidia_dynamic_lib,
+)
+
+
+def _load_cuda_libraries():
+    """
+    Preload CUDA libraries to ensure proper symbol resolution.
+
+    These libraries are indirect dependencies pulled in via cccl.c.parallel.
+    Preloading ensures reliable symbol resolution regardless of dynamic linker behavior.
+    """
+    import warnings
+
+    for libname in ("nvrtc", "nvJitLink"):
+        try:
+            load_nvidia_dynamic_lib(libname)
+        except Exception as e:
+            # Log warning but don't fail - the extension might still work
+            # if the libraries are already loaded or available through other means
+            warnings.warn(
+                f"Failed to preload CUDA library '{libname}': {e}. "
+                f"STF bindings may fail to load if {libname} is not available.",
+                RuntimeWarning,
+                stacklevel=2,
+            )
+
+
+_load_cuda_libraries()
+
+
+# Import the appropriate bindings implementation depending on what
+# CUDA version is available:
+cuda_version = detect_cuda_version()
+if cuda_version not in [12, 13]:
+    raise RuntimeError(
+        f"Unsupported CUDA version: {cuda_version}. Only CUDA 12 and 13 are supported."
+    )
+
+extra_name = get_recommended_extra(cuda_version)
+module_suffix = f".{extra_name}._stf_bindings_impl"
+
+_BINDINGS_AVAILABLE = False
+
+try:
+    bindings_module = importlib.import_module(module_suffix, __package__)
+    # Import all symbols from the module
+    globals().update(bindings_module.__dict__)
+    _BINDINGS_AVAILABLE = True
+except ImportError as e:
+    import warnings
+
+    warnings.warn(
+        f"CUDASTF bindings for CUDA {cuda_version} not available: {e}",
+        RuntimeWarning,
+    )
diff --git a/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx
new file mode 100644
index 00000000000..1f06bfde426
--- /dev/null
+++ b/python/cuda_cccl/cuda/stf/_stf_bindings_impl.pyx
@@ -0,0 +1,841 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# distutils: language = c++
+# cython: language_level=3
+# cython: linetrace=True
+
+# Python signatures are declared in the companion Python stub file _bindings.pyi
+# Make sure to update PYI with change to Python API to ensure that Python
+# static type checker tools like mypy green-lights cuda.cccl.parallel
+
+from cpython.buffer cimport (
+    Py_buffer, PyBUF_FORMAT, PyBUF_ND, PyBUF_SIMPLE, PyBUF_ANY_CONTIGUOUS,
+    PyObject_GetBuffer, PyBuffer_Release, PyObject_CheckBuffer
+)
+from cpython.bytes cimport PyBytes_FromStringAndSize
+from cpython.pycapsule cimport (
+    PyCapsule_CheckExact, PyCapsule_IsValid, PyCapsule_GetPointer
+)
+from libc.stdint cimport uint8_t, uint32_t, uint64_t, int64_t, uintptr_t
+from libc.string cimport memset, memcpy
+
+import numpy as np
+
+import ctypes
+from enum import IntFlag
+
+cdef extern from "<cuda.h>":
+    cdef struct OpaqueCUstream_st
+    cdef struct OpaqueCUkernel_st
+    cdef struct OpaqueCUlibrary_st
+
+    ctypedef int CUresult
+    ctypedef OpaqueCUstream_st *CUstream
+    ctypedef OpaqueCUkernel_st *CUkernel
+    ctypedef OpaqueCUlibrary_st *CUlibrary
+
+cdef extern from "cccl/c/experimental/stf/stf.h":
+    #
+    # Contexts
+    #
+    ctypedef struct stf_ctx_handle_t
+    ctypedef stf_ctx_handle_t* stf_ctx_handle
+    void stf_ctx_create(stf_ctx_handle* ctx)
+    void stf_ctx_create_graph(stf_ctx_handle* ctx)
+    void stf_ctx_finalize(stf_ctx_handle ctx)
+
+    #
+    # Exec places
+    #
+    ctypedef enum stf_exec_place_kind:
+        STF_EXEC_PLACE_DEVICE
+        STF_EXEC_PLACE_HOST
+
+    ctypedef struct stf_exec_place_device:
+        int dev_id
+
+    ctypedef struct stf_exec_place_host:
+        int dummy
+
+    ctypedef union stf_exec_place_u:
+        stf_exec_place_device device
+        stf_exec_place_host   host
+
+    ctypedef struct stf_exec_place:
+        stf_exec_place_kind kind
+        stf_exec_place_u    u
+
+    stf_exec_place make_device_place(int  dev_id)
+    stf_exec_place make_host_place()
+
+    #
+    # Data places
+    #
+    ctypedef enum stf_data_place_kind:
+        STF_DATA_PLACE_DEVICE
+        STF_DATA_PLACE_HOST
+        STF_DATA_PLACE_MANAGED
+        STF_DATA_PLACE_AFFINE
+
+    ctypedef struct stf_data_place_device:
+        int dev_id
+
+    ctypedef struct stf_data_place_host:
+        int dummy
+
+    ctypedef struct stf_data_place_managed:
+        int dummy
+
+    ctypedef struct stf_data_place_affine:
+        int dummy
+
+    ctypedef union stf_data_place_u:
+        stf_data_place_device device
+        stf_data_place_host   host
+        stf_data_place_managed   managed
+        stf_data_place_affine   affine
+
+    ctypedef struct stf_data_place:
+        stf_data_place_kind kind
+        stf_data_place_u    u
+
+    stf_data_place make_device_data_place(int  dev_id)
+    stf_data_place make_host_data_place()
+    stf_data_place make_managed_data_place()
+    stf_data_place make_affine_data_place()
+
+    ctypedef struct stf_logical_data_handle_t
+    ctypedef stf_logical_data_handle_t* stf_logical_data_handle
+    void stf_logical_data(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz)
+    void stf_logical_data_with_place(stf_ctx_handle ctx, stf_logical_data_handle* ld, void* addr, size_t sz, stf_data_place dplace)
+    void stf_logical_data_set_symbol(stf_logical_data_handle ld, const char* symbol)
+    void stf_logical_data_destroy(stf_logical_data_handle ld)
+    void stf_logical_data_empty(stf_ctx_handle ctx, size_t length, stf_logical_data_handle *to)
+
+    void stf_token(stf_ctx_handle ctx, stf_logical_data_handle* ld);
+
+    ctypedef struct stf_task_handle_t
+    ctypedef stf_task_handle_t* stf_task_handle
+    void stf_task_create(stf_ctx_handle ctx, stf_task_handle* t)
+    void stf_task_set_exec_place(stf_task_handle t, stf_exec_place* exec_p)
+    void stf_task_set_symbol(stf_task_handle t, const char* symbol)
+    void stf_task_add_dep(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m)
+    void stf_task_add_dep_with_dplace(stf_task_handle t, stf_logical_data_handle ld, stf_access_mode m, stf_data_place* data_p)
+    void stf_task_start(stf_task_handle t)
+    void stf_task_end(stf_task_handle t)
+    void stf_task_enable_capture(stf_task_handle t)
+    CUstream stf_task_get_custream(stf_task_handle t)
+    void* stf_task_get(stf_task_handle t, int submitted_index)
+    void stf_task_destroy(stf_task_handle t)
+
+    cdef enum stf_access_mode:
+        STF_NONE
+        STF_READ
+        STF_WRITE
+        STF_RW
+
+class AccessMode(IntFlag):
+    NONE  = STF_NONE
+    READ  = STF_READ
+    WRITE = STF_WRITE
+    RW    = STF_RW
+
+class stf_cai:
+    """
+    Wrapper that exposes __cuda_array_interface__ for interop (torch, cupy, etc.).
+    Supports dict-style access (e.g. obj['data']) for code that expects a CAI dict.
+    """
+    def __init__(self, ptr, tuple shape, dtype, stream=0):
+        self.ptr = ptr               # integer device pointer
+        self.shape = shape
+        self.dtype = np.dtype(dtype)
+        self.stream = stream        # CUDA stream handle (int or 0)
+        self.__cuda_array_interface__ = {
+            'version': 2,
+            'shape': self.shape,
+            'typestr': self.dtype.str,     # e.g., '<f4' for float32
+            'data': (self.ptr, False),     # (ptr, read-only?)
+            'strides': None,               # or tuple of strides in bytes
+            'stream': self.stream if self.stream != 0 else None,  # CAI v3: 0 disallowed
+        }
+
+    def __getitem__(self, key):
+        return self.__cuda_array_interface__[key]
+
+    def get(self, key, default=None):
+        return self.__cuda_array_interface__.get(key, default)
+
+cdef class logical_data:
+    cdef stf_logical_data_handle _ld
+    cdef stf_ctx_handle _ctx
+
+    cdef object _dtype
+    cdef tuple  _shape
+    cdef int    _ndim
+    cdef size_t _len
+    cdef str    _symbol  # Store symbol for display purposes
+    cdef readonly bint _is_token  # readonly makes it accessible from Python
+
+    def __cinit__(self, context ctx=None, object buf=None, data_place dplace=None, shape=None, dtype=None, str name=None):
+        cdef Py_buffer view
+        cdef int flags
+
+        if ctx is None or buf is None:
+            # allow creation via __new__ (eg. in empty_like)
+            self._ld = NULL
+            self._ctx = NULL
+            self._len = 0
+            self._dtype = None
+            self._shape = ()
+            self._ndim = 0
+            self._symbol = None
+            self._is_token = False
+            return
+
+        self._ctx = ctx._ctx
+        self._symbol = None  # Initialize symbol
+        self._is_token = False  # Initialize token flag
+
+        # Default to host data place if not specified (matches C++ API)
+        if dplace is None:
+            dplace = data_place.host()
+
+        # Try CUDA Array Interface first
+        if hasattr(buf, '__cuda_array_interface__'):
+            cai = buf.__cuda_array_interface__
+
+            # Extract CAI information
+            data_ptr, readonly = cai['data']
+            original_shape = cai['shape']
+            typestr = cai['typestr']
+
+            # Handle vector types (e.g., wp.vec2, wp.vec3)
+            # Use structured dtype from descr if available
+            if typestr.startswith('|V') and 'descr' in cai:
+                # Vector/structured type - use descr field
+                self._dtype = np.dtype(cai['descr'])
+            else:
+                # Regular scalar type or vector without descr - use typestr
+                self._dtype = np.dtype(typestr)
+
+            # Shape is always the same regardless of type
+            self._shape = original_shape
+
+            self._ndim = len(self._shape)
+
+            # Calculate total size in bytes
+            itemsize = self._dtype.itemsize
+            total_items = 1
+            for dim in self._shape:
+                total_items *= dim
+            self._len = total_items * itemsize
+
+            # Create STF logical data using the new C API with data place specification
+            stf_logical_data_with_place(ctx._ctx, &self._ld, <void*><uintptr_t>data_ptr, self._len, dplace._c_place)
+
+        else:
+            # Fallback to Python buffer protocol
+            flags = PyBUF_FORMAT | PyBUF_ND          # request dtype + shape
+
+            if PyObject_GetBuffer(buf, &view, flags) != 0:
+                raise ValueError("object doesn't support the full buffer protocol or __cuda_array_interface__")
+
+            try:
+                self._ndim  = view.ndim
+                self._len = view.len
+                self._shape = tuple(<Py_ssize_t>view.shape[i] for i in range(view.ndim))
+                self._dtype = np.dtype(view.format)
+                # For buffer protocol objects, use the specified data place (defaults to host)
+                stf_logical_data_with_place(ctx._ctx, &self._ld, view.buf, view.len, dplace._c_place)
+
+            finally:
+                PyBuffer_Release(&view)
+
+        # Apply symbol name if provided
+        if name is not None:
+            self.set_symbol(name)
+
+
+    def set_symbol(self, str name):
+        stf_logical_data_set_symbol(self._ld, name.encode())
+        self._symbol = name  # Store locally for retrieval
+
+    @property
+    def symbol(self):
+        """Get the symbol name of this logical data, if set."""
+        return self._symbol
+
+    def __dealloc__(self):
+        if self._ld != NULL:
+            stf_logical_data_destroy(self._ld)
+            self._ld = NULL
+
+    def __repr__(self):
+        """Return a detailed string representation of the logical_data object."""
+        return (f"logical_data(shape={self._shape}, dtype={self._dtype}, "
+                f"is_token={self._is_token}, symbol={self._symbol!r}, "
+                f"len={self._len}, ndim={self._ndim})")
+
+    @property
+    def dtype(self):
+        """Return the dtype of the logical data."""
+        return self._dtype
+
+    @property
+    def shape(self):
+        """Return the shape of the logical data."""
+        return self._shape
+
+    def read(self, dplace=None):
+        return dep(self, AccessMode.READ.value, dplace)
+
+    def write(self, dplace=None):
+        return dep(self, AccessMode.WRITE.value, dplace)
+
+    def rw(self, dplace=None):
+        return dep(self, AccessMode.RW.value, dplace)
+
+    def empty_like(self):
+        """
+        Create a new logical_data with the same shape (and dtype metadata)
+        as this object.
+        """
+        if self._ld == NULL:
+            raise RuntimeError("source logical_data handle is NULL")
+
+        cdef logical_data out = logical_data.__new__(logical_data)
+        stf_logical_data_empty(self._ctx, self._len, &out._ld)
+        out._ctx   = self._ctx
+        out._dtype = self._dtype
+        out._shape = self._shape
+        out._ndim  = self._ndim
+        out._len   = self._len
+        out._symbol = None  # New object has no symbol initially
+        out._is_token = False
+
+        return out
+
+    @staticmethod
+    def token(context ctx):
+        cdef logical_data out = logical_data.__new__(logical_data)
+        out._ctx   = ctx._ctx
+        out._dtype = None
+        out._shape = None
+        out._ndim  = 0
+        out._len   = 0
+        out._symbol = None  # New object has no symbol initially
+        out._is_token = True
+        stf_token(ctx._ctx, &out._ld)
+
+        return out
+
+    @staticmethod
+    def init_by_shape(context ctx, shape, dtype, str name=None):
+        """
+        Create a new logical_data from a shape and a dtype.
+        """
+        cdef logical_data out = logical_data.__new__(logical_data)
+        out._ctx   = ctx._ctx
+        out._dtype = np.dtype(dtype)
+        out._shape = shape
+        out._ndim  = len(shape)
+        cdef size_t total_items = 1
+        for dim in shape:
+            total_items *= dim
+        out._len   = total_items * out._dtype.itemsize
+        out._symbol = None
+        out._is_token = False
+        stf_logical_data_empty(ctx._ctx, out._len, &out._ld)
+
+        if name is not None:
+            out.set_symbol(name)
+
+        return out
+
+    def borrow_ctx_handle(self):
+        ctx = context(borrowed=True)
+        ctx.borrow_from_handle(self._ctx)
+        return ctx
+
+class dep:
+    __slots__ = ("ld", "mode", "dplace")
+    def __init__(self, logical_data ld, int mode, dplace=None):
+        self.ld   = ld
+        self.mode = mode
+        self.dplace = dplace  # can be None or a data place
+    def __iter__(self):      # nice unpacking support
+        yield self.ld
+        yield self.mode
+        yield self.dplace
+    def __repr__(self):
+        return f"dep({self.ld!r}, {self.mode}, {self.dplace!r})"
+    def get_ld(self):
+        return self.ld
+
+def read(ld, dplace=None):   return dep(ld, AccessMode.READ.value, dplace)
+def write(ld, dplace=None):  return dep(ld, AccessMode.WRITE.value, dplace)
+def rw(ld, dplace=None):     return dep(ld, AccessMode.RW.value, dplace)
+
+cdef class exec_place:
+    cdef stf_exec_place _c_place
+
+    def __cinit__(self):
+        # empty default constructor; never directly used
+        pass
+
+    @staticmethod
+    def device(int dev_id):
+        cdef exec_place p = exec_place.__new__(exec_place)
+        p._c_place = make_device_place(dev_id)
+        return p
+
+    @staticmethod
+    def host():
+        cdef exec_place p = exec_place.__new__(exec_place)
+        p._c_place = make_host_place()
+        return p
+
+    @property
+    def kind(self) -> str:
+        return ("device" if self._c_place.kind == STF_EXEC_PLACE_DEVICE
+                else "host")
+
+    @property
+    def device_id(self) -> int:
+        if self._c_place.kind != STF_EXEC_PLACE_DEVICE:
+            raise AttributeError("not a device execution place")
+        return self._c_place.u.device.dev_id
+
+cdef class data_place:
+    cdef stf_data_place _c_place
+
+    def __cinit__(self):
+        # empty default constructor; never directly used
+        pass
+
+    @staticmethod
+    def device(int dev_id):
+        cdef data_place p = data_place.__new__(data_place)
+        p._c_place = make_device_data_place(dev_id)
+        return p
+
+    @staticmethod
+    def host():
+        cdef data_place p = data_place.__new__(data_place)
+        p._c_place = make_host_data_place()
+        return p
+
+    @staticmethod
+    def managed():
+        cdef data_place p = data_place.__new__(data_place)
+        p._c_place = make_managed_data_place()
+        return p
+
+    @staticmethod
+    def affine():
+        cdef data_place p = data_place.__new__(data_place)
+        p._c_place = make_affine_data_place()
+        return p
+
+    @property
+    def kind(self) -> str:
+        cdef stf_data_place_kind k = self._c_place.kind
+        if k == STF_DATA_PLACE_DEVICE:
+            return "device"
+        elif k == STF_DATA_PLACE_HOST:
+            return "host"
+        elif k == STF_DATA_PLACE_MANAGED:
+            return "managed"
+        elif k == STF_DATA_PLACE_AFFINE:
+            return "affine"
+        else:
+            raise ValueError(f"Unknown data place kind: {k}")
+
+    @property
+    def device_id(self) -> int:
+        if self._c_place.kind != STF_DATA_PLACE_DEVICE:
+            raise AttributeError("not a device data place")
+        return self._c_place.u.device.dev_id
+
+
+
+cdef class task:
+    cdef stf_task_handle _t
+
+    # list of logical data in deps: we need this because we can't exchange
+    # dtype/shape easily through the C API of STF
+    cdef list _lds_args
+
+    def __cinit__(self, context ctx):
+        stf_task_create(ctx._ctx, &self._t)
+        self._lds_args = []
+
+    def __dealloc__(self):
+        if self._t != NULL:
+             stf_task_destroy(self._t)
+
+    def start(self):
+        # This is ignored if this is not a graph task
+        stf_task_enable_capture(self._t)
+
+        stf_task_start(self._t)
+
+    def end(self):
+        stf_task_end(self._t)
+
+    def add_dep(self, object d):
+        """
+        Accept a `dep` instance created with read(ld), write(ld), or rw(ld).
+        """
+        if not isinstance(d, dep):
+            raise TypeError("add_dep expects read(ld), write(ld) or rw(ld)")
+
+        cdef logical_data ldata = <logical_data> d.ld
+        cdef int           mode_int  = int(d.mode)
+        cdef stf_access_mode mode_ce = <stf_access_mode> mode_int
+        cdef data_place dp
+
+        if d.dplace is None:
+            stf_task_add_dep(self._t, ldata._ld, mode_ce)
+        else:
+            dp = <data_place> d.dplace
+            stf_task_add_dep_with_dplace(self._t, ldata._ld, mode_ce, &dp._c_place)
+
+        self._lds_args.append(ldata)
+
+    def set_exec_place(self, object exec_p):
+       if not isinstance(exec_p, exec_place):
+           raise TypeError("set_exec_place expects and exec_place argument")
+
+       cdef exec_place ep = <exec_place> exec_p
+       stf_task_set_exec_place(self._t, &ep._c_place)
+
+    def stream_ptr(self) -> int:
+        """
+        Return the raw CUstream pointer as a Python int
+        (memory address).  Suitable for ctypes or PyCUDA.
+        """
+        cdef CUstream s = stf_task_get_custream(self._t)
+        return <uintptr_t> s         # cast pointer -> Py int
+
+    def get_arg(self, index) -> int:
+        if self._lds_args[index]._is_token:
+           raise RuntimeError("cannot materialize a token argument")
+
+        cdef void *ptr = stf_task_get(self._t, index)
+        return <uintptr_t>ptr
+
+    def get_arg_cai(self, index):
+        """Return the argument as an stf_cai object (has __cuda_array_interface__; supports obj['data'] etc.).
+        The underlying memory is owned by the task/context; keep the task (or context) alive while using the returned view."""
+        ptr = self.get_arg(index)
+        return stf_cai(ptr, self._lds_args[index].shape, self._lds_args[index].dtype, stream=self.stream_ptr())
+
+    def args_cai(self):
+        """
+        Return all non-token buffer arguments as stf_cai objects (have __cuda_array_interface__).
+        Returns None, a single object, or a tuple. Use from non-shipped code (e.g. tests) to
+        convert to numba/torch/cupy via from_cuda_array_interface or torch.as_tensor(obj).
+        Keep the task (or context) alive while any consumer uses the returned view(s).
+        """
+        non_token_cais = [self.get_arg_cai(i) for i in range(len(self._lds_args))
+                          if not self._lds_args[i]._is_token]
+
+        if len(non_token_cais) == 0:
+            return None
+        elif len(non_token_cais) == 1:
+            return non_token_cais[0]
+        return tuple(non_token_cais)
+
+    # ---- context‑manager helpers -------------------------------
+    def __enter__(self):
+        self.start()
+        return self
+
+    def __exit__(self, object exc_type, object exc, object tb):
+        """
+        Always called, even if an exception occurred inside the block.
+        """
+        self.end()
+        return False
+
+cdef class context:
+    cdef stf_ctx_handle _ctx
+    # Is this a context that we have borrowed ?
+    cdef bint _borrowed
+
+    def __cinit__(self, bint use_graph=False, bint borrowed=False):
+        self._ctx = <stf_ctx_handle>NULL
+        self._borrowed = borrowed
+        if not borrowed:
+            if use_graph:
+                stf_ctx_create_graph(&self._ctx)
+            else:
+                stf_ctx_create(&self._ctx)
+
+    cdef borrow_from_handle(self, stf_ctx_handle ctx_handle):
+        if self._ctx != NULL:
+            raise RuntimeError("context already initialized")
+
+        if not self._borrowed:
+            raise RuntimeError("cannot call borrow_from_handle on this context")
+
+        self._ctx = ctx_handle
+
+    def __repr__(self):
+        return f"context(handle={<uintptr_t>self._ctx}, borrowed={self._borrowed})"
+
+    def __dealloc__(self):
+        if not self._borrowed:
+            self.finalize()
+
+    def finalize(self):
+        if self._borrowed:
+            raise RuntimeError("cannot finalize borrowed context")
+
+        if self._ctx != NULL:
+                stf_ctx_finalize(self._ctx)
+        self._ctx = NULL
+
+    def logical_data(self, object buf, data_place dplace=None, str name=None):
+        """
+        Create and return a `logical_data` object bound to this context [PRIMARY API].
+
+        This is the primary function for creating logical data from existing buffers.
+        It supports both Python buffer protocol objects and CUDA Array Interface objects,
+        with explicit data_place specification for optimal STF data movement strategies.
+
+        Parameters
+        ----------
+        buf : any buffer‑supporting Python object or __cuda_array_interface__ object
+              (NumPy array, Warp array, CuPy array, bytes, bytearray, memoryview, …)
+        dplace : data_place, optional
+              Specifies where the buffer is located (host, device, managed, affine).
+              Defaults to data_place.host() for backward compatibility.
+              Essential for GPU arrays - use data_place.device() for optimal performance.
+        name : str, optional
+              Symbol name for debugging and DOT graph output.
+
+        Examples
+        --------
+        >>> # Host memory (explicit - recommended)
+        >>> host_place = data_place.host()
+        >>> ld = ctx.logical_data(numpy_array, host_place)
+        >>>
+        >>> # GPU device memory (recommended for CUDA arrays)
+        >>> device_place = data_place.device(0)
+        >>> ld = ctx.logical_data(warp_array, device_place)
+        >>>
+        >>> # With a symbol name for debugging
+        >>> ld = ctx.logical_data(numpy_array, name="X")
+        >>>
+        >>> # Backward compatibility (defaults to host)
+        >>> ld = ctx.logical_data(numpy_array)  # Same as specifying host
+
+        Note
+        ----
+        For GPU arrays (Warp, CuPy, etc.), always specify data_place.device()
+        for zero-copy performance and correct memory management.
+        """
+        return logical_data(self, buf, dplace, name=name)
+
+
+    def logical_data_empty(self, shape, dtype=None, str name=None):
+        """
+        Create logical data with uninitialized values.
+
+        Equivalent to numpy.empty() but for STF logical data.
+
+        Parameters
+        ----------
+        shape : tuple
+            Shape of the array
+        dtype : numpy.dtype, optional
+            Data type. Defaults to np.float64.
+        name : str, optional
+            Symbol name for debugging and DOT graph output.
+
+        Returns
+        -------
+        logical_data
+            New logical data with uninitialized values
+
+        Examples
+        --------
+        >>> # Create uninitialized array (fast but contains garbage)
+        >>> ld = ctx.logical_data_empty((100, 100), dtype=np.float32)
+
+        >>> # Fast allocation without initialization
+        >>> ld = ctx.logical_data_empty((50, 50, 50), name="tmp")
+        """
+        if dtype is None:
+            dtype = np.float64
+        return logical_data.init_by_shape(self, shape, dtype, name)
+
+    def logical_data_full(self, shape, fill_value, dtype=None, where=None, exec_place=None, str name=None):
+        """
+        Create logical data initialized with a constant value.
+
+        Similar to numpy.full(), this creates a new logical data with the given
+        shape and fills it with fill_value.
+
+        Parameters
+        ----------
+        shape : tuple
+            Shape of the array
+        fill_value : scalar
+            Value to fill the array with
+        dtype : numpy.dtype, optional
+            Data type. If None, infer from fill_value.
+        where : data_place, optional
+            Data placement for initialization. Defaults to current device.
+        exec_place : exec_place, optional
+            Execution place for the fill operation. Defaults to current device.
+            Note: exec_place.host() is not yet supported.
+        name : str, optional
+            Symbol name for debugging and DOT graph output.
+
+        Returns
+        -------
+        logical_data
+            New logical data initialized with fill_value
+
+        Examples
+        --------
+        >>> # Create array filled with epsilon0 on current device
+        >>> ld = ctx.logical_data_full((100, 100), 8.85e-12, dtype=np.float64)
+
+        >>> # Create array on host memory
+        >>> ld = ctx.logical_data_full((50, 50), 1.0, where=data_place.host())
+
+        >>> # With a symbol name
+        >>> ld = ctx.logical_data_full((200, 200), 0.0, name="epsilon")
+        """
+        # Infer dtype from fill_value if not provided
+        if dtype is None:
+            dtype = np.array(fill_value).dtype
+        else:
+            dtype = np.dtype(dtype)
+
+        # Validate exec_place - host execution not yet supported
+        if exec_place is not None:
+            if hasattr(exec_place, 'kind') and exec_place.kind == "host":
+                raise NotImplementedError(
+                    "exec_place.host() is not yet supported for logical_data_full. "
+                    "Use exec_place.device() or omit exec_place parameter."
+                )
+
+        # Create empty logical data
+        ld = self.logical_data_empty(shape, dtype, name)
+
+        # Initialize with the specified value (cuda.core.Buffer.fill; CuPy/Numba fallback for 8-byte)
+        try:
+            from cuda.stf.fill_utils import init_logical_data
+            init_logical_data(self, ld, fill_value, where, exec_place)
+        except ImportError as e:
+            raise RuntimeError("Fill support (cuda.core) is not available for logical_data_full") from e
+
+        return ld
+
+    def logical_data_zeros(self, shape, dtype=None, where=None, exec_place=None, str name=None):
+        """
+        Create logical data filled with zeros.
+
+        Equivalent to numpy.zeros() but for STF logical data.
+
+        Parameters
+        ----------
+        shape : tuple
+            Shape of the array
+        dtype : numpy.dtype, optional
+            Data type. Defaults to np.float64.
+        where : data_place, optional
+            Data placement. Defaults to current device.
+        exec_place : exec_place, optional
+            Execution place for the fill operation. Defaults to current device.
+        name : str, optional
+            Symbol name for debugging and DOT graph output.
+
+        Returns
+        -------
+        logical_data
+            New logical data filled with zeros
+
+        Examples
+        --------
+        >>> # Create zero-filled array
+        >>> ld = ctx.logical_data_zeros((100, 100), dtype=np.float32)
+
+        >>> # Create on host memory with a name
+        >>> ld = ctx.logical_data_zeros((50, 50), where=data_place.host(), name="Z")
+        """
+        if dtype is None:
+            dtype = np.float64
+        return self.logical_data_full(shape, 0.0, dtype, where, exec_place, name)
+
+    def logical_data_ones(self, shape, dtype=None, where=None, exec_place=None, str name=None):
+        """
+        Create logical data filled with ones.
+
+        Equivalent to numpy.ones() but for STF logical data.
+
+        Parameters
+        ----------
+        shape : tuple
+            Shape of the array
+        dtype : numpy.dtype, optional
+            Data type. Defaults to np.float64.
+        where : data_place, optional
+            Data placement. Defaults to current device.
+        exec_place : exec_place, optional
+            Execution place for the fill operation. Defaults to current device.
+        name : str, optional
+            Symbol name for debugging and DOT graph output.
+
+        Returns
+        -------
+        logical_data
+            New logical data filled with ones
+
+        Examples
+        --------
+        >>> # Create ones-filled array
+        >>> ld = ctx.logical_data_ones((100, 100), dtype=np.float32)
+
+        >>> # Create on specific device with a name
+        >>> ld = ctx.logical_data_ones((50, 50), name="ones")
+        """
+        if dtype is None:
+            dtype = np.float64
+        return self.logical_data_full(shape, 1.0, dtype, where, exec_place, name)
+
+    def token(self):
+        return logical_data.token(self)
+
+    def task(self, *args):
+        """
+        Create a `task`
+
+        Example
+        -------
+        >>> t = ctx.task(read(lX), rw(lY))
+        >>> t.start()
+        >>> t.end()
+        """
+        exec_place_set = False
+        t = task(self)          # construct with this context
+        for d in args:
+            if isinstance(d, dep):
+                t.add_dep(d)
+            elif isinstance(d, exec_place):
+                if exec_place_set:
+                      raise ValueError("Only one exec_place can be given")
+                t.set_exec_place(d)
+                exec_place_set = True
+            else:
+                raise TypeError(
+                    "Arguments must be dependency objects or an exec_place"
+                )
+        return t
diff --git a/python/cuda_cccl/cuda/stf/fill_utils.py b/python/cuda_cccl/cuda/stf/fill_utils.py
new file mode 100644
index 00000000000..9d27f01342b
--- /dev/null
+++ b/python/cuda_cccl/cuda/stf/fill_utils.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Fill / init for STF logical data. Uses cuda.core.Buffer.fill for 1/2/4-byte;
+single fallback (CuPy) for 8-byte.
+"""
+
+import numpy as np
+
+from cuda.core import Buffer, Stream
+
+
+def init_logical_data(ctx, ld, value, data_place=None, exec_place=None):
+    """
+    Initialize a logical data with a constant value.
+
+    Uses cuda.core.Buffer.fill for 1/2/4-byte element types. For 8-byte types
+    (e.g. float64, int64) uses CuPy if available; otherwise raises.
+
+    Parameters
+    ----------
+    ctx : context
+        STF context
+    ld : logical_data
+        Logical data to initialize
+    value : scalar
+        Value to fill the array with
+    data_place : data_place, optional
+        Data place for the initialization task
+    exec_place : exec_place, optional
+        Execution place for the fill operation
+
+    Raises
+    ------
+    ImportError
+        If dtype is 8-byte and CuPy is not installed.
+    """
+    dep_arg = ld.write(data_place) if data_place else ld.write()
+
+    task_args = []
+    if exec_place is not None:
+        task_args.append(exec_place)
+    task_args.append(dep_arg)
+
+    with ctx.task(*task_args) as t:
+        ld_index = 1 if exec_place is not None else 0
+        cai = t.get_arg_cai(ld_index)
+        ptr = cai["data"][0]
+        shape = tuple(cai["shape"])
+        dtype = np.dtype(cai["typestr"])
+        size = int(np.prod(shape)) * dtype.itemsize
+
+        core_stream = Stream.from_handle(t.stream_ptr())
+        buf = Buffer.from_handle(ptr, size, owner=None)
+
+        if dtype.itemsize in (1, 2, 4):
+            if value == 0 or value == 0.0:
+                fill_val = 0
+            else:
+                fill_val = np.array([value], dtype=dtype).tobytes()
+            buf.fill(fill_val, stream=core_stream)
+        else:
+            # 8-byte: single fallback via CuPy
+            _fill_8byte_cupy(shape, dtype, value, ptr, size, t.stream_ptr())
+
+
+def _fill_8byte_cupy(shape, dtype, value, ptr, size, stream_ptr):
+    """Fill 8-byte buffer using CuPy. Raises ImportError if CuPy not available."""
+    try:
+        import cupy as cp
+    except ImportError:
+        raise ImportError(
+            "Fill for 8-byte dtypes (e.g. float64) requires CuPy. "
+            "Install CuPy or use a 1/2/4-byte dtype (e.g. np.float32)."
+        ) from None
+
+    mem = cp.cuda.UnownedMemory(ptr, size, owner=None)
+    memptr = cp.cuda.MemoryPointer(mem, 0)
+    arr = cp.ndarray(shape, dtype=dtype, memptr=memptr)
+    with cp.cuda.ExternalStream(stream_ptr):
+        arr.fill(value)
diff --git a/python/cuda_cccl/merge_cuda_wheels.py b/python/cuda_cccl/merge_cuda_wheels.py
index 71ead1ebbb9..0246d83ac16 100755
--- a/python/cuda_cccl/merge_cuda_wheels.py
+++ b/python/cuda_cccl/merge_cuda_wheels.py
@@ -5,14 +5,13 @@
 This script takes wheels built for different CUDA versions (cu12, cu13) and merges them
 into a single wheel that supports both CUDA versions.
 
-In particular, each wheel contains a CUDA-specific build of the `cccl.c.parallel` library
-and the associated bindings. These are present in the directory `compute/cu<version>`.
-For example, for a wheel built with CUDA 12, the directory is `compute/cu12`,
-and for a wheel built with CUDA 13, the directory is `compute/cu13`.
-This script merges these directories into a single wheel that supports both CUDA versions, i.e.,
-containing both `compute/cu12` and `compute/cu13`.
-At runtime, a shim module `compute/_bindings.py` is used to import the appropriate
-CUDA-specific bindings. See `compute/_bindings.py` for more details.
+Each wheel contains CUDA-specific builds in versioned directories:
+- `cuda/compute/cu<version>` — cccl.c.parallel and compute bindings
+- `cuda/stf/cu<version>` — cccl.c.experimental.stf and STF bindings
+
+This script merges these directories so the final wheel contains both cu12 and cu13
+for compute and for stf. At runtime, shim modules choose the right extension from
+the detected CUDA version.
 """
 
 import argparse
@@ -100,18 +99,19 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path:
         # Use the first wheel as the base and merge binaries from others
         base_wheel = extracted_wheels[0]
 
-        # now copy the version-specific directory from other wheels
-        # into the appropriate place in the base wheel
+        # Copy version-specific directories (compute and stf) from other wheels into base
         for i, wheel_dir in enumerate(extracted_wheels):
-            cuda_version = wheels[i].name.split(".cu")[1].split(".")[0]
             if i == 0:
-                # For base wheel, do nothing
                 continue
-            else:
-                version_dir = Path("cuda") / "compute" / f"cu{cuda_version}"
-                # Copy from other wheels
+            cuda_version = wheels[i].name.split(".cu")[1].split(".")[0]
+            for subpackage in ("compute", "stf"):
+                version_dir = Path("cuda") / subpackage / f"cu{cuda_version}"
+                src = wheel_dir / version_dir
+                if not src.is_dir():
+                    continue
+                dst = base_wheel / version_dir
                 print(f"  Copying {version_dir} to {base_wheel}")
-                shutil.copytree(wheel_dir / version_dir, base_wheel / version_dir)
+                shutil.copytree(src, dst)
 
         # Repack the merged wheel
         output_dir.mkdir(parents=True, exist_ok=True)
diff --git a/python/cuda_cccl/pyproject.toml b/python/cuda_cccl/pyproject.toml
index b5fad006ba9..a85b3eae170 100644
--- a/python/cuda_cccl/pyproject.toml
+++ b/python/cuda_cccl/pyproject.toml
@@ -129,6 +129,7 @@ known-first-party = [
   "cuda.cccl.headers",
   "cuda.compute",
   "cuda.coop",
+  "cuda.stf",
 ]
 
 [tool.pytest.ini_options]
diff --git a/python/cuda_cccl/tests/stf/example_cholesky.py b/python/cuda_cccl/tests/stf/example_cholesky.py
new file mode 100755
index 00000000000..b89043ed577
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/example_cholesky.py
@@ -0,0 +1,711 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Python implementation of Cholesky decomposition using CUDA STF and CuPy (CUBLAS/CUSOLVER).
+
+This example demonstrates:
+- Tiled matrix operations with STF logical data
+- Integration of CuPy's CUBLAS and CUSOLVER functions with STF tasks
+- Multi-device execution with automatic data placement
+- Task-based parallelism for linear algebra operations
+
+Note: CUDASTF automatically manages device context within tasks via exec_place.device().
+There's no need to manually set the current device in task bodies - just use the STF stream.
+"""
+
+import sys
+
+import cupy as cp
+import numpy as np
+from cupyx.scipy import linalg as cp_linalg
+
+import cuda.stf as stf
+
+
+class CAIWrapper:
+    """Wrapper to expose CUDA Array Interface dict as a proper CAI object."""
+
+    def __init__(self, cai_dict):
+        self.__cuda_array_interface__ = cai_dict
+
+
+def get_cupy_arrays(task):
+    """
+    Get all CuPy arrays from STF task arguments.
+
+    Usage:
+        d_a, d_b, d_c = get_cupy_arrays(t)
+    """
+    arrays = []
+    idx = 0
+    while True:
+        try:
+            arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx))))
+            idx += 1
+        except Exception:
+            break
+    return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None
+
+
+def cai_to_numpy(cai_dict):
+    """Convert CUDA Array Interface dict to NumPy array (for host memory)."""
+    import ctypes
+
+    # Extract CAI fields
+    data_ptr, readonly = cai_dict["data"]
+    shape = cai_dict["shape"]
+    typestr = cai_dict["typestr"]
+
+    # Convert typestr to NumPy dtype
+    dtype = np.dtype(typestr)
+
+    # Calculate total size in bytes
+    itemsize = dtype.itemsize
+    size = np.prod(shape) * itemsize
+
+    # Create ctypes buffer from pointer
+    buffer = (ctypes.c_byte * size).from_address(data_ptr)
+
+    # Create NumPy array from buffer
+    arr = np.frombuffer(buffer, dtype=dtype).reshape(shape)
+
+    return arr
+
+
+class BlockRef:
+    """Reference to a specific block in a tiled matrix."""
+
+    def __init__(self, matrix, row, col):
+        self.matrix = matrix
+        self.row = row
+        self.col = col
+        self._handle = matrix.handle(row, col)
+        self._devid = matrix.get_preferred_devid(row, col)
+
+    def handle(self):
+        """Get the STF logical data handle for this block."""
+        return self._handle
+
+    def devid(self):
+        """Get the preferred device ID for this block."""
+        return self._devid
+
+    def __repr__(self):
+        return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])"
+
+
+class TiledMatrix:
+    """
+    Tiled matrix class that splits a matrix into blocks for parallel processing.
+    Each block is managed as an STF logical data object.
+    """
+
+    def __init__(
+        self,
+        ctx,
+        nrows,
+        ncols,
+        block_rows,
+        block_cols,
+        is_symmetric=False,
+        symbol="matrix",
+        dtype=np.float64,
+    ):
+        """
+        Initialize a tiled matrix.
+
+        Args:
+            ctx: STF context
+            nrows: Total number of rows
+            ncols: Total number of columns
+            block_rows: Block size (rows)
+            block_cols: Block size (columns)
+            is_symmetric: If True, only stores lower triangular blocks
+            symbol: Name/symbol for the matrix
+            dtype: Data type (default: np.float64)
+        """
+        self.ctx = ctx
+        self.symbol = symbol
+        self.dtype = dtype
+
+        self.m = nrows
+        self.n = ncols
+        self.mb = block_rows
+        self.nb = block_cols
+        self.sym_matrix = is_symmetric
+
+        assert self.m % self.mb == 0, (
+            f"nrows ({self.m}) must be divisible by block_rows ({self.mb})"
+        )
+        assert self.n % self.nb == 0, (
+            f"ncols ({self.n}) must be divisible by block_cols ({self.nb})"
+        )
+
+        # Number of blocks
+        self.mt = self.m // self.mb
+        self.nt = self.n // self.nb
+
+        # Allocate host memory (pinned for faster transfers)
+        self.h_array = cp.cuda.alloc_pinned_memory(
+            self.m * self.n * np.dtype(dtype).itemsize
+        )
+        self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape(
+            self.m, self.n
+        )
+
+        # Create logical data handles for each block
+        self.handles = {}
+
+        # Get available devices for mapping
+        self.ndevs = cp.cuda.runtime.getDeviceCount()
+        self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs)
+
+        print(
+            f"[{symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}"
+        )
+        print(
+            f"[{symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid"
+        )
+
+        # Note: We DON'T create logical data here yet - that happens in fill()
+        # after the host data is initialized
+
+    def _compute_device_grid(self, ndevs):
+        """Compute 2D device grid dimensions (as close to square as possible)"""
+        grid_p = 1
+        grid_q = ndevs
+        for a in range(1, int(np.sqrt(ndevs)) + 1):
+            if ndevs % a == 0:
+                grid_p = a
+                grid_q = ndevs // a
+        return grid_p, grid_q
+
+    def get_preferred_devid(self, row, col):
+        """Get preferred device ID for a given block using cyclic distribution"""
+        return (row % self.grid_p) + (col % self.grid_q) * self.grid_p
+
+    def handle(self, row, col):
+        """Get the logical data handle for block (row, col)"""
+        return self.handles[(row, col)]
+
+    def block(self, row, col):
+        """Get a BlockRef for block (row, col)"""
+        return BlockRef(self, row, col)
+
+    def _get_index(self, row, col):
+        """Convert (row, col) to linear index in tiled storage"""
+        # Find which tile contains this element
+        tile_row = row // self.mb
+        tile_col = col // self.nb
+
+        tile_size = self.mb * self.nb
+
+        # Index of the beginning of the tile
+        tile_start = (tile_row + self.mt * tile_col) * tile_size
+
+        # Offset within the tile
+        offset = (row % self.mb) + (col % self.nb) * self.mb
+
+        return tile_start + offset
+
+    def _get_block_h(self, brow, bcol):
+        """Get a view of the host data for block (brow, bcol)"""
+        # For tiled storage, blocks are stored contiguously
+        start_idx = (brow + self.mt * bcol) * self.mb * self.nb
+        end_idx = start_idx + self.mb * self.nb
+        flat_view = self.h_array_np.ravel()
+        return flat_view[start_idx:end_idx].reshape(self.mb, self.nb)
+
+    def fill(self, func):
+        """Fill matrix on host, then create STF logical data that will transfer automatically"""
+        print(f"[{self.symbol}] Filling matrix on host...")
+
+        for colb in range(self.nt):
+            low_rowb = colb if self.sym_matrix else 0
+            for rowb in range(low_rowb, self.mt):
+                # Fill host block
+                h_block = self._get_block_h(rowb, colb)
+                for lrow in range(self.mb):
+                    for lcol in range(self.nb):
+                        row = lrow + rowb * self.mb
+                        col = lcol + colb * self.nb
+                        h_block[lrow, lcol] = func(row, col)
+
+                handle = self.ctx.logical_data(
+                    h_block, name=f"{self.symbol}_{rowb}_{colb}"
+                )
+                self.handles[(rowb, colb)] = handle
+
+
+# BLAS/LAPACK operations wrapped in STF tasks
+
+
+def DPOTRF(ctx, a):
+    """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)"""
+    with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t:
+        d_block = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            d_block[:] = cp.linalg.cholesky(d_block)
+
+
+def DTRSM(ctx, a, b, side="L", uplo="L", transa="T", diag="N", alpha=1.0):
+    """Triangular solve: B = alpha * op(A)^{-1} @ B or B = alpha * B @ op(A)^{-1}"""
+    with ctx.task(
+        stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()
+    ) as t:
+        d_a, d_b = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            if side == "L":
+                if transa == "N":
+                    d_b[:] = cp_linalg.solve_triangular(d_a, d_b, lower=(uplo == "L"))
+                else:
+                    d_b[:] = cp_linalg.solve_triangular(d_a.T, d_b, lower=(uplo != "L"))
+                if alpha != 1.0:
+                    d_b *= alpha
+            else:
+                if transa == "N":
+                    d_b[:] = cp_linalg.solve_triangular(
+                        d_a.T, d_b.T, lower=(uplo != "L")
+                    ).T
+                else:
+                    d_b[:] = cp_linalg.solve_triangular(
+                        d_a, d_b.T, lower=(uplo == "L")
+                    ).T
+                if alpha != 1.0:
+                    d_b *= alpha
+
+
+def DGEMM(ctx, a, b, c, transa="N", transb="N", alpha=1.0, beta=1.0):
+    """Matrix multiplication: C = alpha * op(A) @ op(B) + beta * C"""
+    with ctx.task(
+        stf.exec_place.device(c.devid()),
+        a.handle().read(),
+        b.handle().read(),
+        c.handle().rw(),
+    ) as t:
+        d_a, d_b, d_c = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            op_a = d_a.T if transa != "N" else d_a
+            op_b = d_b.T if transb != "N" else d_b
+
+            if beta == 0.0:
+                d_c[:] = alpha * (op_a @ op_b)
+            elif beta == 1.0:
+                d_c[:] += alpha * (op_a @ op_b)
+            else:
+                d_c[:] = alpha * (op_a @ op_b) + beta * d_c
+
+
+def DSYRK(ctx, a, c, uplo="L", trans="N", alpha=1.0, beta=1.0):
+    """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C"""
+    with ctx.task(
+        stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw()
+    ) as t:
+        d_a, d_c = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            op_a = d_a.T if trans != "N" else d_a
+
+            if beta == 0.0:
+                d_c[:] = alpha * (op_a @ op_a.T)
+            elif beta == 1.0:
+                d_c[:] += alpha * (op_a @ op_a.T)
+            else:
+                d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c
+
+
+# High-level algorithms
+
+
+def PDPOTRF(ctx, A):
+    """Parallel tiled Cholesky factorization (blocked algorithm)"""
+    print("\n[PDPOTRF] Starting Cholesky factorization...")
+
+    assert A.m == A.n, "Matrix must be square"
+    assert A.mt == A.nt, "Block grid must be square"
+    assert A.sym_matrix, "Matrix must be symmetric"
+
+    nblocks = A.mt
+
+    for k in range(nblocks):
+        # Factor diagonal block
+        DPOTRF(ctx, A.block(k, k))
+
+        # Solve triangular systems for blocks in column k
+        for row in range(k + 1, nblocks):
+            DTRSM(
+                ctx,
+                A.block(k, k),
+                A.block(row, k),
+                side="R",
+                uplo="L",
+                transa="T",
+                diag="N",
+                alpha=1.0,
+            )
+
+            # Update trailing matrix
+            for col in range(k + 1, row):
+                DGEMM(
+                    ctx,
+                    A.block(row, k),
+                    A.block(col, k),
+                    A.block(row, col),
+                    transa="N",
+                    transb="T",
+                    alpha=-1.0,
+                    beta=1.0,
+                )
+
+            # Symmetric rank-k update of diagonal block
+            DSYRK(
+                ctx,
+                A.block(row, k),
+                A.block(row, row),
+                uplo="L",
+                trans="N",
+                alpha=-1.0,
+                beta=1.0,
+            )
+
+    print("[PDPOTRF] Completed")
+
+
+def PDTRSM(ctx, A, B, side="L", uplo="L", trans="N", diag="N", alpha=1.0):
+    """Parallel tiled triangular solve"""
+    print("\n[PDTRSM] Starting triangular solve...")
+
+    if side == "L":
+        if uplo == "L":
+            if trans == "N":
+                # Forward substitution
+                for k in range(B.mt):
+                    lalpha = alpha if k == 0 else 1.0
+                    for n in range(B.nt):
+                        DTRSM(
+                            ctx,
+                            A.block(k, k),
+                            B.block(k, n),
+                            side="L",
+                            uplo="L",
+                            transa="N",
+                            diag=diag,
+                            alpha=lalpha,
+                        )
+                    for m in range(k + 1, B.mt):
+                        for n in range(B.nt):
+                            DGEMM(
+                                ctx,
+                                A.block(m, k),
+                                B.block(k, n),
+                                B.block(m, n),
+                                transa="N",
+                                transb="N",
+                                alpha=-1.0,
+                                beta=lalpha,
+                            )
+            else:  # trans == 'T' or 'C'
+                # Backward substitution
+                for k in range(B.mt):
+                    lalpha = alpha if k == 0 else 1.0
+                    row_idx = B.mt - k - 1
+                    for n in range(B.nt):
+                        DTRSM(
+                            ctx,
+                            A.block(row_idx, row_idx),
+                            B.block(row_idx, n),
+                            side="L",
+                            uplo="L",
+                            transa="T",
+                            diag=diag,
+                            alpha=lalpha,
+                        )
+                    for m in range(k + 1, B.mt):
+                        m_idx = B.mt - 1 - m
+                        for n in range(B.nt):
+                            DGEMM(
+                                ctx,
+                                A.block(row_idx, m_idx),
+                                B.block(row_idx, n),
+                                B.block(m_idx, n),
+                                transa="T",
+                                transb="N",
+                                alpha=-1.0,
+                                beta=lalpha,
+                            )
+
+    print("[PDTRSM] Completed")
+
+
+def PDPOTRS(ctx, A, B, uplo="L"):
+    """Solve A @ X = B where A is factored by Cholesky (A = L @ L.T)"""
+    print("\n[PDPOTRS] Solving linear system...")
+
+    # First solve: L @ Y = B
+    PDTRSM(
+        ctx,
+        A,
+        B,
+        side="L",
+        uplo=uplo,
+        trans="N" if uplo == "L" else "T",
+        diag="N",
+        alpha=1.0,
+    )
+
+    # Second solve: L.T @ X = Y
+    PDTRSM(
+        ctx,
+        A,
+        B,
+        side="L",
+        uplo=uplo,
+        trans="T" if uplo == "L" else "N",
+        diag="N",
+        alpha=1.0,
+    )
+
+    print("[PDPOTRS] Completed")
+
+
+def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0):
+    """Parallel tiled matrix multiplication"""
+    print("\n[PDGEMM] Starting matrix multiplication...")
+
+    for m in range(C.mt):
+        for n in range(C.nt):
+            inner_k = A.nt if transa == "N" else A.mt
+
+            if alpha == 0.0 or inner_k == 0:
+                # Just scale C
+                DGEMM(
+                    ctx,
+                    A.block(0, 0),
+                    B.block(0, 0),
+                    C.block(m, n),
+                    transa=transa,
+                    transb=transb,
+                    alpha=0.0,
+                    beta=beta,
+                )
+            elif transa == "N":
+                if transb == "N":
+                    for k in range(A.nt):
+                        zbeta = beta if k == 0 else 1.0
+                        DGEMM(
+                            ctx,
+                            A.block(m, k),
+                            B.block(k, n),
+                            C.block(m, n),
+                            transa="N",
+                            transb="N",
+                            alpha=alpha,
+                            beta=zbeta,
+                        )
+                else:
+                    for k in range(A.nt):
+                        zbeta = beta if k == 0 else 1.0
+                        DGEMM(
+                            ctx,
+                            A.block(m, k),
+                            B.block(n, k),
+                            C.block(m, n),
+                            transa="N",
+                            transb="T",
+                            alpha=alpha,
+                            beta=zbeta,
+                        )
+            else:
+                if transb == "N":
+                    for k in range(A.mt):
+                        zbeta = beta if k == 0 else 1.0
+                        DGEMM(
+                            ctx,
+                            A.block(k, m),
+                            B.block(k, n),
+                            C.block(m, n),
+                            transa="T",
+                            transb="N",
+                            alpha=alpha,
+                            beta=zbeta,
+                        )
+                else:
+                    for k in range(A.mt):
+                        zbeta = beta if k == 0 else 1.0
+                        DGEMM(
+                            ctx,
+                            A.block(k, m),
+                            B.block(n, k),
+                            C.block(m, n),
+                            transa="T",
+                            transb="T",
+                            alpha=alpha,
+                            beta=zbeta,
+                        )
+
+    print("[PDGEMM] Completed")
+
+
+def compute_norm(ctx, matrix):
+    """Compute Frobenius norm of matrix using host tasks"""
+    norm_sq = 0.0
+
+    for colb in range(matrix.nt):
+        low_rowb = colb if matrix.sym_matrix else 0
+        for rowb in range(low_rowb, matrix.mt):
+            handle = matrix.handle(rowb, colb)
+
+            # Host task to read the block and compute norm
+            def compute_block_norm(h_block):
+                nonlocal norm_sq
+                norm_sq += np.sum(h_block * h_block)
+
+            with ctx.task(stf.exec_place.host(), handle.read()) as t:
+                # Synchronize the stream before reading data
+                cp.cuda.runtime.streamSynchronize(t.stream_ptr())
+
+                h_block = cai_to_numpy(t.get_arg_cai(0))
+                compute_block_norm(h_block)
+
+    return np.sqrt(norm_sq)
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Tiled Cholesky decomposition with CUDA STF"
+    )
+    parser.add_argument(
+        "N", type=int, nargs="?", default=1024, help="Matrix size (default: 1024)"
+    )
+    parser.add_argument(
+        "NB", type=int, nargs="?", default=128, help="Block size (default: 128)"
+    )
+    parser.add_argument("--check", action="store_true", help="Check result (slower)")
+    args = parser.parse_args()
+
+    N = args.N
+    NB = args.NB
+    check_result = args.check
+
+    assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}"
+
+    print("=" * 60)
+    print("Tiled Cholesky Decomposition with CUDA STF + CuPy")
+    print("=" * 60)
+    print(f"Matrix size: {N}x{N}")
+    print(f"Block size: {NB}x{NB}")
+    print(f"Number of blocks: {N // NB}x{N // NB}")
+    print(f"Check result: {check_result}")
+    print("=" * 60)
+
+    # Create STF context
+    ctx = stf.context()
+
+    # Create matrices
+    A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A")
+
+    if check_result:
+        Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref")
+
+    # Fill with Hilbert matrix + diagonal dominance
+    # H_{i,j} = 1/(i+j+1) + 2*N if i==j
+    def hilbert(row, col):
+        return 1.0 / (row + col + 1.0) + (2.0 * N if row == col else 0.0)
+
+    print("\n" + "=" * 60)
+    print("Initializing matrices...")
+    print("=" * 60)
+
+    A.fill(hilbert)
+    if check_result:
+        Aref.fill(hilbert)
+
+    # Create right-hand side
+    if check_result:
+        B = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B")
+        Bref = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref")
+
+        def rhs_vals(row, col):
+            return 1.0 * (row + 1)
+
+        B.fill(rhs_vals)
+        Bref.fill(rhs_vals)
+
+        # Compute ||B|| for residual calculation
+        Bref_norm = compute_norm(ctx, Bref)
+
+    # Synchronize before timing
+    cp.cuda.runtime.deviceSynchronize()
+
+    # Record start time
+    start_event = cp.cuda.Event()
+    stop_event = cp.cuda.Event()
+    start_event.record()
+
+    # Perform Cholesky factorization
+    print("\n" + "=" * 60)
+    print("Performing Cholesky factorization...")
+    print("=" * 60)
+    PDPOTRF(ctx, A)
+
+    # Record stop time
+    stop_event.record()
+
+    # Solve system if checking
+    if check_result:
+        print("\n" + "=" * 60)
+        print("Solving linear system...")
+        print("=" * 60)
+        PDPOTRS(ctx, A, B, uplo="L")
+
+        print("\n" + "=" * 60)
+        print("Computing residual...")
+        print("=" * 60)
+        # Compute residual: Bref = Aref @ B - Bref
+        PDGEMM(ctx, Aref, B, Bref, transa="N", transb="N", alpha=1.0, beta=-1.0)
+
+        # Compute ||residual||
+        res_norm = compute_norm(ctx, Bref)
+
+    # Finalize STF context
+    print("\n" + "=" * 60)
+    print("Finalizing STF context...")
+    print("=" * 60)
+    ctx.finalize()
+
+    # Wait for completion
+    stop_event.synchronize()
+
+    # Compute timing
+    elapsed_ms = cp.cuda.get_elapsed_time(start_event, stop_event)
+    gflops = (1.0 / 3.0 * N * N * N) / 1e9
+    gflops_per_sec = gflops / (elapsed_ms / 1000.0)
+
+    print("\n" + "=" * 60)
+    print("Results")
+    print("=" * 60)
+    print(f"[PDPOTRF] Elapsed time: {elapsed_ms:.2f} ms")
+    print(f"[PDPOTRF] Performance: {gflops_per_sec:.2f} GFLOPS")
+
+    if check_result:
+        residual = res_norm / Bref_norm
+        print(f"\n[POTRS] ||AX - B||: {res_norm:.6e}")
+        print(f"[POTRS] ||B||: {Bref_norm:.6e}")
+        print(f"[POTRS] Residual (||AX - B||/||B||): {residual:.6e}")
+
+        if residual >= 0.01:
+            print("\n❌ Algorithm did not converge (residual >= 0.01)")
+            return 1
+        else:
+            print("\n✅ Algorithm converged successfully!")
+
+    print("=" * 60)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/python/cuda_cccl/tests/stf/example_fluid_warp.py b/python/cuda_cccl/tests/stf/example_fluid_warp.py
new file mode 100644
index 00000000000..7a73351df11
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/example_fluid_warp.py
@@ -0,0 +1,405 @@
+# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###########################################################################
+# Example Fluid
+#
+# Shows how to implement a simple 2D Stable Fluids solver using
+# multidimensional arrays and launches.
+#
+###########################################################################
+
+import math
+
+import warp as wp
+import warp.render
+
+import cuda.stf as stf
+
+
+# Add a stf-specific decorator to the wp. namespace
+def stf_kernel(pyfunc):
+    # let warp decorate normally
+    kernel = wp.kernel(pyfunc)
+
+    # attach an STF-aware call operator
+    def _stf_call(*args, dim=None, stream=None, **kwargs):
+        return wp.stf.launch(kernel, dim=dim, inputs=args, stream=stream, **kwargs)
+
+    # monkey-patch a method onto the kernel object
+    kernel.stf = _stf_call
+
+    return kernel
+
+
+def stf_launch(kernel, dim, inputs=None, stream=None, **kwargs):
+    # just forward to warp for now
+    return wp.launch(
+        kernel,
+        dim=dim,
+        inputs=inputs,
+        stream=stream,
+        **kwargs,
+    )
+
+
+# put it under wp.stf
+if not hasattr(wp, "stf"):
+
+    class _stf:
+        pass
+
+    wp.stf = _stf()
+
+
+wp.stf.kernel = stf_kernel
+wp.stf.launch = stf_launch
+
+grid_width = wp.constant(256)
+grid_height = wp.constant(128)
+
+
+@wp.func
+def lookup_float(f: wp.array2d(dtype=float), x: int, y: int):
+    x = wp.clamp(x, 0, grid_width - 1)
+    y = wp.clamp(y, 0, grid_height - 1)
+
+    return f[x, y]
+
+
+@wp.func
+def sample_float(f: wp.array2d(dtype=float), x: float, y: float):
+    lx = int(wp.floor(x))
+    ly = int(wp.floor(y))
+
+    tx = x - float(lx)
+    ty = y - float(ly)
+
+    s0 = wp.lerp(lookup_float(f, lx, ly), lookup_float(f, lx + 1, ly), tx)
+    s1 = wp.lerp(lookup_float(f, lx, ly + 1), lookup_float(f, lx + 1, ly + 1), tx)
+
+    s = wp.lerp(s0, s1, ty)
+    return s
+
+
+@wp.func
+def lookup_vel(f: wp.array2d(dtype=wp.vec2), x: int, y: int):
+    if x < 0 or x >= grid_width:
+        return wp.vec2()
+    if y < 0 or y >= grid_height:
+        return wp.vec2()
+
+    return f[x, y]
+
+
+@wp.func
+def sample_vel(f: wp.array2d(dtype=wp.vec2), x: float, y: float):
+    lx = int(wp.floor(x))
+    ly = int(wp.floor(y))
+
+    tx = x - float(lx)
+    ty = y - float(ly)
+
+    s0 = wp.lerp(lookup_vel(f, lx, ly), lookup_vel(f, lx + 1, ly), tx)
+    s1 = wp.lerp(lookup_vel(f, lx, ly + 1), lookup_vel(f, lx + 1, ly + 1), tx)
+
+    s = wp.lerp(s0, s1, ty)
+    return s
+
+
+@wp.stf.kernel
+def advect(
+    u0: wp.array2d(dtype=wp.vec2),
+    u1: wp.array2d(dtype=wp.vec2),
+    rho0: wp.array2d(dtype=float),
+    rho1: wp.array2d(dtype=float),
+    dt: float,
+):
+    i, j = wp.tid()
+
+    u = u0[i, j]
+
+    # trace backward
+    p = wp.vec2(float(i), float(j))
+    p = p - u * dt
+
+    # advect
+    u1[i, j] = sample_vel(u0, p[0], p[1])
+    rho1[i, j] = sample_float(rho0, p[0], p[1])
+
+
+@wp.stf.kernel
+def divergence(u: wp.array2d(dtype=wp.vec2), div: wp.array2d(dtype=float)):
+    i, j = wp.tid()
+
+    if i == grid_width - 1:
+        return
+    if j == grid_height - 1:
+        return
+
+    dx = (u[i + 1, j][0] - u[i, j][0]) * 0.5
+    dy = (u[i, j + 1][1] - u[i, j][1]) * 0.5
+
+    div[i, j] = dx + dy
+
+
+@wp.stf.kernel
+def pressure_solve(
+    p0: wp.array2d(dtype=float),
+    p1: wp.array2d(dtype=float),
+    div: wp.array2d(dtype=float),
+):
+    i, j = wp.tid()
+
+    s1 = lookup_float(p0, i - 1, j)
+    s2 = lookup_float(p0, i + 1, j)
+    s3 = lookup_float(p0, i, j - 1)
+    s4 = lookup_float(p0, i, j + 1)
+
+    # Jacobi update
+    err = s1 + s2 + s3 + s4 - div[i, j]
+
+    p1[i, j] = err * 0.25
+
+
+@wp.stf.kernel
+def pressure_apply(p: wp.array2d(dtype=float), u: wp.array2d(dtype=wp.vec2)):
+    i, j = wp.tid()
+
+    if i == 0 or i == grid_width - 1:
+        return
+    if j == 0 or j == grid_height - 1:
+        return
+
+    # pressure gradient
+    f_p = wp.vec2(p[i + 1, j] - p[i - 1, j], p[i, j + 1] - p[i, j - 1]) * 0.5
+
+    u[i, j] = u[i, j] - f_p
+
+
+@wp.stf.kernel
+def integrate(u: wp.array2d(dtype=wp.vec2), rho: wp.array2d(dtype=float), dt: float):
+    i, j = wp.tid()
+
+    # gravity
+    f_g = wp.vec2(-90.8, 0.0) * rho[i, j]
+
+    # integrate
+    u[i, j] = u[i, j] + dt * f_g
+
+    # fade
+    rho[i, j] = rho[i, j] * (1.0 - 0.1 * dt)
+
+
+@wp.stf.kernel
+def init(
+    rho: wp.array2d(dtype=float),
+    u: wp.array2d(dtype=wp.vec2),
+    radius: int,
+    dir: wp.vec2,
+):
+    i, j = wp.tid()
+
+    d = wp.length(wp.vec2(float(i - grid_width / 2), float(j - grid_height / 2)))
+
+    if d < radius:
+        rho[i, j] = 1.0
+        u[i, j] = dir
+
+
+class Example:
+    def __init__(self):
+        fps = 60
+        self.frame_dt = 1.0 / fps
+        self.sim_substeps = 2
+        self.iterations = 100  # Number of pressure iterations
+        self.sim_dt = self.frame_dt / self.sim_substeps
+        self.sim_time = 0.0
+
+        self._stf_ctx = stf.context()
+
+        shape = (grid_width, grid_height)
+
+        self.u0 = wp.zeros(shape, dtype=wp.vec2)
+        self.u1 = wp.zeros(shape, dtype=wp.vec2)
+
+        self.rho0 = wp.zeros(shape, dtype=float)
+        self.rho1 = wp.zeros(shape, dtype=float)
+
+        self.p0 = wp.zeros(shape, dtype=float)
+        self.p1 = wp.zeros(shape, dtype=float)
+        self.div = wp.zeros(shape, dtype=float)
+
+        # Create STF logical data from Warp arrays with explicit data place
+        # Warp arrays are on GPU device memory, so specify data_place.device()
+
+        # For regular float arrays, specify device data place
+        device_place = stf.data_place.device(0)
+
+        self.rho0._stf_ld = self._stf_ctx.logical_data(
+            self.rho0, device_place, name="density_current"
+        )
+        self.rho1._stf_ld = self._stf_ctx.logical_data(
+            self.rho1, device_place, name="density_next"
+        )
+        self.p0._stf_ld = self._stf_ctx.logical_data(
+            self.p0, device_place, name="pressure_current"
+        )
+        self.p1._stf_ld = self._stf_ctx.logical_data(
+            self.p1, device_place, name="pressure_next"
+        )
+        self.div._stf_ld = self._stf_ctx.logical_data(
+            self.div, device_place, name="velocity_divergence"
+        )
+
+        # vec2 arrays - STF now automatically handles vector type flattening
+        # Store STF logical data consistently with other arrays
+        self.u0._stf_ld = self._stf_ctx.logical_data(
+            self.u0, device_place, name="velocity_current"
+        )
+        self.u1._stf_ld = self._stf_ctx.logical_data(
+            self.u1, device_place, name="velocity_next"
+        )
+
+        # Set Warp array names (for Warp tracing)
+        self.u0._name = "u0"
+        self.u1._name = "u1"
+        self.rho0._name = "rho0"
+        self.rho1._name = "rho1"
+        self.p0._name = "p0"
+        self.p1._name = "p1"
+        self.div._name = "div"
+
+        # capture pressure solve as a CUDA graph
+        self.use_cuda_graph = wp.get_device().is_cuda
+        if self.use_cuda_graph:
+            with wp.ScopedCapture() as capture:
+                self.pressure_iterations()
+            self.graph = capture.graph
+
+    def step(self):
+        with wp.ScopedTimer("step"):
+            for _ in range(self.sim_substeps):
+                shape = (grid_width, grid_height)
+                dt = self.sim_dt
+
+                speed = 400.0
+                angle = math.sin(self.sim_time * 4.0) * 1.5
+                vel = wp.vec2(math.cos(angle) * speed, math.sin(angle) * speed)
+
+                # update emitters
+                wp.stf.launch(init, dim=shape, inputs=[self.rho0, self.u0, 5, vel])
+
+                # force integrate
+                wp.stf.launch(integrate, dim=shape, inputs=[self.u0, self.rho0, dt])
+                wp.stf.launch(divergence, dim=shape, inputs=[self.u0, self.div])
+
+                # pressure solve
+                self.p0.zero_()
+                self.p1.zero_()
+
+                # TODO experiment with explicit capture at Warp level
+                # if self.use_cuda_graph:
+                #     wp.capture_launch(self.graph)
+                # else:
+                #     self.pressure_iterations()
+                self.pressure_iterations()
+
+                # velocity update
+                wp.stf.launch(pressure_apply, dim=shape, inputs=[self.p0, self.u0])
+
+                # semi-Lagrangian advection
+                wp.stf.launch(
+                    advect,
+                    dim=shape,
+                    inputs=[self.u0, self.u1, self.rho0, self.rho1, dt],
+                )
+
+                # swap buffers
+                (self.u0, self.u1) = (self.u1, self.u0)
+                (self.rho0, self.rho1) = (self.rho1, self.rho0)
+
+                self.sim_time += dt
+
+    def pressure_iterations(self):
+        for _ in range(self.iterations):
+            wp.stf.launch(
+                pressure_solve, dim=self.p0.shape, inputs=[self.p0, self.p1, self.div]
+            )
+
+            # swap pressure fields
+            (self.p0, self.p1) = (self.p1, self.p0)
+
+    def step_and_render_frame(self, frame_num=None, img=None):
+        self.step()
+
+        with wp.ScopedTimer("render"):
+            if img:
+                img.set_array(self.rho0.numpy())
+
+        return (img,)
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--device", type=str, default=None, help="Override the default Warp device."
+    )
+    parser.add_argument(
+        "--num_frames", type=int, default=100000, help="Total number of frames."
+    )
+    parser.add_argument(
+        "--headless",
+        action="store_true",
+        help="Run in headless mode, suppressing the opening of any graphical windows.",
+    )
+
+    args = parser.parse_known_args()[0]
+
+    with wp.ScopedDevice(args.device):
+        example = Example()
+
+        if args.headless:
+            for _ in range(args.num_frames):
+                example.step()
+        else:
+            import matplotlib
+            import matplotlib.animation as anim
+            import matplotlib.pyplot as plt
+
+            fig = plt.figure()
+
+            img = plt.imshow(
+                example.rho0.numpy(),
+                origin="lower",
+                animated=True,
+                interpolation="antialiased",
+            )
+            img.set_norm(matplotlib.colors.Normalize(0.0, 1.0))
+            seq = anim.FuncAnimation(
+                fig,
+                example.step_and_render_frame,
+                fargs=(img,),
+                frames=args.num_frames,
+                blit=True,
+                interval=8,
+                repeat=False,
+            )
+
+            plt.show()
diff --git a/python/cuda_cccl/tests/stf/example_potri.py b/python/cuda_cccl/tests/stf/example_potri.py
new file mode 100644
index 00000000000..ed2e8c1e91b
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/example_potri.py
@@ -0,0 +1,836 @@
+#!/usr/bin/env python3
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Python implementation of POTRI (matrix inversion via Cholesky) using CUDA STF and CuPy.
+
+POTRI computes the inverse of a symmetric positive definite matrix using its Cholesky factorization:
+1. Cholesky factorization: A = L*L^T
+2. Triangular inversion: L^(-1)
+3. Compute A^(-1) = L^(-T) * L^(-1)
+
+This example demonstrates:
+- Tiled matrix operations with STF logical data
+- Integration of CuPy's CUBLAS and CUSOLVER functions with STF tasks
+- Multi-device execution with automatic data placement
+- Task-based parallelism for linear algebra operations
+"""
+
+import sys
+
+import cupy as cp
+import numpy as np
+from cupyx.scipy import linalg as cp_linalg
+
+import cuda.stf as stf
+
+
+class CAIWrapper:
+    """Wrapper to expose CUDA Array Interface dict as a proper CAI object."""
+
+    def __init__(self, cai_dict):
+        self.__cuda_array_interface__ = cai_dict
+
+
+def get_cupy_arrays(task):
+    """
+    Get all CuPy arrays from STF task arguments.
+
+    Usage:
+        d_a, d_b, d_c = get_cupy_arrays(t)
+    """
+    arrays = []
+    idx = 0
+    while True:
+        try:
+            arrays.append(cp.asarray(CAIWrapper(task.get_arg_cai(idx))))
+            idx += 1
+        except Exception:
+            break
+    return tuple(arrays) if len(arrays) > 1 else arrays[0] if arrays else None
+
+
+def cai_to_numpy(cai_dict):
+    """Convert CUDA Array Interface dict to NumPy array (for host memory)."""
+    import ctypes
+
+    # Extract CAI fields
+    data_ptr, readonly = cai_dict["data"]
+    shape = cai_dict["shape"]
+    typestr = cai_dict["typestr"]
+
+    # Convert typestr to NumPy dtype
+    dtype = np.dtype(typestr)
+
+    # Calculate total size in bytes
+    itemsize = dtype.itemsize
+    size = np.prod(shape) * itemsize
+
+    # Create ctypes buffer from pointer
+    buffer = (ctypes.c_byte * size).from_address(data_ptr)
+
+    # Create NumPy array from buffer
+    arr = np.frombuffer(buffer, dtype=dtype).reshape(shape)
+
+    return arr
+
+
+class BlockRef:
+    """Reference to a specific block in a tiled matrix."""
+
+    def __init__(self, matrix, row, col):
+        self.matrix = matrix
+        self.row = row
+        self.col = col
+        self._handle = matrix.handle(row, col)
+        self._devid = matrix.get_preferred_devid(row, col)
+
+    def handle(self):
+        """Get the STF logical data handle for this block."""
+        return self._handle
+
+    def devid(self):
+        """Get the preferred device ID for this block."""
+        return self._devid
+
+    def __repr__(self):
+        return f"BlockRef({self.matrix.symbol}[{self.row},{self.col}])"
+
+
+class TiledMatrix:
+    """
+    Tiled matrix class that splits a matrix into blocks for parallel processing.
+    Each block is managed as an STF logical data object.
+    Uses tiled storage format for contiguous blocks.
+    """
+
+    def __init__(
+        self,
+        ctx,
+        nrows,
+        ncols,
+        blocksize_rows,
+        blocksize_cols,
+        is_symmetric=False,
+        symbol="matrix",
+        dtype=np.float64,
+    ):
+        self.ctx = ctx
+        self.symbol = symbol
+        self.dtype = dtype
+        self.sym_matrix = is_symmetric
+
+        self.m = nrows
+        self.n = ncols
+        self.mb = blocksize_rows
+        self.nb = blocksize_cols
+
+        assert self.m % self.mb == 0, (
+            f"nrows {nrows} must be divisible by blocksize_rows {blocksize_rows}"
+        )
+        assert self.n % self.nb == 0, (
+            f"ncols {ncols} must be divisible by blocksize_cols {blocksize_cols}"
+        )
+
+        # Number of blocks
+        self.mt = self.m // self.mb
+        self.nt = self.n // self.nb
+
+        # Allocate pinned host memory for faster transfers (in tiled format)
+        self.h_array = cp.cuda.alloc_pinned_memory(
+            self.m * self.n * np.dtype(dtype).itemsize
+        )
+        self.h_array_np = np.frombuffer(self.h_array, dtype=dtype).reshape(
+            self.m, self.n
+        )
+
+        # Dictionary to store logical data handles for each block
+        self.handles = {}
+
+        # Determine device layout
+        self.ndevs = cp.cuda.runtime.getDeviceCount()
+        self.grid_p, self.grid_q = self._compute_device_grid(self.ndevs)
+
+        print(
+            f"[{self.symbol}] {self.m}x{self.n} matrix, {self.mt}x{self.nt} blocks of {self.mb}x{self.nb}"
+        )
+        print(
+            f"[{self.symbol}] Using {self.ndevs} devices in {self.grid_p}x{self.grid_q} grid"
+        )
+
+    def _compute_device_grid(self, ndevs):
+        """Compute 2D device grid dimensions (as close to square as possible)"""
+        grid_p = 1
+        grid_q = ndevs
+        for a in range(1, int(np.sqrt(ndevs)) + 1):
+            if ndevs % a == 0:
+                grid_p = a
+                grid_q = ndevs // a
+        return grid_p, grid_q
+
+    def get_preferred_devid(self, row, col):
+        """Get preferred device ID for a given block using cyclic distribution"""
+        return (row % self.grid_p) + (col % self.grid_q) * self.grid_p
+
+    def handle(self, row, col):
+        """Get the logical data handle for a block."""
+        return self.handles[(row, col)]
+
+    def block(self, row, col):
+        """Get a BlockRef for block (row, col)"""
+        return BlockRef(self, row, col)
+
+    def _get_index(self, row, col):
+        """Convert (row, col) to linear index in tiled storage"""
+        tile_row = row // self.mb
+        tile_col = col // self.nb
+        tile_size = self.mb * self.nb
+        tile_start = (tile_row + self.mt * tile_col) * tile_size
+        offset = (row % self.mb) + (col % self.nb) * self.mb
+        return tile_start + offset
+
+    def _get_block_h(self, brow, bcol):
+        """Get a view of the host data for block (brow, bcol)"""
+        # For tiled storage, blocks are stored contiguously
+        start_idx = (brow + self.mt * bcol) * self.mb * self.nb
+        end_idx = start_idx + self.mb * self.nb
+        flat_view = self.h_array_np.ravel()
+        return flat_view[start_idx:end_idx].reshape(self.mb, self.nb)
+
+    def fill(self, func):
+        """
+        Fill the matrix blocks using a function func(row, col) -> value.
+        Creates STF logical data from host arrays and lets STF handle transfers.
+        """
+        print(f"[{self.symbol}] Filling matrix on host...")
+        for colb in range(self.nt):
+            low_rowb = colb if self.sym_matrix else 0
+            for rowb in range(low_rowb, self.mt):
+                # Fill host block
+                h_block = self._get_block_h(rowb, colb)
+                for lrow in range(self.mb):
+                    for lcol in range(self.nb):
+                        row = lrow + rowb * self.mb
+                        col = lcol + colb * self.nb
+                        h_block[lrow, lcol] = func(row, col)
+
+                handle = self.ctx.logical_data(
+                    h_block, name=f"{self.symbol}_{rowb}_{colb}"
+                )
+                self.handles[(rowb, colb)] = handle
+
+
+# ============================================================================
+# Block-level operations (BLAS/LAPACK)
+# ============================================================================
+
+
+def DPOTRF(ctx, a):
+    """Cholesky factorization of a diagonal block: A = L*L^T (lower triangular)"""
+    with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t:
+        d_block = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            d_block[:] = cp.linalg.cholesky(d_block)
+
+
+def DTRSM(ctx, a, b, side="L", uplo="L", transa="N", diag="N", alpha=1.0):
+    """Triangular solve: B = alpha * op(A)^(-1) * B"""
+    with ctx.task(
+        stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()
+    ) as t:
+        d_a, d_b = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            lower = uplo == "L"
+            trans = transa != "N"
+            result = cp_linalg.solve_triangular(d_a, d_b, lower=lower, trans=trans)
+            if alpha != 1.0:
+                d_b[:] = alpha * result
+            else:
+                d_b[:] = result
+
+
+def DTRTRI(ctx, a, uplo="L", diag="N"):
+    """Triangular matrix inversion: A = A^(-1)"""
+    with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t:
+        d_block = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            lower = uplo == "L"
+            unit_diagonal = diag == "U"
+            # CuPy doesn't have trtri directly, use solve with identity
+            n = d_block.shape[0]
+            identity = cp.eye(n, dtype=d_block.dtype)
+            d_block[:] = cp_linalg.solve_triangular(
+                d_block, identity, lower=lower, unit_diagonal=unit_diagonal
+            )
+
+
+def DGEMM(ctx, a, b, c, transa="N", transb="N", alpha=1.0, beta=1.0):
+    """General matrix multiplication: C = alpha * op(A) * op(B) + beta * C"""
+    with ctx.task(
+        stf.exec_place.device(c.devid()),
+        a.handle().read(),
+        b.handle().read(),
+        c.handle().rw(),
+    ) as t:
+        d_a, d_b, d_c = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            op_a = d_a.T if transa != "N" else d_a
+            op_b = d_b.T if transb != "N" else d_b
+
+            if beta == 0.0:
+                d_c[:] = alpha * (op_a @ op_b)
+            elif beta == 1.0:
+                d_c[:] += alpha * (op_a @ op_b)
+            else:
+                d_c[:] = alpha * (op_a @ op_b) + beta * d_c
+
+
+def DSYRK(ctx, a, c, uplo="L", trans="N", alpha=1.0, beta=1.0):
+    """Symmetric rank-k update: C = alpha * op(A) @ op(A).T + beta * C"""
+    with ctx.task(
+        stf.exec_place.device(c.devid()), a.handle().read(), c.handle().rw()
+    ) as t:
+        d_a, d_c = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            op_a = d_a.T if trans != "N" else d_a
+
+            if beta == 0.0:
+                d_c[:] = alpha * (op_a @ op_a.T)
+            elif beta == 1.0:
+                d_c[:] += alpha * (op_a @ op_a.T)
+            else:
+                d_c[:] = alpha * (op_a @ op_a.T) + beta * d_c
+
+
+def DTRMM(ctx, a, b, side="L", uplo="L", transa="N", diag="N", alpha=1.0):
+    """Triangular matrix multiplication: B = alpha * op(A) * B (side='L') or B = alpha * B * op(A) (side='R')"""
+    with ctx.task(
+        stf.exec_place.device(b.devid()), a.handle().read(), b.handle().rw()
+    ) as t:
+        d_a, d_b = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            lower = uplo == "L"
+            trans = transa != "N"
+
+            # Extract triangle from A
+            if lower:
+                tri_a = cp.tril(d_a)
+            else:
+                tri_a = cp.triu(d_a)
+
+            if trans:
+                tri_a = tri_a.T
+
+            if side == "L":
+                d_b[:] = alpha * (tri_a @ d_b)
+            else:  # side == 'R'
+                d_b[:] = alpha * (d_b @ tri_a)
+
+
+def DSYMM(ctx, a, b, c, side="L", uplo="L", alpha=1.0, beta=1.0):
+    """Symmetric matrix multiplication: C = alpha * A * B + beta * C (side='L') or C = alpha * B * A + beta * C (side='R')
+    where A is symmetric."""
+    with ctx.task(
+        stf.exec_place.device(c.devid()),
+        a.handle().read(),
+        b.handle().read(),
+        c.handle().rw(),
+    ) as t:
+        d_a, d_b, d_c = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            # Reconstruct full symmetric matrix from lower/upper triangle
+            if uplo == "L":
+                # Lower triangle is stored
+                sym_a = cp.tril(d_a) + cp.tril(d_a, -1).T
+            else:
+                # Upper triangle is stored
+                sym_a = cp.triu(d_a) + cp.triu(d_a, 1).T
+
+            if side == "L":
+                result = alpha * (sym_a @ d_b)
+            else:  # side == 'R'
+                result = alpha * (d_b @ sym_a)
+
+            if beta == 0.0:
+                d_c[:] = result
+            elif beta == 1.0:
+                d_c[:] += result
+            else:
+                d_c[:] = result + beta * d_c
+
+
+# ============================================================================
+# Tiled operations
+# ============================================================================
+
+
+def PDPOTRF(ctx, A, uplo="L"):
+    """Parallel tiled Cholesky factorization"""
+    print("\n[PDPOTRF] Starting Cholesky factorization...")
+    assert uplo == "L", "Only lower triangular factorization supported"
+
+    for k in range(A.nt):
+        # Factorize diagonal block
+        DPOTRF(ctx, A.block(k, k))
+
+        # Update column below diagonal
+        for m in range(k + 1, A.mt):
+            DTRSM(
+                ctx,
+                A.block(k, k),
+                A.block(m, k),
+                side="R",
+                uplo="L",
+                transa="T",
+                diag="N",
+                alpha=1.0,
+            )
+
+        # Update trailing submatrix
+        for n in range(k + 1, A.nt):
+            DSYRK(
+                ctx,
+                A.block(n, k),
+                A.block(n, n),
+                uplo="L",
+                trans="N",
+                alpha=-1.0,
+                beta=1.0,
+            )
+
+            for m in range(n + 1, A.mt):
+                DGEMM(
+                    ctx,
+                    A.block(m, k),
+                    A.block(n, k),
+                    A.block(m, n),
+                    transa="N",
+                    transb="T",
+                    alpha=-1.0,
+                    beta=1.0,
+                )
+
+    print("[PDPOTRF] Completed")
+
+
+def PDTRTRI(ctx, A, uplo="L", diag="N"):
+    """Parallel tiled triangular matrix inversion"""
+    print("\n[PDTRTRI] Starting triangular inversion...")
+    assert uplo == "L", "Only lower triangular inversion supported"
+
+    for k in range(A.nt):
+        # Step 1: Update A[m,k] for m > k
+        for m in range(k + 1, A.mt):
+            DTRSM(
+                ctx,
+                A.block(k, k),
+                A.block(m, k),
+                side="R",
+                uplo="L",
+                transa="N",
+                diag=diag,
+                alpha=-1.0,
+            )
+
+        # Step 2: Update A[m,n] for m > k, n < k
+        for m in range(k + 1, A.mt):
+            for n in range(k):
+                DGEMM(
+                    ctx,
+                    A.block(m, k),
+                    A.block(k, n),
+                    A.block(m, n),
+                    transa="N",
+                    transb="N",
+                    alpha=1.0,
+                    beta=1.0,
+                )
+
+        # Step 3: Update A[k,n] for n < k
+        for n in range(k):
+            DTRSM(
+                ctx,
+                A.block(k, k),
+                A.block(k, n),
+                side="L",
+                uplo="L",
+                transa="N",
+                diag=diag,
+                alpha=1.0,
+            )
+
+        # Step 4: Invert diagonal block A[k,k]
+        DTRTRI(ctx, A.block(k, k), uplo=uplo, diag=diag)
+
+    print("[PDTRTRI] Completed")
+
+
+def DLAAUM(ctx, a, uplo="L"):
+    """Compute A^T * A for a triangular block (lauum operation)"""
+    with ctx.task(stf.exec_place.device(a.devid()), a.handle().rw()) as t:
+        d_block = get_cupy_arrays(t)
+        with cp.cuda.ExternalStream(t.stream_ptr()):
+            # lauum: compute L * L^T for lower triangular L
+            if uplo == "L":
+                L = cp.tril(d_block)
+                d_block[:] = L @ L.T
+            else:
+                U = cp.triu(d_block)
+                d_block[:] = U.T @ U
+
+
+def PDLAUUM(ctx, A, uplo="L"):
+    """Parallel tiled computation of A^T * A for lower triangular A"""
+    print("\n[PDLAUUM] Starting LAUUM (A^T * A)...")
+    assert uplo == "L", "Only lower triangular LAUUM supported"
+
+    for k in range(A.mt):
+        # Step 1: Update off-diagonal blocks
+        for n in range(k):
+            # Update A[n,n] with A[k,n]^T * A[k,n]
+            DSYRK(
+                ctx,
+                A.block(k, n),
+                A.block(n, n),
+                uplo="L",
+                trans="T",
+                alpha=1.0,
+                beta=1.0,
+            )
+
+            # Update A[m,n] with A[k,m]^T * A[k,n]
+            for m in range(n + 1, k):
+                DGEMM(
+                    ctx,
+                    A.block(k, m),
+                    A.block(k, n),
+                    A.block(m, n),
+                    transa="T",
+                    transb="N",
+                    alpha=1.0,
+                    beta=1.0,
+                )
+
+        # Step 2: Update A[k,n] = A[k,k]^T * A[k,n]
+        for n in range(k):
+            DTRMM(
+                ctx,
+                A.block(k, k),
+                A.block(k, n),
+                side="L",
+                uplo="L",
+                transa="T",
+                diag="N",
+                alpha=1.0,
+            )
+
+        # Step 3: Update diagonal block A[k,k] = A[k,k]^T * A[k,k]
+        DLAAUM(ctx, A.block(k, k), uplo=uplo)
+
+    print("[PDLAUUM] Completed")
+
+
+def PDGEMM(ctx, A, B, C, transa="N", transb="N", alpha=1.0, beta=1.0):
+    """Parallel tiled matrix multiplication"""
+    print("\n[PDGEMM] Starting matrix multiplication...")
+
+    for m in range(C.mt):
+        for n in range(C.nt):
+            inner_k = A.nt if transa == "N" else A.mt
+
+            if alpha == 0.0 or inner_k == 0:
+                # Just scale C
+                DGEMM(
+                    ctx,
+                    A.block(0, 0),
+                    B.block(0, 0),
+                    C.block(m, n),
+                    transa=transa,
+                    transb=transb,
+                    alpha=0.0,
+                    beta=beta,
+                )
+            elif transa == "N":
+                if transb == "N":
+                    for k in range(A.nt):
+                        zbeta = beta if k == 0 else 1.0
+                        DGEMM(
+                            ctx,
+                            A.block(m, k),
+                            B.block(k, n),
+                            C.block(m, n),
+                            transa="N",
+                            transb="N",
+                            alpha=alpha,
+                            beta=zbeta,
+                        )
+                else:
+                    for k in range(A.nt):
+                        zbeta = beta if k == 0 else 1.0
+                        DGEMM(
+                            ctx,
+                            A.block(m, k),
+                            B.block(n, k),
+                            C.block(m, n),
+                            transa="N",
+                            transb="T",
+                            alpha=alpha,
+                            beta=zbeta,
+                        )
+            else:  # transa in ['T', 'C']
+                if transb == "N":
+                    for k in range(A.mt):
+                        zbeta = beta if k == 0 else 1.0
+                        DGEMM(
+                            ctx,
+                            A.block(k, m),
+                            B.block(k, n),
+                            C.block(m, n),
+                            transa="T",
+                            transb="N",
+                            alpha=alpha,
+                            beta=zbeta,
+                        )
+                else:
+                    for k in range(A.mt):
+                        zbeta = beta if k == 0 else 1.0
+                        DGEMM(
+                            ctx,
+                            A.block(k, m),
+                            B.block(n, k),
+                            C.block(m, n),
+                            transa="T",
+                            transb="T",
+                            alpha=alpha,
+                            beta=zbeta,
+                        )
+
+    print("[PDGEMM] Completed")
+
+
+def PDSYMM(ctx, A, B, C, side="L", uplo="L", alpha=1.0, beta=1.0):
+    """Parallel tiled symmetric matrix multiplication"""
+    print("\n[PDSYMM] Starting symmetric matrix multiplication...")
+
+    for m in range(C.mt):
+        for n in range(C.nt):
+            if side == "L":
+                if uplo == "L":
+                    for k in range(C.mt):
+                        zbeta = beta if k == 0 else 1.0
+                        if k < m:
+                            DGEMM(
+                                ctx,
+                                A.block(m, k),
+                                B.block(k, n),
+                                C.block(m, n),
+                                transa="N",
+                                transb="N",
+                                alpha=alpha,
+                                beta=zbeta,
+                            )
+                        else:
+                            if k == m:
+                                DSYMM(
+                                    ctx,
+                                    A.block(k, k),
+                                    B.block(k, n),
+                                    C.block(m, n),
+                                    side=side,
+                                    uplo=uplo,
+                                    alpha=alpha,
+                                    beta=zbeta,
+                                )
+                            else:
+                                DGEMM(
+                                    ctx,
+                                    A.block(k, m),
+                                    B.block(k, n),
+                                    C.block(m, n),
+                                    transa="T",
+                                    transb="N",
+                                    alpha=alpha,
+                                    beta=zbeta,
+                                )
+            else:  # side == 'R'
+                # Similar logic for right multiplication
+                pass
+
+    print("[PDSYMM] Completed")
+
+
+def compute_norm(ctx, matrix):
+    """Compute Frobenius norm of matrix using host tasks"""
+    norm_sq = 0.0
+
+    for colb in range(matrix.nt):
+        low_rowb = colb if matrix.sym_matrix else 0
+        for rowb in range(low_rowb, matrix.mt):
+            handle = matrix.handle(rowb, colb)
+
+            # Host task to read the block and compute norm
+            def compute_block_norm(h_block):
+                nonlocal norm_sq
+                norm_sq += np.sum(h_block * h_block)
+
+            with ctx.task(stf.exec_place.host(), handle.read()) as t:
+                # Synchronize the stream before reading data
+                cp.cuda.runtime.streamSynchronize(t.stream_ptr())
+
+                h_block = cai_to_numpy(t.get_arg_cai(0))
+                compute_block_norm(h_block)
+
+    return np.sqrt(norm_sq)
+
+
+def main():
+    import argparse
+
+    parser = argparse.ArgumentParser(
+        description="Tiled POTRI (matrix inversion via Cholesky) with CUDA STF"
+    )
+    parser.add_argument(
+        "N", type=int, nargs="?", default=512, help="Matrix size (default: 512)"
+    )
+    parser.add_argument(
+        "NB", type=int, nargs="?", default=128, help="Block size (default: 128)"
+    )
+    parser.add_argument("--check", action="store_true", help="Check result (slower)")
+    args = parser.parse_args()
+
+    N = args.N
+    NB = args.NB
+    check_result = args.check
+
+    assert N % NB == 0, f"Matrix size {N} must be divisible by block size {NB}"
+
+    print("=" * 60)
+    print("Tiled POTRI (Matrix Inversion) with CUDA STF + CuPy")
+    print("=" * 60)
+    print(f"Matrix size: {N}x{N}")
+    print(f"Block size: {NB}x{NB}")
+    print(f"Number of blocks: {N // NB}x{N // NB}")
+    print(f"Check result: {check_result}")
+    print("=" * 60)
+
+    # Create STF context
+    ctx = stf.context()
+
+    # Create matrices
+    A = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=True, symbol="A")
+
+    if check_result:
+        Aref = TiledMatrix(ctx, N, N, NB, NB, is_symmetric=False, symbol="Aref")
+
+    print("\n" + "=" * 60)
+    print("Initializing matrices...")
+    print("=" * 60)
+
+    # Hilbert matrix + diagonal dominance for numerical stability
+    def hilbert(row, col):
+        return 1.0 / (col + row + 1.0) + 2.0 * N * (col == row)
+
+    A.fill(hilbert)
+    if check_result:
+        Aref.fill(hilbert)
+
+    # Measure performance
+    import time
+
+    start_time = time.time()
+
+    print("\n" + "=" * 60)
+    print("Performing POTRI (inversion via Cholesky)...")
+    print("=" * 60)
+
+    # Step 1: Cholesky factorization A = L*L^T
+    PDPOTRF(ctx, A, uplo="L")
+
+    # Step 2: Triangular inversion L^(-1)
+    PDTRTRI(ctx, A, uplo="L", diag="N")
+
+    # Step 3: Compute A^(-1) = L^(-T) * L^(-1)
+    PDLAUUM(ctx, A, uplo="L")
+
+    if check_result:
+        print("\n" + "=" * 60)
+        print("Verifying result...")
+        print("=" * 60)
+
+        # Create test vector B
+        B_potri = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_potri")
+        Bref_potri = TiledMatrix(
+            ctx, N, 1, NB, 1, is_symmetric=False, symbol="Bref_potri"
+        )
+
+        def rhs_vals(row, col):
+            return 1.0 * (row + 1)
+
+        B_potri.fill(rhs_vals)
+        Bref_potri.fill(rhs_vals)
+
+        # Compute norm of B
+        b_norm = compute_norm(ctx, Bref_potri)
+
+        # Create temporary matrix for result
+        B_tmp = TiledMatrix(ctx, N, 1, NB, 1, is_symmetric=False, symbol="B_tmp")
+
+        def zero_vals(row, col):
+            return 0.0
+
+        B_tmp.fill(zero_vals)
+
+        # Compute B_tmp = A^(-1) * B
+        PDSYMM(ctx, A, B_potri, B_tmp, side="L", uplo="L", alpha=1.0, beta=0.0)
+
+        # Compute residual: Bref = Aref * B_tmp - Bref
+        PDGEMM(
+            ctx, Aref, B_tmp, Bref_potri, transa="N", transb="N", alpha=1.0, beta=-1.0
+        )
+
+        # Compute residual norm
+        res_norm = compute_norm(ctx, Bref_potri)
+
+    print("\n" + "=" * 60)
+    print("Finalizing STF context...")
+    print("=" * 60)
+    ctx.finalize()
+
+    end_time = time.time()
+    elapsed_ms = (end_time - start_time) * 1000.0
+
+    # Compute FLOPS for POTRI
+    # POTRF: (1/3) * N^3
+    # TRTRI: (1/3) * N^3
+    # LAUUM: (1/3) * N^3
+    # Total: N^3
+    flops = float(N) ** 3
+    gflops = flops / (elapsed_ms / 1000.0) / 1e9
+
+    print("\n" + "=" * 60)
+    print("Results")
+    print("=" * 60)
+    print(f"[POTRI] Elapsed time: {elapsed_ms:.2f} ms")
+    print(f"[POTRI] Performance: {gflops:.2f} GFLOPS")
+
+    if check_result:
+        residual = res_norm / b_norm
+        print(f"\n[POTRI] ||A * (A^(-1) * B) - B||: {res_norm:.6e}")
+        print(f"[POTRI] ||B||: {b_norm:.6e}")
+        print(f"[POTRI] Residual (||A * (A^(-1) * B) - B||/||B||): {residual:.6e}")
+
+        if residual < 0.01:
+            print("\n✅ Algorithm converged successfully!")
+            return 0
+        else:
+            print(f"\n❌ Algorithm did not converge (residual {residual:.6e} >= 0.01)")
+            return 1
+
+    print("=" * 60)
+    return 0
+
+
+if __name__ == "__main__":
+    sys.exit(main())
diff --git a/python/cuda_cccl/tests/stf/numba_decorator.py b/python/cuda_cccl/tests/stf/numba_decorator.py
new file mode 100644
index 00000000000..42c14899789
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/numba_decorator.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Numba-based @jit decorator for STF. Not shipped in the wheel.
+Requires numba-cuda. Import from tests.stf when running from source.
+"""
+
+from numba import cuda
+
+from cuda.stf import context, dep, exec_place
+
+
+class stf_kernel_decorator:
+    def __init__(self, pyfunc, jit_args, jit_kwargs):
+        self._pyfunc = pyfunc
+        self._jit_args = jit_args
+        self._jit_kwargs = jit_kwargs
+        self._compiled_kernel = None
+        self._launch_cfg = None
+
+    def __getitem__(self, cfg):
+        if not (isinstance(cfg, tuple) or isinstance(cfg, list)):
+            raise TypeError("use kernel[grid, block ([, exec_place, ctx])]")
+        n = len(cfg)
+        if n not in (2, 3, 4):
+            raise TypeError(
+                "use kernel[grid, block], kernel[grid, block, exec_place], or kernel[grid, block, exec_place, ctx]"
+            )
+
+        grid_dim = cfg[0]
+        block_dim = cfg[1]
+        ctx = None
+        exec_pl = None
+
+        if n >= 3:
+            exec_pl = cfg[2]
+
+        if n == 4:
+            ctx = cfg[3]
+
+        if exec_pl is not None and not isinstance(exec_pl, exec_place):
+            raise TypeError("3rd item must be an exec_place")
+
+        if ctx is not None and not isinstance(ctx, context):
+            raise TypeError("4th item must be an STF context (or None to infer)")
+
+        self._launch_cfg = (grid_dim, block_dim, ctx, exec_pl)
+        return self
+
+    def __call__(self, *args, **kwargs):
+        if self._launch_cfg is None:
+            raise RuntimeError(
+                "launch configuration missing – use kernel[grid, block, ctx](...)"
+            )
+
+        gridDim, blockDim, ctx, exec_pl = self._launch_cfg
+
+        dep_items = []
+        for i, a in enumerate(args):
+            if isinstance(a, dep):
+                if ctx is None:
+                    ld = a.get_ld()
+                    ctx = ld.borrow_ctx_handle()
+                dep_items.append((i, a))
+
+        task_args = [exec_pl] if exec_pl else []
+        task_args.extend(a for _, a in dep_items)
+
+        with ctx.task(*task_args) as t:
+            dev_args = list(args)
+            for dep_index, (pos, _) in enumerate(dep_items):
+                cai = t.get_arg_cai(dep_index)
+                dev_args[pos] = cuda.from_cuda_array_interface(
+                    cai.__cuda_array_interface__, owner=None, sync=False
+                )
+
+            if self._compiled_kernel is None:
+                self._compiled_kernel = cuda.jit(*self._jit_args, **self._jit_kwargs)(
+                    self._pyfunc
+                )
+
+            nb_stream = cuda.external_stream(t.stream_ptr())
+            self._compiled_kernel[gridDim, blockDim, nb_stream](*dev_args, **kwargs)
+
+        return None
+
+
+def jit(*jit_args, **jit_kwargs):
+    if jit_args and callable(jit_args[0]):
+        pyfunc = jit_args[0]
+        return _build_kernel(pyfunc, (), jit_kwargs)
+
+    def _decorator(fn):
+        return _build_kernel(fn, jit_args, jit_kwargs)
+
+    return _decorator
+
+
+def _build_kernel(pyfunc, jit_args, jit_kwargs=None):
+    if jit_kwargs is None:
+        jit_kwargs = {}
+    return stf_kernel_decorator(pyfunc, jit_args, jit_kwargs)
+
+
+__all__ = ["jit", "stf_kernel_decorator"]
diff --git a/python/cuda_cccl/tests/stf/numba_helpers.py b/python/cuda_cccl/tests/stf/numba_helpers.py
new file mode 100644
index 00000000000..9156f73c095
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/numba_helpers.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+Numba helpers for cuda.stf tests. Not shipped in the wheel.
+Convert task.get_arg_cai() / task.args_cai() (stf_cai) to Numba device arrays.
+Requires numba-cuda. Import from tests.stf when running from source.
+"""
+
+from __future__ import annotations
+
+
+def get_arg_numba(task, index):
+    """Return one task argument as a Numba device array. task.get_arg_cai(index) returns stf_cai."""
+    from numba import cuda
+
+    return cuda.from_cuda_array_interface(
+        task.get_arg_cai(index), owner=None, sync=False
+    )
+
+
+def numba_arguments(task):
+    """
+    Return all task buffer arguments as Numba device arrays. Same shape as task.args_cai():
+    None, a single array, or a tuple of arrays.
+    """
+    from numba import cuda
+
+    out = task.args_cai()
+    if out is None:
+        return None
+    if isinstance(out, tuple):
+        return tuple(
+            cuda.from_cuda_array_interface(o, owner=None, sync=False) for o in out
+        )
+    return cuda.from_cuda_array_interface(out, owner=None, sync=False)
+
+
+__all__ = ["get_arg_numba", "numba_arguments"]
diff --git a/python/cuda_cccl/tests/stf/pytorch_task.py b/python/cuda_cccl/tests/stf/pytorch_task.py
new file mode 100644
index 00000000000..27651bc12b1
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/pytorch_task.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+"""
+PyTorch-integrated task context manager for STF tests/examples.
+Not shipped in the wheel. Uses task.args_cai() (CAI from cuda.stf), converts to
+torch.Tensor here so cuda.stf has no PyTorch dependency. Requires PyTorch.
+"""
+
+from __future__ import annotations
+
+
+def tensor_arg(task, index):
+    """Return one task argument as a torch.Tensor. task.get_arg_cai() returns an stf_cai (has __cuda_array_interface__)."""
+    import torch
+
+    return torch.as_tensor(task.get_arg_cai(index))
+
+
+def tensor_arguments(task):
+    """
+    Return all task buffer arguments as torch.Tensors. Same shape as task.args_cai():
+    None, a single tensor, or a tuple of tensors. task.args_cai() returns stf_cai object(s).
+    """
+    import torch
+
+    out = task.args_cai()
+    if out is None:
+        return None
+    if isinstance(out, tuple):
+        return tuple(torch.as_tensor(o) for o in out)
+    return torch.as_tensor(out)
+
+
+def pytorch_task(ctx, *args):
+    """
+    Context manager: ctx.task(*args) with PyTorch stream and tensor conversion.
+    Yields tensor(s) from task.args_cai() converted to torch, as a tuple.
+
+    Example
+    -------
+    >>> from tests.stf.pytorch_task import pytorch_task
+    >>> with pytorch_task(ctx, lX.read(), lY.rw()) as (x_tensor, y_tensor):
+    ...     y_tensor[:] = x_tensor * 2
+    """
+    try:
+        import torch.cuda as tc
+    except ImportError:
+        raise RuntimeError(
+            "pytorch_task requires PyTorch to be installed. "
+            "Install PyTorch or use ctx.task() for a raw task."
+        ) from None
+
+    t = ctx.task(*args)
+
+    class _PyTorchTaskContext:
+        _stream_ctx = None
+
+        def __enter__(self):
+            t.start()
+            self._stream_ctx = None
+            try:
+                stream = tc.ExternalStream(t.stream_ptr())
+                self._stream_ctx = tc.stream(stream)
+                self._stream_ctx.__enter__()
+            except Exception:
+                t.end()
+                raise
+            tensors = tensor_arguments(t)
+            if tensors is None:
+                return None
+            if isinstance(tensors, tuple):
+                return tensors
+            return (tensors,)
+
+        def __exit__(self, exc_type, exc_val, exc_tb):
+            try:
+                if self._stream_ctx is not None:
+                    self._stream_ctx.__exit__(exc_type, exc_val, exc_tb)
+            finally:
+                t.end()
+            return False
+
+    return _PyTorchTaskContext()
+
+
+__all__ = ["pytorch_task", "tensor_arg", "tensor_arguments"]
diff --git a/python/cuda_cccl/tests/stf/test_context.py b/python/cuda_cccl/tests/stf/test_context.py
new file mode 100644
index 00000000000..f815d267c5a
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_context.py
@@ -0,0 +1,75 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import numpy as np
+
+import cuda.stf as stf
+
+
+def test_ctx():
+    ctx = stf.context()
+    del ctx
+
+
+def test_graph_ctx():
+    ctx = stf.context(use_graph=True)
+    ctx.finalize()
+
+
+def test_ctx2():
+    X = np.ones(16, dtype=np.float32)
+    Y = np.ones(16, dtype=np.float32)
+    Z = np.ones(16, dtype=np.float32)
+
+    ctx = stf.context()
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    lZ = ctx.logical_data(Z)
+
+    t = ctx.task(lX.rw())
+    t.start()
+    t.end()
+
+    t2 = ctx.task(lX.read(), lY.rw())
+    t2.start()
+    t2.end()
+
+    t3 = ctx.task(lX.read(), lZ.rw())
+    t3.start()
+    t3.end()
+
+    t4 = ctx.task(lY.read(), lZ.rw())
+    t4.start()
+    t4.end()
+
+    del ctx
+
+
+def test_ctx3():
+    X = np.ones(16, dtype=np.float32)
+    Y = np.ones(16, dtype=np.float32)
+    Z = np.ones(16, dtype=np.float32)
+
+    ctx = stf.context()
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    lZ = ctx.logical_data(Z)
+
+    with ctx.task(lX.rw()):
+        pass
+
+    with ctx.task(lX.read(), lY.rw()):
+        pass
+
+    with ctx.task(lX.read(), lZ.rw()):
+        pass
+
+    with ctx.task(lY.read(), lZ.rw()):
+        pass
+
+    del ctx
+
+
+if __name__ == "__main__":
+    test_ctx3()
diff --git a/python/cuda_cccl/tests/stf/test_decorator.py b/python/cuda_cccl/tests/stf/test_decorator.py
new file mode 100644
index 00000000000..076cf26700d
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_decorator.py
@@ -0,0 +1,50 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import numba
+import numpy as np
+import pytest
+from numba import cuda
+from numba_decorator import jit
+
+import cuda.stf as stf
+
+numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+
+
+@jit
+def axpy(a, x, y):
+    i = cuda.grid(1)
+    if i < x.size:
+        y[i] = a * x[i] + y[i]
+
+
+@jit
+def scale(a, x):
+    i = cuda.grid(1)
+    if i < x.size:
+        x[i] = a * x[i]
+
+
+@pytest.mark.parametrize("use_graph", [True, False])
+def test_decorator(use_graph):
+    X, Y, Z = (np.ones(16, np.float32) for _ in range(3))
+
+    ctx = stf.context(use_graph=use_graph)
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    lZ = ctx.logical_data(Z)
+
+    scale[32, 64](2.0, lX.rw())
+    axpy[32, 64](2.0, lX.read(), lY.rw())
+    axpy[32, 64, stf.exec_place.device(0)](
+        2.0, lX.read(), lZ.rw()
+    )  # explicit exec place
+    axpy[32, 64](
+        2.0, lY.read(), lZ.rw(stf.data_place.device(0))
+    )  # per-dep placement override
+
+
+if __name__ == "__main__":
+    test_decorator(False)
diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py
new file mode 100644
index 00000000000..fc4f3f590e2
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch.py
@@ -0,0 +1,232 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import math
+from typing import Literal, Optional, Tuple
+
+import numpy as np
+import pytest
+
+torch = pytest.importorskip("torch")
+import torch.cuda as tc  # noqa: E402
+from pytorch_task import tensor_arguments  # noqa: E402
+
+import cuda.stf as stf  # noqa: E402
+
+try:
+    import matplotlib.pyplot as plt
+
+    has_matplotlib = True
+except ImportError:
+    has_matplotlib = False
+
+Plane = Literal["xy", "xz", "yz"]
+
+
+def show_slice(t3d, plane="xy", index=None):
+    """Display a 2D slice of a 3D tensor (requires matplotlib)."""
+    if not has_matplotlib:
+        return
+
+    # grab a 2D view
+    if plane == "xy":
+        idx = t3d.shape[2] // 2 if index is None else index
+        slice2d = t3d[:, :, idx]
+    elif plane == "xz":
+        idx = t3d.shape[1] // 2 if index is None else index
+        slice2d = t3d[:, idx, :]
+    elif plane == "yz":
+        idx = t3d.shape[0] // 2 if index is None else index
+        slice2d = t3d[idx, :, :]
+    else:
+        raise ValueError("plane must be 'xy', 'xz' or 'yz'")
+
+    # move to cpu numpy array
+    arr = slice2d.detach().cpu().numpy()
+
+    # imshow = "imshow" not "imread"
+    plt.imshow(
+        arr,
+        origin="lower",
+        cmap="seismic",
+        vmin=-1e-2,
+        vmax=1e-2,
+        #        norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0)
+        #         norm=LogNorm(vmin=1e-12, vmax=1e-6)
+    )
+    # plt.colorbar()
+    plt.show(block=False)
+    plt.pause(0.01)
+
+
+def test_fdtd_3d_pytorch(
+    size_x: int = 150,
+    size_y: int = 150,
+    size_z: int = 150,
+    timesteps: int = 10,
+    output_freq: int = 0,
+    dx: float = 0.01,
+    dy: float = 0.01,
+    dz: float = 0.01,
+    epsilon0: float = 8.85e-12,
+    mu0: float = 1.256e-6,
+    device: Optional[torch.device] = None,
+    dtype: torch.dtype = torch.float64,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor
+]:
+    ctx = stf.context()
+
+    # allocate and initialize fields
+    shape = (size_x, size_y, size_z)
+
+    # Electric field components (initialized to zero)
+    lex = ctx.logical_data_zeros(shape, dtype=np.float64)
+    ley = ctx.logical_data_zeros(shape, dtype=np.float64)
+    lez = ctx.logical_data_zeros(shape, dtype=np.float64)
+
+    # Magnetic field components (initialized to zero)
+    lhx = ctx.logical_data_zeros(shape, dtype=np.float64)
+    lhy = ctx.logical_data_zeros(shape, dtype=np.float64)
+    lhz = ctx.logical_data_zeros(shape, dtype=np.float64)
+
+    # Material properties
+    lepsilon = ctx.logical_data_full(shape, float(epsilon0), dtype=np.float64)
+    lmu = ctx.logical_data_full(shape, float(mu0), dtype=np.float64)
+
+    # CFL (same formula as example)
+    dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0)
+
+    # Es (interior) = [1..N-2] along all dims -> enables i-1, j-1, k-1
+    i_es, j_es, k_es = slice(1, -1), slice(1, -1), slice(1, -1)
+    i_es_m, j_es_m, k_es_m = slice(0, -2), slice(0, -2), slice(0, -2)
+
+    # Hs (base) = [0..N-2] along all dims -> enables i+1, j+1, k+1
+    i_hs, j_hs, k_hs = slice(0, -1), slice(0, -1), slice(0, -1)
+    i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None)
+
+    # source location (single cell at center)
+    cx, cy, cz = size_x // 2, size_y // 10, size_z // 2
+
+    def source(t: float, x: float, y: float, z: float) -> float:
+        # sin(k*x - omega*t) with f = 1e9 Hz
+        pi = math.pi
+        freq = 1.0e9
+        omega = 2.0 * pi * freq
+        wavelength = 3.0e8 / freq
+        k = 2.0 * pi / wavelength
+        return math.sin(k * x - omega * t)
+
+    for n in range(int(timesteps)):
+        # -------------------------
+        # update electric fields (Es)
+        # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))]
+        with (
+            ctx.task(lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as t,
+            tc.stream(tc.ExternalStream(t.stream_ptr())),
+        ):
+            ex, hy, hz, epsilon = tensor_arguments(t)
+            ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (
+                dt / (epsilon[i_es, j_es, k_es] * dx)
+            ) * (
+                (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es])
+                - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m])
+            )
+
+        # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))]
+        with (
+            ctx.task(ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as t,
+            tc.stream(tc.ExternalStream(t.stream_ptr())),
+        ):
+            ey, hx, hz, epsilon = tensor_arguments(t)
+            ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (
+                dt / (epsilon[i_es, j_es, k_es] * dy)
+            ) * (
+                (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m])
+                - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es])
+            )
+
+        # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))]
+        with (
+            ctx.task(lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as t,
+            tc.stream(tc.ExternalStream(t.stream_ptr())),
+        ):
+            ez, hx, hy, epsilon = tensor_arguments(t)
+            ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (
+                dt / (epsilon[i_es, j_es, k_es] * dz)
+            ) * (
+                (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es])
+                - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es])
+            )
+
+        # source at center cell
+        with (
+            ctx.task(lez.rw()) as t,
+            tc.stream(tc.ExternalStream(t.stream_ptr())),
+        ):
+            ez = tensor_arguments(t)
+            ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz)
+
+        # -------------------------
+        # update magnetic fields (Hs)
+        # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))]
+        with (
+            ctx.task(lhx.rw(), ley.read(), lez.read(), lmu.read()) as t,
+            tc.stream(tc.ExternalStream(t.stream_ptr())),
+        ):
+            hx, ey, ez, mu = tensor_arguments(t)
+            hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (
+                dt / (mu[i_hs, j_hs, k_hs] * dy)
+            ) * (
+                (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs])
+                - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs])
+            )
+
+        # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))]
+        with (
+            ctx.task(lhy.rw(), lex.read(), lez.read(), lmu.read()) as t,
+            tc.stream(tc.ExternalStream(t.stream_ptr())),
+        ):
+            hy, ex, ez, mu = tensor_arguments(t)
+            hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (
+                dt / (mu[i_hs, j_hs, k_hs] * dz)
+            ) * (
+                (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs])
+                - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs])
+            )
+
+        # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))]
+        with (
+            ctx.task(lhz.rw(), lex.read(), ley.read(), lmu.read()) as t,
+            tc.stream(tc.ExternalStream(t.stream_ptr())),
+        ):
+            hz, ex, ey, mu = tensor_arguments(t)
+            hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (
+                dt / (mu[i_hs, j_hs, k_hs] * dx)
+            ) * (
+                (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs])
+                - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs])
+            )
+
+        if output_freq > 0 and (n % output_freq) == 0:
+            with (
+                ctx.task(lez.read()) as t,
+                tc.stream(tc.ExternalStream(t.stream_ptr())),
+            ):
+                ez = tensor_arguments(t)
+                print(f"{n}\t{ez[cx, cy, cz].item():.6e}")
+                if has_matplotlib:
+                    show_slice(ez, plane="xy")
+            pass
+
+    ctx.finalize()
+
+
+if __name__ == "__main__":
+    # Run FDTD simulation
+    output_freq = 5 if has_matplotlib else 0
+    if not has_matplotlib and output_freq > 0:
+        print("Warning: matplotlib not available, running without visualization")
+        output_freq = 0
+    test_fdtd_3d_pytorch(timesteps=1000, output_freq=output_freq)
diff --git a/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py
new file mode 100644
index 00000000000..85a9f7f9c51
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_fdtd_pytorch_simplified.py
@@ -0,0 +1,233 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import math
+from typing import Literal, Optional, Tuple
+
+import numpy as np
+import pytest
+
+torch = pytest.importorskip("torch")
+
+from pytorch_task import pytorch_task  # noqa: E402
+
+import cuda.stf as stf  # noqa: E402
+
+try:
+    import matplotlib.pyplot as plt
+
+    has_matplotlib = True
+except ImportError:
+    has_matplotlib = False
+
+Plane = Literal["xy", "xz", "yz"]
+
+
+def show_slice(t3d, plane="xy", index=None):
+    """Display a 2D slice of a 3D tensor (requires matplotlib)."""
+    if not has_matplotlib:
+        return
+
+    # grab a 2D view
+    if plane == "xy":
+        idx = t3d.shape[2] // 2 if index is None else index
+        slice2d = t3d[:, :, idx]
+    elif plane == "xz":
+        idx = t3d.shape[1] // 2 if index is None else index
+        slice2d = t3d[:, idx, :]
+    elif plane == "yz":
+        idx = t3d.shape[0] // 2 if index is None else index
+        slice2d = t3d[idx, :, :]
+    else:
+        raise ValueError("plane must be 'xy', 'xz' or 'yz'")
+
+    # move to cpu numpy array
+    arr = slice2d.detach().cpu().numpy()
+
+    # imshow = "imshow" not "imread"
+    plt.imshow(
+        arr,
+        origin="lower",
+        cmap="seismic",
+        vmin=-1e-2,
+        vmax=1e-2,
+        #        norm=SymLogNorm(linthresh=1e-8, vmin=-1e-0, vmax=1e-0)
+        #         norm=LogNorm(vmin=1e-12, vmax=1e-6)
+    )
+    # plt.colorbar()
+    plt.show(block=False)
+    plt.pause(0.01)
+
+
+def test_fdtd_3d_pytorch_simplified(
+    size_x: int = 150,
+    size_y: int = 150,
+    size_z: int = 150,
+    timesteps: int = 10,
+    output_freq: int = 0,
+    dx: float = 0.01,
+    dy: float = 0.01,
+    dz: float = 0.01,
+    epsilon0: float = 8.85e-12,
+    mu0: float = 1.256e-6,
+    device: Optional[torch.device] = None,
+    dtype: torch.dtype = torch.float64,
+) -> Tuple[
+    torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor
+]:
+    """
+    FDTD 3D implementation using pytorch_task for simplified syntax.
+    Demonstrates automatic stream and tensor management.
+    """
+    ctx = stf.context()
+
+    # allocate and initialize fields
+    shape = (size_x, size_y, size_z)
+
+    # Electric field components (initialized to zero)
+    lex = ctx.logical_data_zeros(shape, dtype=np.float64)
+    ley = ctx.logical_data_zeros(shape, dtype=np.float64)
+    lez = ctx.logical_data_zeros(shape, dtype=np.float64)
+
+    # Magnetic field components (initialized to zero)
+    lhx = ctx.logical_data_zeros(shape, dtype=np.float64)
+    lhy = ctx.logical_data_zeros(shape, dtype=np.float64)
+    lhz = ctx.logical_data_zeros(shape, dtype=np.float64)
+
+    # Material properties
+    lepsilon = ctx.logical_data_full(shape, float(epsilon0), dtype=np.float64)
+    lmu = ctx.logical_data_full(shape, float(mu0), dtype=np.float64)
+
+    # CFL (same formula as example)
+    dt = 0.25 * min(dx, dy, dz) * math.sqrt(epsilon0 * mu0)
+
+    # Es (interior) = [1..N-2] along all dims -> enables i-1, j-1, k-1
+    i_es, j_es, k_es = slice(1, -1), slice(1, -1), slice(1, -1)
+    i_es_m, j_es_m, k_es_m = slice(0, -2), slice(0, -2), slice(0, -2)
+
+    # Hs (base) = [0..N-2] along all dims -> enables i+1, j+1, k+1
+    i_hs, j_hs, k_hs = slice(0, -1), slice(0, -1), slice(0, -1)
+    i_hs_p, j_hs_p, k_hs_p = slice(1, None), slice(1, None), slice(1, None)
+
+    # source location (single cell at center)
+    cx, cy, cz = size_x // 2, size_y // 10, size_z // 2
+
+    def source(t: float, x: float, y: float, z: float) -> float:
+        # sin(k*x - omega*t) with f = 1e9 Hz
+        pi = math.pi
+        freq = 1.0e9
+        omega = 2.0 * pi * freq
+        wavelength = 3.0e8 / freq
+        k = 2.0 * pi / wavelength
+        return math.sin(k * x - omega * t)
+
+    for n in range(int(timesteps)):
+        # -------------------------
+        # update electric fields (Es)
+        # Ex(i,j,k) += (dt/(ε*dx)) * [(Hz(i,j,k)-Hz(i,j-1,k)) - (Hy(i,j,k)-Hy(i,j,k-1))]
+        with pytorch_task(ctx, lex.rw(), lhy.read(), lhz.read(), lepsilon.read()) as (
+            ex,
+            hy,
+            hz,
+            epsilon,
+        ):
+            ex[i_es, j_es, k_es] = ex[i_es, j_es, k_es] + (
+                dt / (epsilon[i_es, j_es, k_es] * dx)
+            ) * (
+                (hz[i_es, j_es, k_es] - hz[i_es, j_es_m, k_es])
+                - (hy[i_es, j_es, k_es] - hy[i_es, j_es, k_es_m])
+            )
+
+        # Ey(i,j,k) += (dt/(ε*dy)) * [(Hx(i,j,k)-Hx(i,j,k-1)) - (Hz(i,j,k)-Hz(i-1,j,k))]
+        with pytorch_task(ctx, ley.rw(), lhx.read(), lhz.read(), lepsilon.read()) as (
+            ey,
+            hx,
+            hz,
+            epsilon,
+        ):
+            ey[i_es, j_es, k_es] = ey[i_es, j_es, k_es] + (
+                dt / (epsilon[i_es, j_es, k_es] * dy)
+            ) * (
+                (hx[i_es, j_es, k_es] - hx[i_es, j_es, k_es_m])
+                - (hz[i_es, j_es, k_es] - hz[i_es_m, j_es, k_es])
+            )
+
+        # Ez(i,j,k) += (dt/(ε*dz)) * [(Hy(i,j,k)-Hy(i-1,j,k)) - (Hx(i,j,k)-Hx(i,j-1,k))]
+        with pytorch_task(ctx, lez.rw(), lhx.read(), lhy.read(), lepsilon.read()) as (
+            ez,
+            hx,
+            hy,
+            epsilon,
+        ):
+            ez[i_es, j_es, k_es] = ez[i_es, j_es, k_es] + (
+                dt / (epsilon[i_es, j_es, k_es] * dz)
+            ) * (
+                (hy[i_es, j_es, k_es] - hy[i_es_m, j_es, k_es])
+                - (hx[i_es, j_es, k_es] - hx[i_es, j_es_m, k_es])
+            )
+
+        # source at center cell
+        with pytorch_task(ctx, lez.rw()) as (ez,):
+            ez[cx, cy, cz] = ez[cx, cy, cz] + source(n * dt, cx * dx, cy * dy, cz * dz)
+
+        # -------------------------
+        # update magnetic fields (Hs)
+        # Hx(i,j,k) -= (dt/(μ*dy)) * [(Ez(i,j+1,k)-Ez(i,j,k)) - (Ey(i,j,k+1)-Ey(i,j,k))]
+        with pytorch_task(ctx, lhx.rw(), ley.read(), lez.read(), lmu.read()) as (
+            hx,
+            ey,
+            ez,
+            mu,
+        ):
+            hx[i_hs, j_hs, k_hs] = hx[i_hs, j_hs, k_hs] - (
+                dt / (mu[i_hs, j_hs, k_hs] * dy)
+            ) * (
+                (ez[i_hs, j_hs_p, k_hs] - ez[i_hs, j_hs, k_hs])
+                - (ey[i_hs, j_hs, k_hs_p] - ey[i_hs, j_hs, k_hs])
+            )
+
+        # Hy(i,j,k) -= (dt/(μ*dz)) * [(Ex(i,j,k+1)-Ex(i,j,k)) - (Ez(i+1,j,k)-Ez(i,j,k))]
+        with pytorch_task(ctx, lhy.rw(), lex.read(), lez.read(), lmu.read()) as (
+            hy,
+            ex,
+            ez,
+            mu,
+        ):
+            hy[i_hs, j_hs, k_hs] = hy[i_hs, j_hs, k_hs] - (
+                dt / (mu[i_hs, j_hs, k_hs] * dz)
+            ) * (
+                (ex[i_hs, j_hs, k_hs_p] - ex[i_hs, j_hs, k_hs])
+                - (ez[i_hs_p, j_hs, k_hs] - ez[i_hs, j_hs, k_hs])
+            )
+
+        # Hz(i,j,k) -= (dt/(μ*dx)) * [(Ey(i+1,j,k)-Ey(i,j,k)) - (Ex(i,j+1,k)-Ex(i,j,k))]
+        with pytorch_task(ctx, lhz.rw(), lex.read(), ley.read(), lmu.read()) as (
+            hz,
+            ex,
+            ey,
+            mu,
+        ):
+            hz[i_hs, j_hs, k_hs] = hz[i_hs, j_hs, k_hs] - (
+                dt / (mu[i_hs, j_hs, k_hs] * dx)
+            ) * (
+                (ey[i_hs_p, j_hs, k_hs] - ey[i_hs, j_hs, k_hs])
+                - (ex[i_hs, j_hs_p, k_hs] - ex[i_hs, j_hs, k_hs])
+            )
+
+        if output_freq > 0 and (n % output_freq) == 0:
+            with pytorch_task(ctx, lez.read()) as (ez,):
+                print(f"{n}\t{ez[cx, cy, cz].item():.6e}")
+                if has_matplotlib:
+                    show_slice(ez, plane="xy")
+
+    ctx.finalize()
+
+
+if __name__ == "__main__":
+    # Run simplified FDTD simulation using pytorch_task
+    output_freq = 5 if has_matplotlib else 0
+    if not has_matplotlib and output_freq > 0:
+        print("Warning: matplotlib not available, running without visualization")
+        output_freq = 0
+    test_fdtd_3d_pytorch_simplified(timesteps=1000, output_freq=output_freq)
diff --git a/python/cuda_cccl/tests/stf/test_fhe.py b/python/cuda_cccl/tests/stf/test_fhe.py
new file mode 100644
index 00000000000..7d58c02fbeb
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_fhe.py
@@ -0,0 +1,143 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Toy Fully Homomorphic Encryption (FHE) example with addition encryption
+
+import numba
+from numba import cuda
+from numba_helpers import numba_arguments
+
+import cuda.stf as stf
+
+numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+
+
+class Plaintext:
+    def __init__(self, ctx, values=None, ld=None, key=0x42, name=None):
+        self.ctx = ctx
+        self.key = key
+        if ld is not None:
+            self.l = ld
+        if values is not None:
+            self.values = bytearray(values)
+            self.l = ctx.logical_data(self.values, name=name)
+
+    def encrypt(self) -> "Ciphertext":
+        encrypted = bytearray([(c + self.key) & 0xFF for c in self.values])
+        return Ciphertext(self.ctx, values=encrypted, key=self.key)
+
+    def print_values(self):
+        with self.ctx.task(
+            stf.exec_place.host(), self.l.read(stf.data_place.managed())
+        ) as t:
+            nb_stream = cuda.external_stream(t.stream_ptr())
+            nb_stream.synchronize()
+            hvalues = numba_arguments(t)
+            print([v for v in hvalues])
+
+
+@cuda.jit
+def add_kernel(a, b, out):
+    i = cuda.grid(1)
+    if i < out.size:
+        out[i] = (a[i] + b[i]) & 0xFF
+
+
+@cuda.jit
+def sub_kernel(a, b, out):
+    i = cuda.grid(1)
+    if i < out.size:
+        out[i] = (a[i] - b[i]) & 0xFF
+
+
+@cuda.jit
+def sub_scalar_kernel(a, out, v):
+    i = cuda.grid(1)
+    if i < out.size:
+        out[i] = (a[i] - v) & 0xFF
+
+
+class Ciphertext:
+    def __init__(self, ctx, values=None, ld=None, key=0x42, name=None):
+        self.ctx = ctx
+        self.key = key
+        if ld is not None:
+            self.l = ld
+        if values is not None:
+            self.values = bytearray(values)
+            self.l = ctx.logical_data(self.values, name=name)
+
+    def __add__(self, other):
+        if not isinstance(other, Ciphertext):
+            return NotImplemented
+        result = self.empty_like()
+        with self.ctx.task(self.l.read(), other.l.read(), result.l.write()) as t:
+            nb_stream = cuda.external_stream(t.stream_ptr())
+            da, db, dresult = numba_arguments(t)
+            add_kernel[32, 16, nb_stream](da, db, dresult)
+        return result
+
+    def __sub__(self, other):
+        if not isinstance(other, Ciphertext):
+            return NotImplemented
+        result = self.empty_like()
+        with self.ctx.task(self.l.read(), other.l.read(), result.l.write()) as t:
+            nb_stream = cuda.external_stream(t.stream_ptr())
+            da, db, dresult = numba_arguments(t)
+            sub_kernel[32, 16, nb_stream](da, db, dresult)
+        return result
+
+    def decrypt(self, num_operands=2):
+        """Decrypt by subtracting num_operands * key"""
+        result = self.empty_like()
+        total_key = (num_operands * self.key) & 0xFF
+        with self.ctx.task(self.l.read(), result.l.write()) as t:
+            nb_stream = cuda.external_stream(t.stream_ptr())
+            da, dresult = numba_arguments(t)
+            sub_scalar_kernel[32, 16, nb_stream](da, dresult, total_key)
+        return Plaintext(self.ctx, ld=result.l, key=self.key)
+
+    def empty_like(self):
+        return Ciphertext(self.ctx, ld=self.l.empty_like())
+
+
+def circuit(a, b):
+    """Circuit: (A + B) + (B - A) = 2*B"""
+    return (a + b) + (b - a)
+
+
+def test_fhe():
+    """Test FHE using manual task creation with addition encryption."""
+    ctx = stf.context(use_graph=False)
+
+    vA = [3, 3, 2, 2, 17]
+    pA = Plaintext(ctx, vA, name="A")
+
+    vB = [1, 7, 7, 7, 49]
+    pB = Plaintext(ctx, vB, name="B")
+
+    expected = [circuit(a, b) & 0xFF for a, b in zip(vA, vB)]
+
+    eA = pA.encrypt()
+    eB = pB.encrypt()
+    encrypted_out = circuit(eA, eB)
+    decrypted_out = encrypted_out.decrypt(num_operands=2)
+
+    with ctx.task(
+        stf.exec_place.host(), decrypted_out.l.read(stf.data_place.managed())
+    ) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        nb_stream.synchronize()
+        hvalues = numba_arguments(t)
+        actual = [int(v) for v in hvalues]
+
+    ctx.finalize()
+
+    assert actual == expected, (
+        f"Decrypted result {actual} doesn't match expected {expected}"
+    )
+
+
+if __name__ == "__main__":
+    test_fhe()
diff --git a/python/cuda_cccl/tests/stf/test_fhe_decorator.py b/python/cuda_cccl/tests/stf/test_fhe_decorator.py
new file mode 100644
index 00000000000..0d1eb64d8bb
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_fhe_decorator.py
@@ -0,0 +1,135 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+# Toy Fully Homomorphic Encryption (FHE) example with addition encryption
+
+import numba
+from numba import cuda
+from numba_decorator import jit
+from numba_helpers import numba_arguments
+
+import cuda.stf as stf
+
+numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+
+
+class Plaintext:
+    def __init__(self, ctx, values=None, ld=None, key=0x42, name=None):
+        self.ctx = ctx
+        self.key = key
+        if ld is not None:
+            self.l = ld
+        if values is not None:
+            self.values = bytearray(values)
+            self.l = ctx.logical_data(self.values, name=name)
+
+    def encrypt(self) -> "Ciphertext":
+        encrypted = bytearray([(c + self.key) & 0xFF for c in self.values])
+        return Ciphertext(self.ctx, values=encrypted, key=self.key)
+
+    def print_values(self):
+        with self.ctx.task(
+            stf.exec_place.host(), self.l.read(stf.data_place.managed())
+        ) as t:
+            nb_stream = cuda.external_stream(t.stream_ptr())
+            nb_stream.synchronize()
+            hvalues = numba_arguments(t)
+            print([v for v in hvalues])
+
+
+@jit
+def add_kernel(a, b, out):
+    i = cuda.grid(1)
+    if i < out.size:
+        out[i] = (a[i] + b[i]) & 0xFF
+
+
+@jit
+def sub_kernel(a, b, out):
+    i = cuda.grid(1)
+    if i < out.size:
+        out[i] = (a[i] - b[i]) & 0xFF
+
+
+@jit
+def sub_scalar_kernel(a, out, v):
+    i = cuda.grid(1)
+    if i < out.size:
+        out[i] = (a[i] - v) & 0xFF
+
+
+class Ciphertext:
+    def __init__(self, ctx, values=None, ld=None, key=0x42, name=None):
+        self.ctx = ctx
+        self.key = key
+        if ld is not None:
+            self.l = ld
+        if values is not None:
+            self.values = bytearray(values)
+            self.l = ctx.logical_data(self.values, name=name)
+
+    def __add__(self, other):
+        if not isinstance(other, Ciphertext):
+            return NotImplemented
+        result = self.empty_like()
+        add_kernel[32, 16](self.l.read(), other.l.read(), result.l.write())
+        return result
+
+    def __sub__(self, other):
+        if not isinstance(other, Ciphertext):
+            return NotImplemented
+        result = self.empty_like()
+        sub_kernel[32, 16](self.l.read(), other.l.read(), result.l.write())
+        return result
+
+    def decrypt(self, num_operands=2):
+        """Decrypt by subtracting num_operands * key"""
+        result = self.empty_like()
+        total_key = (num_operands * self.key) & 0xFF
+        sub_scalar_kernel[32, 16](self.l.read(), result.l.write(), total_key)
+        return Plaintext(self.ctx, ld=result.l, key=self.key)
+
+    def empty_like(self):
+        return Ciphertext(self.ctx, ld=self.l.empty_like())
+
+
+def circuit(a, b):
+    """Circuit: (A + B) + (B - A) = 2*B"""
+    return (a + b) + (b - a)
+
+
+def test_fhe_decorator():
+    """Test FHE using @jit (numba_decorator) with addition encryption."""
+    ctx = stf.context(use_graph=False)
+
+    vA = [3, 3, 2, 2, 17]
+    pA = Plaintext(ctx, vA, name="A")
+
+    vB = [1, 7, 7, 7, 49]
+    pB = Plaintext(ctx, vB, name="B")
+
+    expected = [circuit(a, b) & 0xFF for a, b in zip(vA, vB)]
+
+    eA = pA.encrypt()
+    eB = pB.encrypt()
+    encrypted_out = circuit(eA, eB)
+    decrypted_out = encrypted_out.decrypt(num_operands=2)
+
+    with ctx.task(
+        stf.exec_place.host(), decrypted_out.l.read(stf.data_place.managed())
+    ) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        nb_stream.synchronize()
+        hvalues = numba_arguments(t)
+        actual = [int(v) for v in hvalues]
+
+    ctx.finalize()
+
+    assert actual == expected, (
+        f"Decrypted result {actual} doesn't match expected {expected}"
+    )
+
+
+if __name__ == "__main__":
+    test_fhe_decorator()
diff --git a/python/cuda_cccl/tests/stf/test_numba.py b/python/cuda_cccl/tests/stf/test_numba.py
new file mode 100644
index 00000000000..daac131c54b
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_numba.py
@@ -0,0 +1,254 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+import numba
+import numpy as np
+import pytest
+from numba import cuda
+from numba_helpers import get_arg_numba, numba_arguments
+
+import cuda.stf as stf
+
+numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+
+
+@cuda.jit
+def axpy(a, x, y):
+    i = cuda.grid(1)
+    if i < x.size:
+        y[i] = a * x[i] + y[i]
+
+
+@cuda.jit
+def scale(a, x):
+    i = cuda.grid(1)
+    if i < x.size:
+        x[i] = a * x[i]
+
+
+# One test with a single kernel in a CUDA graph
+def test_numba_graph():
+    X = np.ones(16, dtype=np.float32)
+    ctx = stf.context(use_graph=True)
+    lX = ctx.logical_data(X)
+    with ctx.task(lX.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = numba_arguments(t)
+        scale[32, 64, nb_stream](2.0, dX)
+
+    ctx.finalize()
+
+    # Verify results after finalize (data written back to host)
+    # Expected: scale(2.0, 1.0) = 2.0
+    assert np.allclose(X, 2.0)
+
+
+def test_numba():
+    n = 1024 * 1024
+    X = np.ones(n, dtype=np.float32)
+    Y = np.ones(n, dtype=np.float32)
+    Z = np.ones(n, dtype=np.float32)
+
+    ctx = stf.context()
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    lZ = ctx.logical_data(Z)
+
+    threads_per_block = 256
+    blocks = (n + threads_per_block - 1) // threads_per_block
+
+    with ctx.task(lX.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = numba_arguments(t)
+        scale[blocks, threads_per_block, nb_stream](2.0, dX)
+
+    with ctx.task(lX.read(), lY.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = get_arg_numba(t, 0)
+        dY = get_arg_numba(t, 1)
+        axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY)
+
+    with ctx.task(lX.read(), lZ.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX, dZ = numba_arguments(t)
+        axpy[blocks, threads_per_block, nb_stream](2.0, dX, dZ)
+
+    with ctx.task(lY.read(), lZ.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dY, dZ = numba_arguments(t)
+        axpy[blocks, threads_per_block, nb_stream](2.0, dY, dZ)
+
+    ctx.finalize()
+
+    # Verify results after finalize (data written back to host)
+    # Expected values:
+    # X: scale(2.0, 1.0) = 2.0
+    # Y: axpy(2.0, X=2.0, Y=1.0) = 2.0*2.0 + 1.0 = 5.0
+    # Z: axpy(2.0, X=2.0, Z=1.0) = 5.0, then axpy(2.0, Y=5.0, Z=5.0) = 15.0
+    assert np.allclose(X, 2.0)
+    assert np.allclose(Y, 5.0)
+    assert np.allclose(Z, 15.0)
+
+
+@cuda.jit
+def laplacian_5pt_kernel(u_in, u_out, dx, dy):
+    """
+    Compute a 5-point Laplacian on u_in and write the result to u_out.
+
+    Grid-stride 2-D kernel.  Assumes C-contiguous (row-major) inputs.
+    Boundary cells are copied unchanged.
+    """
+    coef_x = 1.0 / (dx * dx)
+    coef_y = 1.0 / (dy * dy)
+
+    i, j = cuda.grid(2)  # i <-> row (x-index), j <-> col (y-index)
+    nx, ny = u_in.shape
+
+    if i >= nx or j >= ny:
+        return  # out-of-bounds threads do nothing
+
+    if 0 < i < nx - 1 and 0 < j < ny - 1:
+        u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + (
+            u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1]
+        ) * coef_y
+    else:
+        # simple Dirichlet/Neumann placeholder: copy input to output
+        u_out[i, j] = u_in[i, j]
+
+
+def test_numba2d():
+    nx, ny = 1024, 1024
+    dx = 2.0 * np.pi / (nx - 1)
+    dy = 2.0 * np.pi / (ny - 1)
+
+    # a smooth test field: f(x,y) = sin(x) * cos(y)
+    x = np.linspace(0, 2 * np.pi, nx, dtype=np.float64)
+    y = np.linspace(0, 2 * np.pi, ny, dtype=np.float64)
+
+    u = np.sin(x)[:, None] * np.cos(y)[None, :]  # shape = (nx, ny)
+    u_out = np.zeros_like(u)
+
+    ctx = stf.context()
+    lu = ctx.logical_data(u)
+    lu_out = ctx.logical_data(u_out)
+
+    with ctx.task(lu.read(), lu_out.write()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        du = get_arg_numba(t, 0)
+        du_out = get_arg_numba(t, 1)
+        threads_per_block = (16, 16)  # 256 threads per block is a solid starting point
+        blocks_per_grid = (
+            (nx + threads_per_block[0] - 1) // threads_per_block[0],
+            (ny + threads_per_block[1] - 1) // threads_per_block[1],
+        )
+        laplacian_5pt_kernel[blocks_per_grid, threads_per_block, nb_stream](
+            du, du_out, dx, dy
+        )
+
+    ctx.finalize()
+
+    u_out_ref = np.zeros_like(u)
+
+    for i in range(1, nx - 1):  # skip boundaries
+        for j in range(1, ny - 1):
+            u_out_ref[i, j] = (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + (
+                u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1]
+            ) / dy**2
+
+    # copy boundaries
+    u_out_ref[0, :] = u[0, :]
+    u_out_ref[-1, :] = u[-1, :]
+    u_out_ref[:, 0] = u[:, 0]
+    u_out_ref[:, -1] = u[:, -1]
+
+    # compare with the GPU result
+    assert np.allclose(u_out, u_out_ref, rtol=1e-6, atol=1e-6)
+
+
+def test_numba_exec_place():
+    X = np.ones(16, dtype=np.float32)
+    Y = np.ones(16, dtype=np.float32)
+    Z = np.ones(16, dtype=np.float32)
+
+    ctx = stf.context()
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    lZ = ctx.logical_data(Z)
+
+    with ctx.task(stf.exec_place.device(0), lX.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = get_arg_numba(t, 0)
+        scale[32, 64, nb_stream](2.0, dX)
+
+    with ctx.task(stf.exec_place.device(0), lX.read(), lY.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = get_arg_numba(t, 0)
+        dY = get_arg_numba(t, 1)
+        axpy[32, 64, nb_stream](2.0, dX, dY)
+
+    with ctx.task(
+        stf.exec_place.device(0),
+        lX.read(stf.data_place.managed()),
+        lZ.rw(stf.data_place.managed()),
+    ) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = get_arg_numba(t, 0)
+        dZ = get_arg_numba(t, 1)
+        axpy[32, 64, nb_stream](2.0, dX, dZ)
+
+    with ctx.task(stf.exec_place.device(0), lY.read(), lZ.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dY = get_arg_numba(t, 0)
+        dZ = get_arg_numba(t, 1)
+        axpy[32, 64, nb_stream](2.0, dY, dZ)
+
+
+def test_numba_places():
+    if len(list(cuda.gpus)) < 2:
+        pytest.skip("Need at least 2 GPUs")
+        return
+
+    X = np.ones(16, dtype=np.float32)
+    Y = np.ones(16, dtype=np.float32)
+    Z = np.ones(16, dtype=np.float32)
+
+    ctx = stf.context()
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    lZ = ctx.logical_data(Z)
+
+    with ctx.task(lX.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = numba_arguments(t)
+        scale[32, 64, nb_stream](2.0, dX)
+
+    with ctx.task(lX.read(), lY.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = get_arg_numba(t, 0)
+        dY = get_arg_numba(t, 1)
+        axpy[32, 64, nb_stream](2.0, dX, dY)
+
+    with ctx.task(stf.exec_place.device(1), lX.read(), lZ.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = get_arg_numba(t, 0)
+        dZ = get_arg_numba(t, 1)
+        axpy[32, 64, nb_stream](2.0, dX, dZ)
+
+    with ctx.task(lY.read(), lZ.rw(stf.data_place.device(1))) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dY = get_arg_numba(t, 0)
+        dZ = get_arg_numba(t, 1)
+        axpy[32, 64, nb_stream](2.0, dY, dZ)
+
+    ctx.finalize()
+    # Same expected values as test_numba (X=2, Y=5, Z=15) with multi-GPU placement
+    assert np.allclose(X, 2.0)
+    assert np.allclose(Y, 5.0)
+    assert np.allclose(Z, 15.0)
+
+
+if __name__ == "__main__":
+    test_numba_graph()
+    # test_numba()
diff --git a/python/cuda_cccl/tests/stf/test_pytorch.py b/python/cuda_cccl/tests/stf/test_pytorch.py
new file mode 100644
index 00000000000..ac03c27e00d
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_pytorch.py
@@ -0,0 +1,118 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+
+import numpy as np
+import pytest
+
+torch = pytest.importorskip("torch")
+
+from pytorch_task import (  # noqa: E402
+    pytorch_task,
+    tensor_arg,
+    tensor_arguments,
+)
+
+import cuda.stf as stf  # noqa: E402
+
+
+def test_pytorch():
+    n = 1024 * 1024
+    X = np.ones(n, dtype=np.float32)
+    Y = np.ones(n, dtype=np.float32)
+    Z = np.ones(n, dtype=np.float32)
+
+    ctx = stf.context()
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    lZ = ctx.logical_data(Z)
+
+    with ctx.task(lX.rw()) as t:
+        torch_stream = torch.cuda.ExternalStream(t.stream_ptr())
+        with torch.cuda.stream(torch_stream):
+            tX = tensor_arguments(t)
+            tX[:] = tX * 2  # In-place multiplication
+
+    with ctx.task(lX.read(), lY.write()) as t:
+        torch_stream = torch.cuda.ExternalStream(t.stream_ptr())
+        with torch.cuda.stream(torch_stream):
+            tX = tensor_arg(t, 0)
+            tY = tensor_arg(t, 1)
+            tY[:] = tX * 2  # Copy result into tY tensor
+
+    with (
+        ctx.task(lX.read(), lZ.write()) as t,
+        torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())),
+    ):
+        tX, tZ = tensor_arguments(t)
+        tZ[:] = tX * 4 + 1  # Copy result into tZ tensor
+
+    with (
+        ctx.task(lY.read(), lZ.rw()) as t,
+        torch.cuda.stream(torch.cuda.ExternalStream(t.stream_ptr())),
+    ):
+        tY, tZ = tensor_arguments(t)
+        tZ[:] = tY * 2 - 3  # Copy result into tZ tensor
+
+    ctx.finalize()
+
+    # Verify results on host after finalize
+    # Expected values:
+    # X: 1.0 -> 2.0 (multiplied by 2)
+    # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0)
+    # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0)
+    assert np.allclose(X, 2.0)
+    assert np.allclose(Y, 4.0)
+    assert np.allclose(Z, 5.0)
+
+
+def test_pytorch_task():
+    """Test the pytorch_task functionality with simplified syntax"""
+    n = 1024 * 1024
+    X = np.ones(n, dtype=np.float32)
+    Y = np.ones(n, dtype=np.float32)
+    Z = np.ones(n, dtype=np.float32)
+
+    ctx = stf.context()
+
+    # Note: We could use ctx.logical_data_full instead of creating NumPy arrays first
+    # For example: lX = ctx.logical_data_full((n,), 1.0, dtype=np.float32)
+    # However, this would create logical data without underlying NumPy arrays,
+    # so we wouldn't be able to check results after ctx.finalize() in this test
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    lZ = ctx.logical_data(Z)
+
+    # Equivalent operations to test_pytorch() but using pytorch_task syntax
+
+    # In-place multiplication using pytorch_task (single tensor)
+    with pytorch_task(ctx, lX.rw()) as (tX,):
+        tX[:] = tX * 2
+
+    # Copy and multiply using pytorch_task (multiple tensors)
+    with pytorch_task(ctx, lX.read(), lY.write()) as (tX, tY):
+        tY[:] = tX * 2
+
+    # Another operation combining tensors
+    with pytorch_task(ctx, lX.read(), lZ.write()) as (tX, tZ):
+        tZ[:] = tX * 4 + 1
+
+    # Final operation with read-write access
+    with pytorch_task(ctx, lY.read(), lZ.rw()) as (tY, tZ):
+        tZ[:] = tY * 2 - 3
+
+    ctx.finalize()
+
+    # Verify results on host after finalize (same as original test)
+    # Expected values:
+    # X: 1.0 -> 2.0 (multiplied by 2)
+    # Y: 1.0 -> 4.0 (X * 2 = 2.0 * 2 = 4.0)
+    # Z: 1.0 -> 9.0 (X * 4 + 1 = 2.0 * 4 + 1 = 9.0) -> 5.0 (Y * 2 - 3 = 4.0 * 2 - 3 = 5.0)
+    assert np.allclose(X, 2.0)
+    assert np.allclose(Y, 4.0)
+    assert np.allclose(Z, 5.0)
+
+
+if __name__ == "__main__":
+    test_pytorch()
diff --git a/python/cuda_cccl/tests/stf/test_stencil_decorator.py b/python/cuda_cccl/tests/stf/test_stencil_decorator.py
new file mode 100644
index 00000000000..d43c4e99e6e
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_stencil_decorator.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import numba
+import numpy as np
+from numba import cuda
+from numba_decorator import jit
+
+import cuda.stf as stf
+
+numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+
+
+@jit
+def laplacian_5pt_kernel(u_in, u_out, dx, dy):
+    """
+    Compute a 5-point Laplacian on u_in and write the result to u_out.
+
+    Grid-stride 2-D kernel. Assumes C-contiguous (row-major) inputs.
+    Boundary cells are copied unchanged.
+    """
+    coef_x = 1.0 / (dx * dx)
+    coef_y = 1.0 / (dy * dy)
+
+    i, j = cuda.grid(2)  # i <-> row (x-index), j <-> col (y-index)
+    nx, ny = u_in.shape
+
+    if i >= nx or j >= ny:
+        return  # out-of-bounds threads do nothing
+
+    if 0 < i < nx - 1 and 0 < j < ny - 1:
+        u_out[i, j] = (u_in[i - 1, j] - 2.0 * u_in[i, j] + u_in[i + 1, j]) * coef_x + (
+            u_in[i, j - 1] - 2.0 * u_in[i, j] + u_in[i, j + 1]
+        ) * coef_y
+    else:
+        # simple Dirichlet/Neumann placeholder: copy input to output
+        u_out[i, j] = u_in[i, j]
+
+
+def test_numba2d():
+    nx, ny = 1024, 1024
+    dx = 2.0 * np.pi / (nx - 1)
+    dy = 2.0 * np.pi / (ny - 1)
+
+    # a smooth test field: f(x,y) = sin(x) * cos(y)
+    x = np.linspace(0, 2 * np.pi, nx, dtype=np.float64)
+    y = np.linspace(0, 2 * np.pi, ny, dtype=np.float64)
+
+    u = np.sin(x)[:, None] * np.cos(y)[None, :]  # shape = (nx, ny)
+    u_out = np.zeros_like(u)
+
+    ctx = stf.context()
+    lu = ctx.logical_data(u)
+    lu_out = ctx.logical_data(u_out)
+
+    threads_per_block = (16, 16)  # 256 threads per block is a solid starting point
+    blocks_per_grid = (
+        (nx + threads_per_block[0] - 1) // threads_per_block[0],
+        (ny + threads_per_block[1] - 1) // threads_per_block[1],
+    )
+
+    laplacian_5pt_kernel[blocks_per_grid, threads_per_block](
+        lu.read(), lu_out.write(), dx, dy
+    )
+
+    ctx.finalize()
+
+    u_out_ref = np.zeros_like(u)
+
+    for i in range(1, nx - 1):  # skip boundaries
+        for j in range(1, ny - 1):
+            u_out_ref[i, j] = (u[i - 1, j] - 2.0 * u[i, j] + u[i + 1, j]) / dx**2 + (
+                u[i, j - 1] - 2.0 * u[i, j] + u[i, j + 1]
+            ) / dy**2
+
+    # copy boundaries
+    u_out_ref[0, :] = u[0, :]
+    u_out_ref[-1, :] = u[-1, :]
+    u_out_ref[:, 0] = u[:, 0]
+    u_out_ref[:, -1] = u[:, -1]
+
+    # compare with the GPU result
+    assert np.allclose(u_out, u_out_ref, rtol=1e-6, atol=1e-6)
+
+
+if __name__ == "__main__":
+    test_numba2d()
diff --git a/python/cuda_cccl/tests/stf/test_token.py b/python/cuda_cccl/tests/stf/test_token.py
new file mode 100644
index 00000000000..e203c0357a7
--- /dev/null
+++ b/python/cuda_cccl/tests/stf/test_token.py
@@ -0,0 +1,81 @@
+# Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+import numba
+import numpy as np
+from numba import cuda
+from numba_helpers import get_arg_numba, numba_arguments
+
+import cuda.stf as stf
+
+numba.cuda.config.CUDA_LOW_OCCUPANCY_WARNINGS = 0
+
+
+def test_token():
+    ctx = stf.context()
+    lX = ctx.token()
+    lY = ctx.token()
+    lZ = ctx.token()
+
+    with ctx.task(lX.rw()):
+        pass
+
+    with ctx.task(lX.read(), lY.rw()):
+        pass
+
+    with ctx.task(lX.read(), lZ.rw()):
+        pass
+
+    with ctx.task(lY.read(), lZ.rw()):
+        pass
+
+    ctx.finalize()
+
+
+@cuda.jit
+def axpy(a, x, y):
+    start = cuda.grid(1)
+    stride = cuda.gridsize(1)
+    for i in range(start, x.size, stride):
+        y[i] = a * x[i] + y[i]
+
+
+def test_numba_token():
+    n = 1024 * 1024
+    X = np.ones(n, dtype=np.float32)
+    Y = np.ones(n, dtype=np.float32)
+
+    ctx = stf.context()
+    lX = ctx.logical_data(X)
+    lY = ctx.logical_data(Y)
+    token = ctx.token()
+
+    # Use a reasonable grid size - kernel loop will handle all elements
+    blocks = 32
+    threads_per_block = 256
+
+    with ctx.task(lX.read(), lY.rw(), token.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX = get_arg_numba(t, 0)
+        dY = get_arg_numba(t, 1)
+        axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY)
+
+    with ctx.task(lX.read(), lY.rw(), token.rw()) as t:
+        nb_stream = cuda.external_stream(t.stream_ptr())
+        dX, dY = numba_arguments(t)
+        axpy[blocks, threads_per_block, nb_stream](2.0, dX, dY)
+
+    ctx.finalize()
+
+    # Sanity checks: verify the results after finalize
+    # First task: Y = 2.0 * X + Y = 2.0 * 1.0 + 1.0 = 3.0
+    # Second task: Y = 2.0 * X + Y = 2.0 * 1.0 + 3.0 = 5.0
+    assert np.allclose(X, 1.0), f"X should still be 1.0 (read-only), but got {X[0]}"
+    assert np.allclose(Y, 5.0), (
+        f"Y should be 5.0 after two axpy operations, but got {Y[0]}"
+    )
+
+
+if __name__ == "__main__":
+    test_token()