From 841414e45daa9d108032f9dfa143dad347c692c7 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Apr 2026 01:44:22 +0000 Subject: [PATCH 01/24] Add RMM User Guide (draft from docs-overhaul branch) --- docs/conf.py | 3 +- docs/index.md | 2 +- docs/python/pylibrmm.md | 9 + docs/user_guide/choosing_memory_resources.md | 301 ++++++++ docs/user_guide/guide.md | 725 ++++++++++++------- docs/user_guide/index.md | 14 + docs/user_guide/installation.md | 181 +++++ docs/user_guide/introduction.md | 152 ++++ docs/user_guide/logging.md | 575 +++++++++++++++ docs/user_guide/managed_memory.md | 331 +++++++++ docs/user_guide/pool_allocators.md | 455 ++++++++++++ docs/user_guide/stream_ordered_allocation.md | 325 +++++++++ 12 files changed, 2811 insertions(+), 262 deletions(-) create mode 100644 docs/user_guide/choosing_memory_resources.md create mode 100644 docs/user_guide/index.md create mode 100644 docs/user_guide/installation.md create mode 100644 docs/user_guide/introduction.md create mode 100644 docs/user_guide/logging.md create mode 100644 docs/user_guide/managed_memory.md create mode 100644 docs/user_guide/pool_allocators.md create mode 100644 docs/user_guide/stream_ordered_allocation.md diff --git a/docs/conf.py b/docs/conf.py index db45dff09..c090697b8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # Configuration file for the Sphinx documentation builder. 
@@ -57,6 +57,7 @@ "sphinx.ext.intersphinx", "sphinx_copybutton", "sphinx_markdown_tables", + "sphinx_tabs.tabs", "sphinxcontrib.jquery", ] diff --git a/docs/index.md b/docs/index.md index d95428e72..21c8db3e7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@ RMM (RAPIDS Memory Manager) is a library for allocating and managing GPU memory :maxdepth: 2 :caption: Contents -user_guide/guide +user_guide/index cpp/index python/index ``` diff --git a/docs/python/pylibrmm.md b/docs/python/pylibrmm.md index 0400ac077..216680fb1 100644 --- a/docs/python/pylibrmm.md +++ b/docs/python/pylibrmm.md @@ -23,3 +23,12 @@ The stream classes are available only through `rmm.pylibrmm` and provide low-lev :undoc-members: :show-inheritance: ``` + +### rmm.pylibrmm.cuda_stream + +```{eval-rst} +.. automodule:: rmm.pylibrmm.cuda_stream + :members: + :undoc-members: + :show-inheritance: +``` diff --git a/docs/user_guide/choosing_memory_resources.md b/docs/user_guide/choosing_memory_resources.md new file mode 100644 index 000000000..55dec2064 --- /dev/null +++ b/docs/user_guide/choosing_memory_resources.md @@ -0,0 +1,301 @@ +# Choosing a Memory Resource + +One of the most common questions when using RMM is: "Which memory resource should I use?" + +This guide provides recommendations for selecting the appropriate memory resource based on your application's needs. + +## Recommended Defaults + +For most applications, use the CUDA async memory pool. + +`````{tabs} +````{code-tab} c++ +#include +#include + +rmm::mr::cuda_async_memory_resource mr; +rmm::mr::set_current_device_resource_ref(mr); +```` +````{code-tab} python +import rmm + +mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(mr) +```` +````` + +For applications exceeding GPU memory limits, use a pooled managed memory resource with prefetching. Note: managed memory is not supported on WSL2 systems. 
+ +`````{tabs} +````{code-tab} c++ +#include <rmm/cuda_device.hpp> +#include <rmm/mr/managed_memory_resource.hpp> +#include <rmm/mr/per_device_resource.hpp> +#include <rmm/mr/pool_memory_resource.hpp> +#include <rmm/mr/prefetch_resource_adaptor.hpp> + +// Use 80% of GPU memory, rounded down to nearest 256 bytes +auto [free_memory, total_memory] = rmm::available_device_memory(); +std::size_t pool_size = (static_cast<std::size_t>(total_memory * 0.8) / 256) * 256; + +rmm::mr::managed_memory_resource managed_mr; +rmm::mr::pool_memory_resource pool_mr{managed_mr, pool_size}; +rmm::mr::prefetch_resource_adaptor prefetch_mr{pool_mr}; +rmm::mr::set_current_device_resource_ref(prefetch_mr); +```` +````{code-tab} python +import rmm + +# Use 80% of GPU memory, rounded down to nearest 256 bytes +free_memory, total_memory = rmm.mr.available_device_memory() +pool_size = int(total_memory * 0.8) // 256 * 256 + +mr = rmm.mr.PrefetchResourceAdaptor( + rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), + initial_pool_size=pool_size, + ) +) +rmm.mr.set_current_device_resource(mr) +```` +````` + +## Memory Resource Considerations + +It is usually best to use resources that allow the CUDA driver to manage pool suballocation via `cudaMallocFromPoolAsync`. + +### CudaAsyncMemoryResource + +The `CudaAsyncMemoryResource` uses CUDA's driver-managed memory pool (via `cudaMallocAsync`). This is the **recommended default** for most applications.
+ +**Advantages:** +- **Driver-managed pool**: Uses efficient suballocation with virtual addressing to avoid fragmentation +- **Cross-library sharing**: The pool can be shared across multiple applications and libraries, even those not using RMM directly +- **Stream-ordered semantics**: Allocations and deallocations are stream-ordered by default +- **Performance**: Similar or better performance compared to RMM's pool implementations + +**When to use:** +- Default choice for GPU-accelerated applications +- Multi-stream or multi-threaded applications +- Applications using multiple GPU libraries (e.g., cuDF + PyTorch) +- Most production workloads + +### CudaMemoryResource + +The `CudaMemoryResource` uses `cudaMalloc` directly for each allocation, with no pooling. + +**Advantages:** +- Simple and predictable +- No fragmentation concerns +- Memory is immediately returned to the system on deallocation + +**Disadvantages:** +- Slower than pooled allocators due to synchronization overhead + +**Example:** +```python +import rmm + +rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) +``` + +**When to use:** +- Simple applications with infrequent allocations +- Debugging memory issues +- Testing or benchmarking baseline performance + +### PoolMemoryResource + +The `PoolMemoryResource` maintains a pool of memory allocated from an upstream resource. 
+ +**Advantages:** +- Fast suballocation from pre-allocated pool +- Configurable initial and maximum pool sizes + +**Disadvantages:** +- Can suffer from fragmentation (unlike async MR) +- Pool is not shared across applications +- Requires careful tuning of pool sizes + +**Example:** +```python +import rmm + +pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), # upstream resource + initial_pool_size=2**30, # 1 GiB + maximum_pool_size=2**32 # 4 GiB +) +rmm.mr.set_current_device_resource(pool) +``` + +**When to use:** +- Legacy applications (prefer `CudaAsyncMemoryResource` for new code) +- Specific tuning requirements not met by async MR +- Wrapping non-CUDA memory sources + +**Important**: If using `PoolMemoryResource`, prefer wrapping `CudaAsyncMemoryResource` as the upstream rather than `CudaMemoryResource`: + +```python +# Better: Pool wrapping async MR +pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), + initial_pool_size=2**30 +) +``` + +This combines the benefits of both: fast suballocation from RMM's pool and the driver's virtual addressing capabilities. + +### ManagedMemoryResource + +The `ManagedMemoryResource` uses CUDA unified memory (via `cudaMallocManaged`), allowing memory to be accessible from both CPU and GPU. 
+ +**Advantages:** +- Enables working with datasets larger than GPU memory +- Automatic page migration between CPU and GPU +- Simplifies memory management for host/device code + +**Disadvantages:** +- Performance overhead due to page faults and migration +- Requires careful prefetching for optimal performance + +**Example:** +```python +import rmm + +rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) +``` + +**When to use:** +- Datasets larger than available GPU memory +- Prototyping or applications where performance is not critical +- Always combine with prefetching strategies (see [Managed Memory guide](managed_memory.md)) + +### ArenaMemoryResource + +The `ArenaMemoryResource` suballocates from a single global arena obtained from the upstream resource, serving requests through per-thread arenas to reduce fragmentation and contention. + +**Advantages:** +- Better fragmentation characteristics than basic pool +- Good for mixed allocation sizes +- Predictable performance + +**Disadvantages:** +- More complex configuration +- May waste memory if bin sizes don't match allocation patterns + +**Example:** +```python +import rmm + +arena = rmm.mr.ArenaMemoryResource( + rmm.mr.CudaMemoryResource(), + arena_size=2**28 # 256 MiB global arena +) +rmm.mr.set_current_device_resource(arena) +``` + +**When to use:** +- Applications with diverse allocation sizes +- Long-running services with complex allocation patterns +- When fragmentation is observed with pool allocators + +## Composing Memory Resources + +Memory resources can be composed (wrapped) to combine their properties.
The general pattern is: + +```python +# Adaptor wrapping a base resource +adaptor = rmm.mr.SomeAdaptor(base_resource) +``` + +### Common Compositions + +**Prefetching with managed memory:** +```python +import rmm + +# Prefetch adaptor wrapping managed memory pool +base = rmm.mr.ManagedMemoryResource() +pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) +prefetch = rmm.mr.PrefetchResourceAdaptor(pool) +rmm.mr.set_current_device_resource(prefetch) +``` + +**Statistics tracking:** +```python +import rmm + +# Track allocation statistics +base = rmm.mr.CudaAsyncMemoryResource() +stats = rmm.mr.StatisticsResourceAdaptor(base) +rmm.mr.set_current_device_resource(stats) +``` + +## Multi-Library Applications + +When using RMM with multiple GPU libraries (e.g., cuDF, PyTorch, CuPy), `CudaAsyncMemoryResource` is especially important because: + +1. The driver-managed pool is shared automatically across all libraries +2. You don't need to configure every library to use RMM +3. Memory is not artificially partitioned between libraries + +**Example: RMM + PyTorch** +```python +import rmm +import torch +from rmm.allocators.torch import rmm_torch_allocator + +# Use async MR as the base +rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + +# Configure PyTorch to use RMM +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) +``` + +With this setup, both PyTorch and any other RMM-using code (like cuDF) will share the same driver-managed pool. + +## Performance Considerations + +### Async MR vs. 
Pool MR + +In most cases, `CudaAsyncMemoryResource` provides similar or better performance than `PoolMemoryResource`: + +- Both use pooling for fast suballocation +- Async MR uses virtual addressing to avoid fragmentation +- Async MR shares memory across applications + +**When Pool MR might be faster:** +- Very specific allocation patterns that align well with pool design +- Custom upstream resources (not CUDA memory) + +### Multi-stream Applications + +For applications using multiple CUDA streams or threads: + +- `CudaAsyncMemoryResource` is **strongly recommended** +- Pool allocators can create "pipeline bubbles" where streams wait for allocations +- The async MR handles stream synchronization efficiently + +## Best Practices + +1. **Set the memory resource before any allocations**: Once memory is allocated, changing the resource can lead to crashes + + ```python + import rmm + + # Do this first, before any GPU allocations + rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + ``` + +2. **Prefer async MR by default**: Unless you have specific requirements, start with `CudaAsyncMemoryResource` + +3. **Use statistics for tuning**: If you need to understand allocation patterns, wrap with `StatisticsResourceAdaptor` + +4. **Don't over-engineer**: Start simple, profile, and optimize only if needed + +## See Also + +- [Pool Allocators](pool_allocators.md) - Detailed guide on pool and arena allocators +- [Managed Memory](managed_memory.md) - Guide to using managed memory and prefetching +- [Stream-Ordered Allocation](stream_ordered_allocation.md) - Understanding stream-ordered semantics diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index b6923257b..6ab79fdc2 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -1,338 +1,543 @@ -# User Guide +# Programming Guide -Achieving optimal performance in GPU-centric workflows frequently requires -customizing how GPU ("device") memory is allocated. 
+This guide covers using RMM in C++ and Python applications, including memory resources, containers, and library integrations. -RMM is a package that enables you to allocate device memory -in a highly configurable way. For example, it enables you to -allocate and use pools of GPU memory, or to use -[managed memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) -for allocations. +## Basic Example -You can also easily configure other libraries like Numba and CuPy -to use RMM for allocating device memory. +`````{tabs} +````{code-tab} c++ +#include +#include +#include +#include -## Installation +int main() { + // Use async MR (recommended) + rmm::mr::cuda_async_memory_resource async_mr; + rmm::mr::set_current_device_resource_ref(async_mr); -See the project [README](https://github.com/rapidsai/rmm) for how to install RMM. + // Allocate device memory + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view()); -## Using RMM + std::cout << "Allocated " << buffer.size() << " bytes\n"; -There are two ways to use RMM in Python code: + return 0; +} +```` +````{code-tab} python +import rmm -1. Using the `rmm.DeviceBuffer` API to explicitly create and manage - device memory allocations -2. Transparently via external libraries such as CuPy and Numba +# Use async MR (recommended) +mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(mr) -RMM provides a `MemoryResource` abstraction to control _how_ device -memory is allocated in both the above uses. +# Allocate device memory +buffer = rmm.DeviceBuffer(size=1024) -### `DeviceBuffer` Objects +print(f"Allocated {buffer.size} bytes at {hex(buffer.ptr)}") +```` +````` -A `DeviceBuffer` represents an **untyped, uninitialized device memory -allocation**. `DeviceBuffer`s can be created by providing the -size of the allocation in bytes: +## Memory Resources -```python ->>> import rmm ->>> buf = rmm.DeviceBuffer(size=100) -``` +Memory resources control how device memory is allocated. 
RMM provides several resource types optimized for different use cases. -The size of the allocation and the memory address associated with it -can be accessed via the `.size` and `.ptr` attributes respectively: +### Setting the Current Resource -```python ->>> buf.size -100 ->>> buf.ptr -140202544726016 -``` +The current device resource is used by default for all allocations: -`DeviceBuffer`s can also be created by copying data from host memory: +`````{tabs} +````{code-tab} c++ +#include +#include -```python ->>> import rmm ->>> import numpy as np ->>> a = np.array([1, 2, 3], dtype='float64') ->>> buf = rmm.DeviceBuffer.to_device(a.view("uint8")) # to_device expects an unsigned 8-bit dtype ->>> buf.size -24 -``` +// Get current device resource ref +rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); -Conversely, the data underlying a `DeviceBuffer` can be copied to the host: +// Set current device resource ref +rmm::mr::cuda_async_memory_resource async_mr; +rmm::mr::set_current_device_resource_ref(async_mr); +```` +````{code-tab} python +import rmm -```python ->>> np.frombuffer(buf.tobytes()) -array([1., 2., 3.]) -``` +# Get current device resource +mr = rmm.mr.get_current_device_resource() -#### Prefetching a `DeviceBuffer` +# Set current device resource +async_mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(async_mr) +```` +````` -[CUDA Unified Memory]( - https://developer.nvidia.com/blog/unified-memory-cuda-beginners/ -), also known as managed memory, can be allocated using an -`rmm.mr.ManagedMemoryResource` explicitly, or by calling `rmm.reinitialize` -with `managed_memory=True`. +> **Warning**: The default resource must be set **before** allocating any device memory on that device. Setting or changing the resource after device allocations have been made can lead to unexpected behavior or crashes. 
-A `DeviceBuffer` backed by managed memory or other -migratable memory (such as -[HMM/ATS](https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/) -memory) may be prefetched to a specified device, for example to reduce or eliminate page faults. +### Available Resources -```python ->>> import rmm ->>> rmm.reinitialize(managed_memory=True) ->>> buf = rmm.DeviceBuffer(size=100) ->>> buf.prefetch() -``` +RMM provides several memory resource implementations: -The above example prefetches the `DeviceBuffer` memory to the current CUDA device -on the stream that the `DeviceBuffer` last used (e.g. at construction). The -destination device ID and stream are optional parameters. +| Resource | Description | Use Case | +|----------|-------------|----------| +| `CudaAsyncMemoryResource` | Uses `cudaMallocAsync` (driver-managed pool) | **Recommended default** | +| `CudaMemoryResource` | Uses `cudaMalloc`/`cudaFree` | Simple, no pooling | +| `ManagedMemoryResource` | Uses `cudaMallocManaged` (unified memory) | Datasets larger than GPU memory | +| `PoolMemoryResource` | Coalescing pool over upstream resource | Custom pool configuration | +| `ArenaMemoryResource` | Size-binned arenas | Mixed allocation sizes | -```python ->>> import rmm ->>> rmm.reinitialize(managed_memory=True) ->>> from rmm.pylibrmm.stream import Stream ->>> stream = Stream() ->>> buf = rmm.DeviceBuffer(size=100, stream=stream) ->>> buf.prefetch(device=3, stream=stream) # prefetch to device on stream. -``` +`````{tabs} +````{code-tab} c++ +#include +#include +#include +#include -`DeviceBuffer.prefetch()` is a no-op if the `DeviceBuffer` is not backed -by migratable memory. 
+// CudaMemoryResource - uses cudaMalloc/cudaFree +auto cuda_mr = rmm::mr::cuda_memory_resource{}; -`rmm.pylibrmm.stream.Stream` implements the [CUDA Stream Protocol](https://nvidia.github.io/cuda-python/cuda-core/latest/interoperability.html#cuda-stream-protocol), so it can be used with -`cuda.core.`. +// CudaAsyncMemoryResource - uses cudaMallocAsync (recommended) +auto async_mr = rmm::mr::cuda_async_memory_resource{}; -```python ->>> from cuda.core import Device ->>> import rmm.pylibrmm.stream ->>> device = Device() ->>> device.set_current() ->>> rmm_stream = rmm.pylibrmm.stream.Stream() +// ManagedMemoryResource - uses cudaMallocManaged +auto managed_mr = rmm::mr::managed_memory_resource{}; ->>> cuda_stream = device.create_stream(rmm_stream) -``` +// PoolMemoryResource - coalescing pool with 1 GiB initial size +rmm::mr::pool_memory_resource pool_mr{cuda_mr, 1ULL << 30}; +```` +````{code-tab} python +import rmm -### `MemoryResource` objects +# CudaMemoryResource - uses cudaMalloc/cudaFree +cuda_mr = rmm.mr.CudaMemoryResource() -`MemoryResource` objects are used to configure how device memory allocations are made by -RMM. +# CudaAsyncMemoryResource - uses cudaMallocAsync (recommended) +async_mr = rmm.mr.CudaAsyncMemoryResource() -By default if a `MemoryResource` is not set explicitly, RMM uses the `CudaMemoryResource`, which -uses `cudaMalloc` for allocating device memory. +# ManagedMemoryResource - uses cudaMallocManaged +managed_mr = rmm.mr.ManagedMemoryResource() -`rmm.reinitialize()` provides an easy way to initialize RMM with specific memory resource options -across multiple devices. See `help(rmm.reinitialize)` for full details. 
+# PoolMemoryResource - coalescing pool with 1 GiB initial size +pool_mr = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), + initial_pool_size=2**30 # 1 GiB +) +```` +````` -For lower-level control, the `rmm.mr.set_current_device_resource()` function can be -used to set a different MemoryResource for the current CUDA device. For -example, enabling the `ManagedMemoryResource` tells RMM to use -`cudaMallocManaged` instead of `cudaMalloc` for allocating memory: +See [Choosing a Memory Resource](choosing_memory_resources.md) for detailed guidance. -```python ->>> import rmm ->>> rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) -``` +### Per-Device Resources -> :warning: The default resource must be set for any device **before** -> allocating any device memory on that device. Setting or changing the -> resource after device allocations have been made can lead to unexpected -> behaviour or crashes. +For multi-GPU systems, each device can have its own resource: -As another example, `PoolMemoryResource` allows you to allocate a -large "pool" of device memory up-front. Subsequent allocations will -draw from this pool of already allocated memory. The example -below shows how to construct a PoolMemoryResource with an initial size -of 1 GiB and a maximum size of 4 GiB. The pool uses -`CudaMemoryResource` as its underlying ("upstream") memory resource: +`````{tabs} +````{code-tab} c++ +#include +#include -```python ->>> import rmm ->>> pool = rmm.mr.PoolMemoryResource( -... rmm.mr.CudaMemoryResource(), -... initial_pool_size="1GiB", # equivalent to initial_pool_size=2**30 -... maximum_pool_size="4GiB" -... 
) ->>> rmm.mr.set_current_device_resource(pool) -``` +// Get per-device resource ref +rmm::device_async_resource_ref mr0 = rmm::mr::get_per_device_resource_ref(rmm::cuda_device_id{0}); -Similarly, to use a pool of managed memory: +// Set per-device resource ref +rmm::mr::cuda_async_memory_resource async_mr; +rmm::mr::set_per_device_resource_ref(rmm::cuda_device_id{0}, async_mr); +```` +````{code-tab} python +import rmm -```python ->>> import rmm ->>> pool = rmm.mr.PoolMemoryResource( -... rmm.mr.ManagedMemoryResource(), -... initial_pool_size="1GiB", -... maximum_pool_size="4GiB" -... ) ->>> rmm.mr.set_current_device_resource(pool) +# Get per-device resource +mr0 = rmm.mr.get_per_device_resource(0) + +# Set per-device resource +async_mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_per_device_resource(0, async_mr) +```` +````` + +## Containers + +RMM provides RAII containers that automatically manage device memory lifetime. + +### DeviceBuffer + +Untyped, uninitialized device memory: + +`````{tabs} +````{code-tab} c++ +#include + +rmm::cuda_stream stream; + +// Allocate 1024 bytes +rmm::device_buffer buffer(1024, stream.view()); + +// Access pointer and size +void* ptr = buffer.data(); +std::size_t size = buffer.size(); + +// Resize (may reallocate) +buffer.resize(2048, stream.view()); + +// Copy construct (deep copy) +rmm::device_buffer buffer2(buffer, stream.view()); +```` +````{code-tab} python +import rmm + +# Allocate 1024 bytes +buffer = rmm.DeviceBuffer(size=1024) + +# Access pointer and size +ptr = buffer.ptr +size = buffer.size + +# Resize (may reallocate) +buffer.resize(2048) + +# Copy construct (deep copy) +buffer2 = buffer.copy() +```` +````` + +### device_uvector (C++) + +Typed, uninitialized device vector for trivially copyable types: + +```cpp +#include +#include +#include + +rmm::cuda_stream stream; + +// Allocate 100 elements +rmm::device_uvector vec(100, stream.view()); + +// Access as pointer +int* ptr = vec.data(); + +// Access as iterators 
+auto begin = vec.begin(); +auto end = vec.end(); + +// Initialize with Thrust +thrust::fill(rmm::exec_policy(stream.view()), vec.begin(), vec.end(), 42); + +// Resize +vec.resize(200, stream.view()); ``` -Other `MemoryResource`s include: +### device_scalar (C++) -* `FixedSizeMemoryResource` for allocating fixed blocks of memory -* `BinningMemoryResource` for allocating blocks within specified "bin" sizes from different memory -resources +Single typed element with host-device transfer convenience: -`MemoryResource`s are highly configurable and can be composed together in different ways. -See `help(rmm.mr)` for more information. +```cpp +#include -## Using RMM with third-party libraries +rmm::cuda_stream stream; -A number of libraries provide hooks to control their device -allocations. RMM provides implementations of these for -[CuPy](https://cupy.dev), -[numba](https://numba.readthedocs.io/en/stable/), and [PyTorch](https://pytorch.org) in the -`rmm.allocators` submodule. All these approaches configure the library -to use the _current_ RMM memory resource for device -allocations. 
+// Allocate single int +rmm::device_scalar<int> scalar(stream.view()); -### Using RMM with CuPy +// Set value from host (async on stream; the host value must outlive the copy) +int const host_value{42}; scalar.set_value_async(host_value, stream.view()); -You can configure [CuPy](https://cupy.dev/) to use RMM for memory -allocations by setting the CuPy CUDA allocator to -`rmm.allocators.cupy.rmm_cupy_allocator`: +// Get value to host (synchronizes the stream before returning) +int value = scalar.value(stream.view()); -```python ->>> from rmm.allocators.cupy import rmm_cupy_allocator ->>> import cupy ->>> cupy.cuda.set_allocator(rmm_cupy_allocator) +// Access device pointer +int* d_ptr = scalar.data(); + +// Pass to kernel +launch_kernel<<<..., stream.value()>>>(scalar.data()); ``` -### Using RMM with Numba +## Resource Adaptors -You can configure [Numba](https://numba.readthedocs.io/en/stable/) to use RMM for memory allocations using the -Numba [EMM Plugin](https://numba.readthedocs.io/en/stable/cuda/external-memory.html#setting-emm-plugin). +Adaptors wrap resources to add functionality like statistics tracking and logging. -This can be done in two ways: +### Statistics Tracking -1. Setting the environment variable `NUMBA_CUDA_MEMORY_MANAGER`: +`````{tabs} +````{code-tab} c++ +#include +#include - ```bash - $ NUMBA_CUDA_MEMORY_MANAGER=rmm.allocators.numba python (args) - ``` +rmm::mr::cuda_async_memory_resource cuda_mr; +rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; +rmm::mr::set_current_device_resource_ref(stats_mr); -2.
Using the `set_memory_manager()` function provided by Numba: +// Allocate +rmm::cuda_stream stream; +rmm::device_buffer buffer(1024, stream.view()); - ```python - >>> from numba import cuda - >>> from rmm.allocators.numba import RMMNumbaManager - >>> cuda.set_memory_manager(RMMNumbaManager) - ``` +// Get statistics +auto bytes = stats_mr.get_bytes_counter(); +std::cout << "Current bytes: " << bytes.value << "\n"; +std::cout << "Peak bytes: " << bytes.peak << "\n"; +std::cout << "Total bytes: " << bytes.total << "\n"; +```` +````{code-tab} python +import rmm -### Using RMM with PyTorch +# Wrap base resource with statistics adaptor +cuda_mr = rmm.mr.CudaAsyncMemoryResource() +stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) +rmm.mr.set_current_device_resource(stats_mr) -You can configure -[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) to use RMM -for memory allocations using their by configuring the current -allocator. +# Allocate +buffer = rmm.DeviceBuffer(size=1024) -```python ->>> from rmm.allocators.torch import rmm_torch_allocator ->>> import torch +# Get statistics +stats = stats_mr.allocation_counts +print(f"Current bytes: {stats.current_bytes}") +print(f"Peak bytes: {stats.peak_bytes}") +print(f"Total bytes: {stats.total_bytes}") +```` +````` ->>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator) -``` +### Logging -## Memory statistics and profiling +`````{tabs} +````{code-tab} c++ +#include +#include -RMM can profile memory usage and track memory statistics by using either of the following: - - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block. - - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally. 
+rmm::mr::cuda_async_memory_resource cuda_mr; +rmm::mr::logging_resource_adaptor log_mr{cuda_mr, "allocations.csv"}; +rmm::mr::set_current_device_resource_ref(log_mr); -Common to both usages is that they modify the currently active RMM memory resource. The current device resource is wrapped with a `StatisticsResourceAdaptor` which must remain the topmost resource throughout the statistics tracking: -```python ->>> import rmm ->>> import rmm.statistics - ->>> # We start with the default CUDA memory resource ->>> rmm.mr.get_current_device_resource() - - ->>> # When using statistics, we get a StatisticsResourceAdaptor with the context ->>> with rmm.statistics.statistics(): -... rmm.mr.get_current_device_resource() - - ->>> # We can also enable statistics globally ->>> rmm.statistics.enable_statistics() ->>> print(rmm.mr.get_current_device_resource()) - +// All allocations logged to CSV +rmm::device_buffer buffer(1024, rmm::cuda_stream_default); +```` +````{code-tab} python +import rmm + +# Wrap the current resource with logging adaptor +base = rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="allocations.csv") +rmm.mr.set_current_device_resource(log_mr) + +# All allocations logged to CSV +buffer = rmm.DeviceBuffer(size=1024) +```` +````` + +CSV format: `Thread,Time,Action,Pointer,Size,Stream` + +See [Logging and Profiling](logging.md) for more details. 
+ +### Composing Resources + +Adaptors can be stacked to combine functionality: + +`````{tabs} +````{code-tab} c++ +#include <rmm/mr/cuda_async_memory_resource.hpp> +#include <rmm/mr/logging_resource_adaptor.hpp> +#include <rmm/mr/per_device_resource.hpp> +#include <rmm/mr/pool_memory_resource.hpp> +#include <rmm/mr/statistics_resource_adaptor.hpp> + +// Base resource +rmm::mr::cuda_async_memory_resource cuda_mr; + +// Add pool +rmm::mr::pool_memory_resource pool_mr{cuda_mr, 1ULL << 30}; + +// Add statistics +rmm::mr::statistics_resource_adaptor stats_mr{pool_mr}; + +// Add logging +rmm::mr::logging_resource_adaptor log_mr{stats_mr, "log.csv"}; + +// Set as current +rmm::mr::set_current_device_resource_ref(log_mr); +```` +````{code-tab} python +import rmm + +# Base resource +cuda_mr = rmm.mr.CudaAsyncMemoryResource() + +# Add pool +pool_mr = rmm.mr.PoolMemoryResource(cuda_mr, initial_pool_size=2**30) + +# Add statistics +stats_mr = rmm.mr.StatisticsResourceAdaptor(pool_mr) + +# Add logging +log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv") + +# Set as current +rmm.mr.set_current_device_resource(log_mr) +```` +````` + +Order matters: each adaptor observes only the requests that pass through it, so adaptors stacked outside the pool (as above) see every allocation the application makes, whereas an adaptor placed between the pool and its upstream would see only the pool's own upstream allocations.
+ +## Library Integrations + +### Thrust (C++) + +Use `rmm::exec_policy` to make Thrust algorithms use RMM for temporary storage: + +```cpp +#include +#include +#include +#include + +rmm::cuda_stream stream; +rmm::device_uvector vec(1000, stream.view()); + +// Fill with descending values +thrust::sequence(rmm::exec_policy(stream.view()), + vec.begin(), vec.end(), vec.size() - 1, -1); + +// Sort using current device resource for temporary storage +thrust::sort(rmm::exec_policy(stream.view()), vec.begin(), vec.end()); + +// Or use a specific memory resource for temporary storage +rmm::mr::cuda_async_memory_resource custom_mr; +thrust::sort(rmm::exec_policy(stream.view(), custom_mr), vec.begin(), vec.end()); + +stream.synchronize(); ``` -With statistics enabled, you can query statistics of the current and peak bytes and number of allocations performed by the current RMM memory resource: +### CuPy (Python) + +Configure CuPy to use RMM for all device memory allocations: + ```python ->>> buf = rmm.DeviceBuffer(size=10) ->>> rmm.statistics.get_statistics() -Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total_bytes=16, total_count=1) +import rmm +import cupy as cp +from rmm.allocators.cupy import rmm_cupy_allocator + +# Configure RMM +mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(mr) + +# Set CuPy to use RMM +cp.cuda.set_allocator(rmm_cupy_allocator) + +# All CuPy arrays now use RMM +array = cp.zeros(1000) ``` -### Memory Profiler -To profile a specific block of code, first enable memory statistics by calling `rmm.statistics.enable_statistics()`. To profile a function, use `profiler` as a function decorator: -```python ->>> @rmm.statistics.profiler() -... def f(size): -... 
rmm.DeviceBuffer(size=size) ->>> f(1000) +### Numba (Python) ->>> # By default, the profiler write to rmm.statistics.default_profiler_records ->>> print(rmm.statistics.default_profiler_records.report()) -Memory Profiling -================ +Configure Numba to use RMM for device memory in CUDA JIT-compiled functions: -Legends: - ncalls - number of times the function or code block was called - memory_peak - peak memory allocated in function or code block (in bytes) - memory_total - total memory allocated in function or code block (in bytes) +```python +from numba import cuda +from rmm.allocators.numba import RMMNumbaManager +import rmm -Ordered by: memory_peak +# Configure RMM +mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(mr) -ncalls memory_peak memory_total filename:lineno(function) - 1 1,008 1,008 :1(f) +# Set Numba to use RMM +cuda.set_memory_manager(RMMNumbaManager) ``` -To profile a code block, use `profiler` as a context manager: -```python ->>> with rmm.statistics.profiler(name="my code block"): -... rmm.DeviceBuffer(size=20) ->>> print(rmm.statistics.default_profiler_records.report()) -Memory Profiling -================ - -Legends: - ncalls - number of times the function or code block was called - memory_peak - peak memory allocated in function or code block (in bytes) - memory_total - total memory allocated in function or code block (in bytes) - -Ordered by: memory_peak - -ncalls memory_peak memory_total filename:lineno(function) - 1 1,008 1,008 :1(f) - 1 32 32 my code block +Or use the environment variable: + +```bash +NUMBA_CUDA_MEMORY_MANAGER=rmm.allocators.numba python script.py ``` -The `profiler` supports nesting: +### PyTorch (Python) + +Configure PyTorch to use RMM for CUDA tensor allocations: + ```python ->>> with rmm.statistics.profiler(name="outer"): -... buf1 = rmm.DeviceBuffer(size=10) -... with rmm.statistics.profiler(name="inner"): -... 
buf2 = rmm.DeviceBuffer(size=10) ->>> print(rmm.statistics.default_profiler_records.report()) -Memory Profiling -================ - -Legends: - ncalls - number of times the function or code block was called - memory_peak - peak memory allocated in function or code block (in bytes) - memory_total - total memory allocated in function or code block (in bytes) - -Ordered by: memory_peak - -ncalls memory_peak memory_total filename:lineno(function) - 1 1,008 1,008 :1(f) - 1 32 32 my code block - 1 32 32 outer - 1 16 16 inner +import rmm +import torch +from rmm.allocators.torch import rmm_torch_allocator + +# Configure RMM +mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(mr) + +# Set PyTorch to use RMM +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + +# All PyTorch tensors now use RMM +tensor = torch.zeros(1000, device='cuda') ``` + +## Multi-Device Usage + +`````{tabs} +````{code-tab} c++ +#include <rmm/cuda_stream.hpp> +#include <rmm/device_buffer.hpp> +#include <rmm/mr/cuda_async_memory_resource.hpp> +#include <memory> +#include <vector> + +int num_devices; +cudaGetDeviceCount(&num_devices); + +// Store resources to maintain lifetime +std::vector<std::unique_ptr<rmm::mr::cuda_async_memory_resource>> resources; + +for (int i = 0; i < num_devices; ++i) { + // Set device BEFORE creating resource + cudaSetDevice(i); + + // Create resource for this device + resources.push_back(std::make_unique<rmm::mr::cuda_async_memory_resource>()); + + // Set as per-device resource ref + rmm::mr::set_per_device_resource_ref(rmm::cuda_device_id{i}, *resources.back()); +} + +// Use device 0 +cudaSetDevice(0); +rmm::cuda_stream stream; +rmm::device_buffer buffer(1024, stream.view()); // Uses device 0's resource +```` +````{code-tab} python +import rmm +from cuda import cuda + +num_devices = cuda.cuDeviceGetCount()[1] + +# Store resources to maintain lifetime +resources = [] + +for device_id in range(num_devices): + # Create resource for this device + mr = rmm.mr.CudaAsyncMemoryResource() + resources.append(mr) + + # Set as per-device resource + rmm.mr.set_per_device_resource(device_id, mr) + +# Use device 0 +buffer =
rmm.DeviceBuffer(size=1024) # Uses device 0's resource +```` +````` + +## Best Practices + +1. **Use `CudaAsyncMemoryResource` by default** - best performance for most workloads + +2. **Set resources before any allocations** - changing resources after allocation can cause crashes + +3. **Maintain resource lifetime** - resources must outlive any allocations from them + +4. **Use RAII containers** - prefer `device_buffer` over raw pointers + +5. **Profile and measure** - use statistics and logging to understand allocation patterns + +## See Also + +- [Choosing a Memory Resource](choosing_memory_resources.md) +- [Stream-Ordered Allocation](stream_ordered_allocation.md) +- [Managed Memory and Prefetching](managed_memory.md) +- [Pool Allocators](pool_allocators.md) +- [Logging and Profiling](logging.md) diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md new file mode 100644 index 000000000..bafb4d1e6 --- /dev/null +++ b/docs/user_guide/index.md @@ -0,0 +1,14 @@ +# User Guide + +```{toctree} +:maxdepth: 2 + +introduction +installation +guide +choosing_memory_resources +stream_ordered_allocation +managed_memory +pool_allocators +logging +``` diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md new file mode 100644 index 000000000..145aa8156 --- /dev/null +++ b/docs/user_guide/installation.md @@ -0,0 +1,181 @@ +# Installation + +This guide covers installing RMM. For general RAPIDS installation instructions, which includes RMM, see the [RAPIDS Installation Guide](https://docs.rapids.ai/install/). + +## System Requirements + +- **Operating System**: Linux or Windows Subsystem for Linux 2 (WSL2) +- **Python**: 3.10, 3.11, 3.12, or 3.13 +- **CUDA**: 12.2 or later +- **GPU**: Volta architecture or newer (Compute Capability 7.0+) + +## Installing with conda + +The easiest way to install RMM and all of its dependencies is using conda. You can get a minimal conda installation with [miniforge](https://conda-forge.org/download/). 
+ +### Stable Release + +Install the latest stable release: + +```bash +conda install -c rapidsai -c conda-forge rmm cuda-version=13 +``` + +### Nightly Builds + +For the latest development version, install from the nightly channel: + +```bash +conda install -c rapidsai-nightly -c conda-forge rmm cuda-version=13 +``` + +Nightly builds are created from the `main` branch and may contain unreleased features or bug fixes. + +## Installing with pip + +RMM can also be installed using pip, but requires that CUDA is already installed on your system. + +```bash +pip install rmm-cu13 # For CUDA 13 +# or +pip install rmm-cu12 # For CUDA 12 +``` + +## Building from Source + +Building from source gives you the latest features and allows you to customize the build. + +### Development Environment + +For a complete development environment, you can create an environment with all dependencies: + +```bash +# Clone the repository +git clone https://github.com/rapidsai/rmm.git +cd rmm + +# Create environment for CUDA 13 +conda env create --name rmm_env --file conda/environments/all_cuda-130_arch-$(uname -m).yaml + +# Activate the environment +conda activate rmm_env +``` + +### Prerequisites + +- **GCC**: 13 or later +- **nvcc**: CUDA 12.2 or later +- **CMake**: 3.30.4 or later + +### Build Steps + +#### Clone the Repository + +```bash +git clone https://github.com/rapidsai/rmm.git +cd rmm +``` + +#### Create Conda Development Environment + +```bash +# For CUDA 13 +conda env create --name rmm_dev --file conda/environments/all_cuda-130_arch-$(uname -m).yaml + +# Activate the environment +conda activate rmm_dev +``` + +#### Build Using build.sh + +RMM provides a convenience script `build.sh` that handles the build process. +The `build.sh` script is meant to be used with the developer conda environment above, which installs all prerequisites. 
+ +```bash +# Show help +./build.sh -h + +# Build librmm without installing +./build.sh -n librmm + +# Build rmm Python package without installing +./build.sh -n rmm + +# Build and install both +./build.sh librmm rmm +``` + +## Using RMM in a Downstream CMake Project + +To use RMM in your own CMake project, add the following to your `CMakeLists.txt`: + +```cmake +find_package(rmm REQUIRED) + +# Link your target with RMM (header-only, pulls in dependencies) +target_link_libraries(your_target PRIVATE rmm::rmm) +``` + +If RMM is not installed in a default location, specify its path: + +```bash +cmake .. -Drmm_ROOT=/path/to/rmm/install +``` + +### Using CPM to Fetch RMM + +You can use CPM to fetch RMM as a dependency: + +```cmake +include(CPM) + +CPMAddPackage( + NAME rmm + VERSION 26.02 + GITHUB_REPOSITORY rapidsai/rmm + GIT_TAG main + SOURCE_SUBDIR cpp +) + +target_link_libraries(your_target PRIVATE rmm::rmm) +``` + +## Testing Installation + +### C++ + +Create a test file `test_rmm.cpp`: + +```cpp +#include +#include +#include + +int main() { + auto mr = rmm::mr::cuda_memory_resource{}; + rmm::mr::set_current_device_resource(&mr); + + rmm::device_buffer buf(100); + std::cout << "Allocated " << buf.size() << " bytes\n"; + + return 0; +} +``` + +Compile and run: + +```bash +nvcc -std=c++17 -I/path/to/rmm/include test_rmm.cpp -o test_rmm +./test_rmm +``` + +### Python + +```python +import rmm +print(rmm.__version__) + +# Quick test +buffer = rmm.DeviceBuffer(size=100) +print(f"Allocated {buffer.size} bytes") +``` diff --git a/docs/user_guide/introduction.md b/docs/user_guide/introduction.md new file mode 100644 index 000000000..936137620 --- /dev/null +++ b/docs/user_guide/introduction.md @@ -0,0 +1,152 @@ +# Introduction to RMM + +**RMM (RAPIDS Memory Manager)** is a library for allocating and managing GPU memory in C++ and Python. It provides a flexible interface for customizing how device memory is allocated, along with efficient implementations and containers. 
+ +## Purpose + +Achieving optimal performance in GPU-accelerated applications frequently requires customizing memory allocation strategies. For example: + +- Using **memory pools** to reduce the overhead of dynamic allocation +- Using **managed memory** to work with datasets larger than GPU memory +- Using **pinned host memory** for faster asynchronous CPU ↔ GPU transfers +- Customizing allocation strategies for specific workload patterns + +RMM provides a unified interface, called a **memory resource**, which is a building block for GPU-accelerated applications. + +Memory resources provide a **minimal-overhead abstraction** over memory allocation that is **pluggable at runtime**, making it possible to debug, measure performance, and optimize a CUDA application without recompiling. +Memory resources aim to serve the needs of a wide range of applications, from data science and machine learning to high-performance simulation. + +RMM's memory resources leverage CUDA features like **stream-ordered** (asynchronous) pipeline parallelism, **managed** memory (also known as unified virtual memory, UVM), and **pinned** memory, making it easier to write complex workflows that optimally use both device and host memory. +The integrations provided in RMM allow memory resources to benefit memory management across libraries frequently used together, such as **PyTorch** and **RAPIDS**. + +## Key Features + +RMM is built around three main concepts. + +### 1. Memory Resources + +Memory resources provide a common abstraction for device memory allocation. +The API of RMM's memory resources is based on the CCCL memory resource design to facilitate interoperability. + +The choice of resource determines the underlying type of memory and thus its accessibility from host or device. +For example, the `cuda_async_memory_resource` uses a pool of memory managed by the CUDA driver. 
+This resource is recommended for most applications, because of its performance and support for asynchronous (stream-ordered) allocations. See [Stream-Ordered Allocation](stream_ordered_allocation.md) for details. +As another example, the `managed_memory_resource` provides unified memory for CPU+GPU, and is recommended for applications exceeding the available GPU memory. + +See [Choosing a Memory Resource](choosing_memory_resources.md) for guidance on the available memory resources, performance considerations, and how they fit into efficient CUDA application design strategies. +[NVIDIA Nsight™ Systems](https://developer.nvidia.com/nsight-systems) can be used to profile memory resource performance. + +### 2. Resource Adaptors + +Resource adaptors wrap and add functionality to existing resources. +For example, the `statistics_resource_adaptor` can be used to track allocation statistics. +The `logging_resource_adaptor` logs allocations to a CSV file. +Adaptors are composable - wrap multiple adaptors for combined functionality. + +### 3. Containers + +RMM provides [RAII](https://en.cppreference.com/w/cpp/language/raii.html) container classes that manage memory lifetime. +Using these containers avoids common problems with performing raw allocation such as memory leaks or improper stream ordering. +- `device_buffer`: Untyped device memory +- `device_uvector`: Typed, uninitialized vector of device memory (trivially copyable types) +- `device_scalar`: Single typed element + +All containers use stream-ordered allocation and work with any memory resource.
+ +## Basic Example + +### C++ + +```cpp +#include +#include + +// Use CUDA async memory pool +auto async_mr = rmm::mr::cuda_async_memory_resource{}; +rmm::mr::set_current_device_resource(&async_mr); + +// Allocate device memory asynchronously +rmm::cuda_stream stream; +rmm::device_buffer buffer(1024, stream.view()); +stream.synchronize(); +``` + +### Python + +```python +import rmm +import cupy as cp + +# Create a CUDA async memory resource +mr = rmm.mr.CudaAsyncMemoryResource() + +# Set the current device memory resource +rmm.mr.set_current_device_resource(mr) + +# Allocating device memory uses the current device resource by default +buffer = rmm.DeviceBuffer(size=1024) + +# Use the current device resource with CuPy +cp.cuda.set_allocator(rmm.allocators.cupy.rmm_cupy_allocator) +array = cp.zeros(1000) # Now uses RMM for allocation +``` + +## Integration with GPU Libraries + +RMM integrates seamlessly with popular GPU libraries: + +### PyTorch + +Set the PyTorch allocator to use the current device resource: + +```python +import rmm +import torch +from rmm.allocators.torch import rmm_torch_allocator + +mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(mr) +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) +``` + +### CuPy + +Set the CuPy allocator to use the current device resource: + +```python +import rmm +import cupy +from rmm.allocators.cupy import rmm_cupy_allocator + +mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(mr) +cupy.cuda.set_allocator(rmm_cupy_allocator) +``` + +### Numba + +When launching a script: +```bash +NUMBA_CUDA_MEMORY_MANAGER=rmm.allocators.numba python script.py +``` + +Or from Python: + +```python +import rmm +from numba import cuda +from rmm.allocators.numba import RMMNumbaManager + +mr = rmm.mr.CudaAsyncMemoryResource() +rmm.mr.set_current_device_resource(mr) +cuda.set_memory_manager(RMMNumbaManager) +``` + +## Resources and Support + +- [RMM GitHub 
Repository](https://github.com/rapidsai/rmm): Source code and development +- [RMM Issue Tracker](https://github.com/rapidsai/rmm/issues): Report bugs or request features +- [RAPIDS Documentation](https://docs.rapids.ai): RAPIDS ecosystem docs +- [RAPIDS Installation Guide](https://docs.rapids.ai/install): Installation instructions +- [Developer Blog: Fast, Flexible Allocation](https://developer.nvidia.com/blog/fast-flexible-allocation-for-cuda-with-rapids-memory-manager/): RMM design walkthrough +- [Developer Blog: Stream-Ordered Allocation](https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-1/): Deep dive into stream-ordered semantics diff --git a/docs/user_guide/logging.md b/docs/user_guide/logging.md new file mode 100644 index 000000000..e097f34cb --- /dev/null +++ b/docs/user_guide/logging.md @@ -0,0 +1,575 @@ +# Logging and Profiling + +RMM provides two types of logging: **memory event logging** for tracking allocations and deallocations, and **debug logging** for troubleshooting internal behavior. + +## Memory Event Logging + +Memory event logging writes details of every allocation and deallocation to a CSV file. 
This is useful for: +- Debugging memory issues +- Understanding allocation patterns +- Profiling memory usage +- Replaying workloads for benchmarking + +### Python: Using Memory Event Logging + +Enable logging by wrapping your memory resource with `LoggingResourceAdaptor`: + +```python +import rmm + +# Wrap the current resource with logging adaptor +base = rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="memory_log.csv") +rmm.mr.set_current_device_resource(log_mr) + +# Allocations are now logged +buffer1 = rmm.DeviceBuffer(size=1024) +buffer2 = rmm.DeviceBuffer(size=2048) + +# All allocations/deallocations written to memory_log.csv +``` + +If `log_file_name` is not provided, the environment variable `RMM_LOG_FILE` is used: + +```bash +export RMM_LOG_FILE="allocations.csv" +python script.py +``` + +### C++: Using logging_resource_adaptor + +Wrap any memory resource with `logging_resource_adaptor`: + +```cpp +#include +#include + +int main() { + // Create upstream resource + auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; + + // Wrap with logging adaptor + auto log_mr = rmm::mr::logging_resource_adaptor{&cuda_mr, "memory_log.csv"}; + + // Set as current resource + rmm::mr::set_current_device_resource(&log_mr); + + // All allocations logged to CSV + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view()); + + return 0; +} +``` + +If filename is not provided, `RMM_LOG_FILE` environment variable is checked: + +```bash +export RMM_LOG_FILE="allocations.csv" +./my_app +``` + +### CSV Log Format + +Each row represents an allocation or deallocation with the following columns: + +``` +Thread,Time,Action,Pointer,Size,Stream +``` + +Example: +``` +Thread,Time,Action,Pointer,Size,Stream +140573312345856,1634567890.123456,allocate,0x7f8a40000000,1024,0x7f8a38001020 +140573312345856,1634567890.234567,allocate,0x7f8a40000400,2048,0x7f8a38001020 
+140573312345856,1634567890.345678,deallocate,0x7f8a40000000,1024,0x7f8a38001020 +``` + +- **Thread**: Thread ID performing the operation +- **Time**: Timestamp (seconds since epoch) +- **Action**: `allocate` or `deallocate` +- **Pointer**: Memory address +- **Size**: Allocation size in bytes +- **Stream**: CUDA stream pointer + +### Analyzing Logs + +You can parse and analyze logs with Python: + +```python +import pandas as pd + +# Read log file +df = pd.read_csv("memory_log.csv") + +# Total bytes allocated +total_allocated = df[df['Action'] == 'allocate']['Size'].sum() +print(f"Total allocated: {total_allocated:,} bytes") + +# Allocation size distribution +print(df[df['Action'] == 'allocate']['Size'].describe()) + +# Peak memory usage (simple analysis) +df['Delta'] = df.apply( + lambda row: row['Size'] if row['Action'] == 'allocate' else -row['Size'], + axis=1 +) +df['Cumulative'] = df['Delta'].cumsum() +peak = df['Cumulative'].max() +print(f"Peak usage: {peak:,} bytes") +``` + +### Replay Benchmark + +When building RMM from source, logs can be used with `REPLAY_BENCHMARK`: + +```bash +cd build/gbenchmarks +./REPLAY_BENCHMARK --log_file=memory_log.csv +``` + +This replays the allocation pattern from the log, useful for: +- Benchmarking different memory resources +- Testing allocator implementations +- Profiling allocation overhead + +## Memory Statistics + +RMM provides statistics tracking for allocations using `statistics_resource_adaptor`. 
+ +### Python: Enabling Statistics + +```python +import rmm + +# Enable statistics globally +rmm.statistics.enable_statistics() + +# Or use context manager for specific code blocks +with rmm.statistics.statistics(): + buffer = rmm.DeviceBuffer(size=1024) + + # Get current statistics + stats = rmm.statistics.get_statistics() + print(f"Current bytes: {stats.current_bytes}") + print(f"Peak bytes: {stats.peak_bytes}") + print(f"Total allocations: {stats.total_count}") +``` + +Available statistics: + +```python +class Statistics: + current_bytes: int # Currently allocated bytes + current_count: int # Number of active allocations + peak_bytes: int # Peak bytes allocated + peak_count: int # Peak number of allocations + total_bytes: int # Total bytes ever allocated + total_count: int # Total number of allocations +``` + +### C++: Using statistics_resource_adaptor + +```cpp +#include <rmm/mr/cuda_async_memory_resource.hpp> +#include <rmm/mr/statistics_resource_adaptor.hpp> +#include <iostream> + +int main() { + auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; + auto stats_mr = rmm::mr::statistics_resource_adaptor{&cuda_mr}; + rmm::mr::set_current_device_resource(&stats_mr); + + // Allocate + rmm::cuda_stream stream; + rmm::device_buffer buffer1(1024, stream.view()); + rmm::device_buffer buffer2(2048, stream.view()); + + // Get statistics + auto bytes = stats_mr.get_bytes_counter(); + std::cout << "Allocated bytes: " << bytes.value << "\n"; + std::cout << "Allocation count: " << stats_mr.get_allocations_counter().value << "\n"; + + return 0; +} +``` + +### Tracking Memory Growth + +Monitor memory usage over time: + +```python +import rmm +import time + +rmm.statistics.enable_statistics() + +def checkpoint(label): + stats = rmm.statistics.get_statistics() + print(f"{label}:") + print(f" Current: {stats.current_bytes:,} bytes ({stats.current_count} allocations)") + print(f" Peak: {stats.peak_bytes:,} bytes") + +checkpoint("Start") + +# Allocate +buffers = [rmm.DeviceBuffer(size=1024*1024) for _ in range(10)] +checkpoint("After 10x1MB allocations") + +# Free some +buffers =
buffers[:5] +checkpoint("After freeing 5") + +# Allocate more +buffers.extend([rmm.DeviceBuffer(size=2*1024*1024) for _ in range(5)]) +checkpoint("After 5x2MB allocations") +``` + +## Memory Profiling + +The memory profiler tracks allocations by function/code block. + +### Python: Using the Profiler + +#### Profiling Functions + +```python +import rmm + +# Enable statistics first +rmm.statistics.enable_statistics() + +# Profile a function +@rmm.statistics.profiler() +def process_data(size): + buffer = rmm.DeviceBuffer(size=size) + # ... processing ... + return buffer + +# Run function +process_data(1000000) + +# View report +print(rmm.statistics.default_profiler_records.report()) +``` + +Output: +``` +Memory Profiling +================ + +Legends: + ncalls - number of times the function or code block was called + memory_peak - peak memory allocated in function or code block (in bytes) + memory_total - total memory allocated in function or code block (in bytes) + +Ordered by: memory_peak + +ncalls memory_peak memory_total filename:lineno(function) + 1 1,000,016 1,000,016 script.py:5(process_data) +``` + +#### Profiling Code Blocks + +```python +import rmm + +rmm.statistics.enable_statistics() + +# Profile specific code blocks +with rmm.statistics.profiler(name="data loading"): + data = rmm.DeviceBuffer(size=1000000) + +with rmm.statistics.profiler(name="processing"): + buffer1 = rmm.DeviceBuffer(size=500000) + buffer2 = rmm.DeviceBuffer(size=500000) + +# View report +print(rmm.statistics.default_profiler_records.report()) +``` + +Output: +``` +ncalls memory_peak memory_total filename:lineno(function) + 1 1,000,016 1,000,016 data loading + 1 1,000,032 1,000,032 processing +``` + +#### Nested Profiling + +```python +import rmm + +rmm.statistics.enable_statistics() + +with rmm.statistics.profiler(name="outer"): + buffer1 = rmm.DeviceBuffer(size=1000) + + with rmm.statistics.profiler(name="inner"): + buffer2 = rmm.DeviceBuffer(size=2000) + + buffer3 = 
rmm.DeviceBuffer(size=500) + +print(rmm.statistics.default_profiler_records.report()) +``` + +Output shows both nested and total allocations: +``` +ncalls memory_peak memory_total filename:lineno(function) + 1 3,520 3,520 outer + 1 2,000 2,000 inner +``` + +### Custom Profiler Records + +Use custom profiler records for separate tracking: + +```python +import rmm + +rmm.statistics.enable_statistics() + +# Create custom profiler records +custom_records = rmm.statistics.ProfilerRecords() + +# Use with context manager +with rmm.statistics.profiler(name="my operation", records=custom_records): + buffer = rmm.DeviceBuffer(size=1024) + +# View only custom records +print(custom_records.report()) +``` + +## Debug Logging + +RMM uses [rapids-logger](https://github.com/rapidsai/rapids-logger) for debug output. + +### Enabling Debug Logging + +Debug logs show internal RMM behavior, errors, and warnings. + +#### Output Location + +By default, logs go to stderr. Set `RMM_DEBUG_LOG_FILE` to write to a file: + +```bash +export RMM_DEBUG_LOG_FILE=/path/to/rmm_debug.log +``` + +#### Log Levels + +Set at **compile time** with CMake: + +```bash +cmake ..
-DRMM_LOGGING_LEVEL=DEBUG +``` + +Available levels (increasing verbosity): +- `OFF` - No logging +- `CRITICAL` - Only critical errors +- `ERROR` - Errors +- `WARN` - Warnings and errors +- `INFO` - Informational messages (default) +- `DEBUG` - Detailed debug info +- `TRACE` - Very verbose tracing + +#### Runtime Log Level (Python) + +Even with verbose logging compiled in, you must enable it at runtime: + +```python +import rmm + +# Enable all logging down to TRACE level +rmm.set_logging_level("trace") + +# Now you'll see TRACE and DEBUG messages +``` + +Available Python levels: `"trace"`, `"debug"`, `"info"`, `"warn"`, `"error"`, `"critical"`, `"off"` + +#### Runtime Log Level (C++) + +```cpp +#include + +int main() { + // Enable all logging down to TRACE level + rmm::default_logger().set_level(rapids_logger::level_enum::trace); + + // Your code here + + return 0; +} +``` + +### What Gets Logged + +Debug logging shows: +- Memory resource initialization +- Allocation failures and errors +- Pool growth and shrinkage +- Stream synchronization events +- Multi-device operations +- Internal state changes + +Example debug output: +``` +[2024-01-15 10:30:45.123] [info] Initializing cuda_async_memory_resource +[2024-01-15 10:30:45.234] [debug] pool_memory_resource: allocated 1 GiB from upstream +[2024-01-15 10:30:45.345] [warn] Allocation of 10 GiB failed, pool exhausted +[2024-01-15 10:30:45.456] [debug] Growing pool by 2 GiB +``` + +## Combining Logging Features + +Use multiple logging features together: + +```python +import rmm + +# Enable memory event logging by wrapping with adaptor +base = rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="events.csv") +rmm.mr.set_current_device_resource(log_mr) + +# Enable statistics and profiling +rmm.statistics.enable_statistics() + +# Set debug log level +rmm.set_logging_level("debug") + +# Now all logging is active +@rmm.statistics.profiler() +def my_function(): + buffer = 
rmm.DeviceBuffer(size=1024) + return buffer + +my_function() + +# Get statistics +stats = rmm.statistics.get_statistics() +print(f"Peak bytes: {stats.peak_bytes}") + +# View profiler report +print(rmm.statistics.default_profiler_records.report()) +``` + +C++ equivalent: + +```cpp +#include +#include +#include +#include + +int main() { + // Set debug log level + rmm::default_logger().set_level(rapids_logger::level_enum::debug); + + // Build resource stack + auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; + auto stats_mr = rmm::mr::statistics_resource_adaptor{&cuda_mr}; + auto log_mr = rmm::mr::logging_resource_adaptor{&stats_mr, "events.csv"}; + + rmm::mr::set_current_device_resource(&log_mr); + + // Now all logging is active + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view()); + + // Get statistics + auto stats = stats_mr.get_statistics(); + std::cout << "Peak bytes: " << stats.peak_bytes << "\n"; + + return 0; +} +``` + +## Use Cases + +### Debugging OOM Errors + +```python +import rmm + +# Enable detailed logging +base = rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="oom_debug.csv") +rmm.mr.set_current_device_resource(log_mr) +rmm.set_logging_level("debug") +rmm.statistics.enable_statistics() + +# Run problematic code +try: + large_buffer = rmm.DeviceBuffer(size=100 * 2**30) # 100 GiB +except MemoryError as e: + stats = rmm.statistics.get_statistics() + print(f"Peak before OOM: {stats.peak_bytes / 2**30:.2f} GiB") + print(f"Check oom_debug.csv for allocation history") + raise +``` + +### Profiling Memory in Data Pipeline + +```python +import rmm + +rmm.statistics.enable_statistics() + +@rmm.statistics.profiler() +def load_data(): + return rmm.DeviceBuffer(size=1000000) + +@rmm.statistics.profiler() +def process_data(buffer): + temp = rmm.DeviceBuffer(size=2000000) + result = rmm.DeviceBuffer(size=500000) + return result + +@rmm.statistics.profiler() +def save_data(buffer): + pass + +# 
Run pipeline +data = load_data() +result = process_data(data) +save_data(result) + +# Identify memory hotspots +print(rmm.statistics.default_profiler_records.report()) +``` + +### Benchmarking Memory Resources + +```python +import rmm +import time + +def benchmark_allocations(mr_name, mr): + rmm.mr.set_current_device_resource(mr) + + start = time.time() + buffers = [] + for _ in range(1000): + buffers.append(rmm.DeviceBuffer(size=1024)) + end = time.time() + + print(f"{mr_name}: {(end - start) * 1000:.2f} ms for 1000 allocations") + +# Compare resources +benchmark_allocations("CudaMemoryResource", rmm.mr.CudaMemoryResource()) +benchmark_allocations("CudaAsyncMemoryResource", rmm.mr.CudaAsyncMemoryResource()) +pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource(), initial_pool_size=2**20) +benchmark_allocations("PoolMemoryResource", pool) +``` + +## Best Practices + +1. **Use event logging for debugging** - CSV logs help understand allocation patterns +2. **Enable statistics for profiling** - Track memory usage over time +3. **Use profiler for hotspot analysis** - Identify which functions allocate most memory +4. **Set appropriate debug level** - Use `INFO` normally, `DEBUG`/`TRACE` when troubleshooting +5. **Disable logging in production** - Logging has overhead; only enable when needed +6. **Analyze logs with tools** - Use pandas, REPLAY_BENCHMARK, or custom scripts +7. **Combine with NVIDIA tools** - Use [NVIDIA Nsight™ Systems](https://developer.nvidia.com/nsight-systems) alongside RMM logging for a complete picture diff --git a/docs/user_guide/managed_memory.md b/docs/user_guide/managed_memory.md new file mode 100644 index 000000000..24a819042 --- /dev/null +++ b/docs/user_guide/managed_memory.md @@ -0,0 +1,331 @@ +# Managed Memory and Prefetching + +CUDA Managed Memory (also called Unified Memory) allows memory to be accessed from both CPU and GPU, with automatic page migration managed by the CUDA driver. 
RMM provides `ManagedMemoryResource` to leverage this capability. + +## What is Managed Memory? + +Managed memory creates a single address space accessible from both CPU and GPU: + +- Allocations can be accessed using the same pointer from host or device code +- The CUDA driver automatically migrates pages between CPU and GPU as needed +- Enables working with datasets **larger than GPU memory** + +## When to Use Managed Memory + +Managed memory is ideal for: + +1. **Datasets larger than GPU memory**: When your data doesn't fit in VRAM +2. **Prototyping**: Simplifies development by removing explicit memory transfers +3. **CPU-GPU interoperability**: When you need to access the same data from both host and device + +**Important**: Managed memory has performance implications. Always combine with prefetching for production workloads. + +## Basic Usage + +### Python + +```python +import rmm + +# Use managed memory as the default resource +rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) + +# Allocations now use managed memory +buffer = rmm.DeviceBuffer(size=1000000) +``` + +### C++ + +```cpp +#include + +auto managed_mr = rmm::mr::managed_memory_resource{}; +rmm::mr::set_current_device_resource(&managed_mr); + +// Allocations use managed memory +rmm::device_buffer buffer(1000000); +``` + +## Performance Considerations + +### Page Faults and Migration + +When the GPU accesses managed memory that is not resident on the GPU, a **page fault** occurs: + +1. GPU execution pauses +2. The driver migrates the page from CPU to GPU +3. GPU execution resumes + +These page faults can significantly impact performance, especially for: +- First-touch access patterns +- Random memory access +- Large datasets that don't fit in GPU memory + +### The Prefetching Solution + +**Prefetching** explicitly migrates data to the GPU before it's accessed, eliminating page faults. + +## Prefetching Strategies + +There are two main strategies for prefetching: + +### 1. 
Prefetch on Allocate (Eager Prefetching) + +Automatically prefetch memory to the GPU when it's allocated. This is useful when you know the data will be used on the GPU immediately after allocation. + +**Implementation: Use `PrefetchResourceAdaptor`** + +```python +import rmm + +# Wrap managed memory with prefetch adaptor +base = rmm.mr.ManagedMemoryResource() +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) +rmm.mr.set_current_device_resource(prefetch_mr) + +# Every allocation is automatically prefetched to the GPU +buffer = rmm.DeviceBuffer(size=1000000) +# Buffer is already on the GPU, no page faults on first access +``` + +**With a pool:** + +```python +import rmm + +# Combine managed memory, pool, and prefetching +base = rmm.mr.ManagedMemoryResource() +pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) +rmm.mr.set_current_device_resource(prefetch_mr) +``` + +**When to use:** +- Allocations are immediately used on the GPU +- You want automatic prefetching without code changes + +### 2. Prefetch on Access (Lazy Prefetching) + +Explicitly prefetch data just before it's used in a kernel. This gives finer control and can optimize for specific access patterns. + +**Implementation: Manual prefetch calls** + +```python +import rmm + +rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) + +# Allocate managed memory (not prefetched yet) +buffer = rmm.DeviceBuffer(size=1000000) + +# ... later, just before using on GPU ... +stream = rmm.cuda_stream() +buffer.prefetch(device=0, stream=stream) # Prefetch to device 0 + +# Launch kernel on the same stream +# ... kernel will not page fault ... 
+``` + +**In C++:** + +```cpp +#include +#include +#include + +auto managed_mr = rmm::mr::managed_memory_resource{}; +rmm::mr::set_current_device_resource(&managed_mr); + +rmm::cuda_stream stream; +rmm::device_buffer buffer(1000000, stream.view()); + +// Prefetch before using +rmm::prefetch(buffer.data(), buffer.size(), + rmm::get_current_cuda_device(), stream.view()); + +// Launch kernel +launch_kernel<<>>(buffer.data()); +``` + +**When to use:** +- You need fine-grained control over when data is prefetched +- Access patterns are complex or dynamic +- You're optimizing for specific workload characteristics + +## Practical Example: PyTorch with Larger-Than-VRAM Models + +Here's how to use managed memory with PyTorch to work with models or data larger than GPU memory: + +```python +import rmm +import torch +from rmm.allocators.torch import rmm_torch_allocator + +# Use managed memory with prefetching +base = rmm.mr.ManagedMemoryResource() +pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30, maximum_pool_size=2**34) +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) +rmm.mr.set_current_device_resource(prefetch_mr) + +# Configure PyTorch to use RMM +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + +# Now you can work with larger-than-VRAM data +# Example: Large tensor that doesn't fit in VRAM +large_tensor = torch.randn(100000, 100000, device='cuda') # ~40 GB + +# Operations will automatically page as needed +result = large_tensor @ large_tensor.T +``` + +**What happens:** +1. RMM allocates managed memory for tensors +2. The prefetch adaptor prefetches to GPU on allocation +3. If memory exceeds GPU capacity, pages migrate between CPU and GPU +4. Performance is better than without prefetching + +## Prefetching Best Practices + +### 1. 
Prefetch Adaptor Should Be Outermost + +When composing memory resources, always make the prefetch adaptor the outermost layer: + +```python +# Correct: Prefetch is outermost +base = rmm.mr.ManagedMemoryResource() +pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) +stats = rmm.mr.StatisticsResourceAdaptor(pool) +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(stats) # Outermost +rmm.mr.set_current_device_resource(prefetch_mr) + +# Incorrect: Prefetch is not outermost +base = rmm.mr.ManagedMemoryResource() +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) +pool = rmm.mr.PoolMemoryResource(prefetch_mr, initial_pool_size=2**30) # Wrong! +``` + +### 2. Prefetch on the Correct Stream + +When manually prefetching, use the same stream as the subsequent kernel: + +```python +stream = rmm.cuda_stream() + +# Prefetch on stream +buffer.prefetch(device=0, stream=stream) + +# Use on the same stream +with stream: + # ... operations using buffer ... +``` + +### 3. Prefetch Size Considerations + +Prefetching is most effective when: +- The prefetch size is large enough to amortize the migration cost +- Data is used shortly after prefetching +- Access patterns are predictable + +### 4. Profile and Measure + +Always profile to verify that prefetching improves performance: + +```python +import rmm +import time + +# Without prefetching +rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) +buffer = rmm.DeviceBuffer(size=10**9) +start = time.time() +# ... run workload ... +print(f"Without prefetch: {time.time() - start:.2f}s") + +# With prefetching +base = rmm.mr.ManagedMemoryResource() +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) +rmm.mr.set_current_device_resource(prefetch_mr) +buffer = rmm.DeviceBuffer(size=10**9) +start = time.time() +# ... run workload ... 
+print(f"With prefetch: {time.time() - start:.2f}s") +``` + +Use [NVIDIA Nsight™ Systems](https://developer.nvidia.com/nsight-systems) to visualize page faults and data migration: + +```bash +nsys profile -o output python your_script.py +``` + +## Managed Memory Limitations + +### 1. Not Stream-Ordered + +`ManagedMemoryResource` uses `cudaMallocManaged`, which is **synchronous**. Allocations block until complete, unlike stream-ordered resources. + +For better performance in multi-stream applications, use `CudaAsyncMemoryResource` instead. + +### 2. Performance Overhead + +Even with prefetching, managed memory has overhead compared to explicit memory management: +- Page fault handling +- Driver page migration +- Potential CPU-GPU transfer latency + +For performance-critical code with data that fits in GPU memory, prefer `CudaAsyncMemoryResource`. + +### 3. PCIe Bandwidth Limitation + +If your workload constantly migrates data between CPU and GPU, you're limited by PCIe bandwidth: +- PCIe Gen3 x16: ~12 GB/s +- PCIe Gen4 x16: ~24 GB/s +- PCIe Gen5 x16: ~48 GB/s + +For such workloads, consider: +- Algorithmic changes to reduce data movement +- Using system memory as a staging area +- Streaming data in smaller chunks + +## Comparison: Prefetch Strategies + +| Strategy | Advantages | Disadvantages | Use Case | +|----------|-----------|---------------|----------| +| **PrefetchResourceAdaptor** | Automatic, no code changes | Prefetches everything, even if not needed | General-purpose, allocate-and-use patterns | +| **Manual prefetch** | Fine-grained control, can optimize specific patterns | Requires code changes | Complex access patterns, performance tuning | +| **No prefetching** | Simple | High page fault overhead | Prototyping only, not for production | + +## Multi-GPU Considerations + +When using managed memory with multiple GPUs: + +```python +import rmm + +# Set up managed memory on each device +for device_id in [0, 1]: + with cuda.Device(device_id): + base = 
rmm.mr.ManagedMemoryResource() + prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) + rmm.mr.set_per_device_resource(device_id, prefetch_mr) + +# Prefetch to specific devices +buffer = rmm.DeviceBuffer(size=1000000) +buffer.prefetch(device=0, stream=stream_0) # Prefetch to GPU 0 +buffer.prefetch(device=1, stream=stream_1) # Prefetch to GPU 1 +``` + +## Summary + +- Managed memory enables larger-than-VRAM workloads and simplifies CPU-GPU interoperability +- Always use prefetching in production to avoid page fault overhead +- Use `PrefetchResourceAdaptor` for automatic, eager prefetching +- Use manual `prefetch()` calls for fine-grained control +- Profile with Nsight Systems to measure page fault overhead +- For best performance with data that fits in VRAM, use `CudaAsyncMemoryResource` instead + +## See Also + +- [Choosing a Memory Resource](choosing_memory_resources.md) - When to use managed memory vs. other resources +- [Stream-Ordered Allocation](stream_ordered_allocation.md) - Understanding asynchronous allocation semantics +- [NVIDIA Developer Blog: Unified Memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) +- [NVIDIA Developer Blog: Memory Oversubscription](https://developer.nvidia.com/blog/improving-gpu-memory-oversubscription-performance/) diff --git a/docs/user_guide/pool_allocators.md b/docs/user_guide/pool_allocators.md new file mode 100644 index 000000000..714fbe0bc --- /dev/null +++ b/docs/user_guide/pool_allocators.md @@ -0,0 +1,455 @@ +# Pool Memory Allocators + +Pool allocators maintain a "pool" of pre-allocated memory to enable fast suballocation without repeatedly calling the underlying memory allocation API. RMM provides several pool-based memory resources, each with different characteristics and use cases. + +## Why Use Pool Allocators? 
+ +Direct allocation (e.g., `cudaMalloc`) has overhead: +- Requires driver synchronization +- Can be slow for small, frequent allocations +- Forces serialization of allocation requests + +Pool allocators address this by: +- Pre-allocating large blocks of memory +- Suballocating from the pool without driver calls +- Reusing freed memory for new allocations + +## RMM's Pool Allocators + +RMM provides three main pool-like allocators: + +1. **`CudaAsyncMemoryResource`**: Driver-managed pool (recommended default) +2. **`PoolMemoryResource`**: RMM-managed coalescing pool +3. **`ArenaMemoryResource`**: Size-binned arena pool + +## CudaAsyncMemoryResource (Recommended) + +The `CudaAsyncMemoryResource` uses CUDA's driver-managed memory pool via `cudaMallocAsync`. + +**Advantages:** +- Virtual address space management (avoids fragmentation) +- Shared across all applications using the same GPU +- Stream-ordered allocation +- No manual tuning of pool sizes + +**Example:** +```python +import rmm + +rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) +``` + +**When to use:** Default choice for most applications. See [Choosing a Memory Resource](choosing_memory_resources.md) for details. + +## PoolMemoryResource + +The `PoolMemoryResource` wraps an upstream memory resource and maintains a pool using a coalescing best-fit allocator. + +### Configuration + +```python +import rmm + +pool = rmm.mr.PoolMemoryResource( + upstream=rmm.mr.CudaMemoryResource(), # or CudaAsyncMemoryResource + initial_pool_size=2**30, # 1 GiB - initial allocation + maximum_pool_size=2**32 # 4 GiB - max the pool can grow to +) +rmm.mr.set_current_device_resource(pool) +``` + +### Parameters + +- **`upstream`**: The underlying memory resource to allocate from + - Use `CudaAsyncMemoryResource()` for best results + - `CudaMemoryResource()` for basic CUDA memory + - Can be any memory resource (including another pool!) 
+ +- **`initial_pool_size`**: Size of the initial allocation + - Larger values reduce early-stage growth overhead + - Should be based on your typical memory usage + - Use string notation: `"1GiB"`, `"512MiB"`, etc. + - Or use powers of 2: `2**30` (1 GiB) + +- **`maximum_pool_size`**: Maximum size the pool can grow to + - Acts as a limit on total GPU memory usage + - `None` means no limit (pool can grow until GPU memory is exhausted) + - Useful for multi-tenant or multi-process scenarios + +### How It Works + +1. **Initial allocation**: On first use, allocates `initial_pool_size` from upstream +2. **Suballocation**: Subsequent allocations are served from the pool +3. **Growth**: If pool is exhausted, allocates more from upstream +4. **Coalescing**: Adjacent freed blocks are merged to reduce fragmentation +5. **Shrinking**: The pool does **not** automatically return memory to upstream + +### Best Practices + +#### 1. Choose Appropriate Pool Sizes + +**Initial pool size:** +- Profile your application to understand memory usage +- Set initial size to ~80% of typical peak usage +- Too small: frequent growth overhead +- Too large: wastes memory, longer startup + +**Example:** +```python +import rmm + +# For an application that typically uses 2 GiB +pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), + initial_pool_size=int(1.6 * 2**30), # 1.6 GiB + maximum_pool_size=int(4 * 2**30) # 4 GiB max +) +rmm.mr.set_current_device_resource(pool) +``` + +#### 2. Prefer Async MR as Upstream + +Wrapping `CudaAsyncMemoryResource` combines benefits: + +```python +# Good: Pool wrapping async MR +pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), + initial_pool_size=2**30 +) +``` + +This gives: +- Fast suballocation from RMM pool +- Driver's virtual addressing for fragmentation resistance +- Shared memory pool across libraries + +#### 3. 
Avoid Double Pooling + +Don't wrap a pool in another pool: + +```python +# Bad: Double pooling +inner_pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), 2**30) +outer_pool = rmm.mr.PoolMemoryResource(inner_pool, 2**30) # Wasteful! +``` + +#### 4. Monitor Fragmentation + +Pool allocators can suffer from fragmentation: + +```python +import rmm + +# Enable statistics to monitor fragmentation +pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource(), 2**30) +stats_mr = rmm.mr.StatisticsResourceAdaptor(pool) +rmm.mr.set_current_device_resource(stats_mr) + +# Run workload +# ... + +# Check statistics +stats = rmm.statistics.get_statistics() +print(f"Peak bytes: {stats.peak_bytes}") +print(f"Current bytes: {stats.current_bytes}") +``` + +If `peak_bytes` is much larger than needed, fragmentation may be occurring. + +### Common Issues + +#### Issue 1: Out of Memory (OOM) Before Max Pool Size + +**Symptom:** OOM errors even though allocated memory is less than `maximum_pool_size` + +**Cause:** Fragmentation. The pool has free memory, but not in contiguous blocks. + +**Solutions:** +1. Use `ArenaMemoryResource` instead (better fragmentation characteristics) +2. Use `CudaAsyncMemoryResource` (virtual addressing prevents fragmentation) +3. Adjust allocation patterns to reduce fragmentation + +#### Issue 2: Pool Doesn't Shrink + +**Symptom:** Memory remains allocated even after deallocations + +**Cause:** By design, pools don't return memory to the upstream resource. + +**Solutions:** +1. Destroy and recreate the pool (not recommended for long-running applications) +2. Set appropriate `maximum_pool_size` to limit growth +3. Use `CudaAsyncMemoryResource` if memory should be returned to the system + +## ArenaMemoryResource + +The `ArenaMemoryResource` divides memory into size-binned arenas to reduce fragmentation. 
+ +### Configuration + +```python +import rmm + +arena = rmm.mr.ArenaMemoryResource( + upstream=rmm.mr.CudaMemoryResource(), + arena_size=2**28, # 256 MiB per arena + dump_log_on_failure=False +) +rmm.mr.set_current_device_resource(arena) +``` + +### How It Works + +1. Allocates memory in fixed-size "arenas" +2. Each arena is divided into size-binned "superblocks" +3. Allocations are served from the appropriate bin +4. Reduces fragmentation by isolating allocation sizes + +### When to Use + +- Applications with diverse allocation sizes +- Long-running services with complex allocation patterns +- When `PoolMemoryResource` suffers from fragmentation + +### Example: Mixed Allocation Sizes + +```python +import rmm + +# Application allocates small (KB), medium (MB), and large (GB) buffers +arena = rmm.mr.ArenaMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), + arena_size=2**28 # 256 MiB arenas +) +rmm.mr.set_current_device_resource(arena) + +# Allocations are binned by size +small = rmm.DeviceBuffer(size=1024) # Small bin +medium = rmm.DeviceBuffer(size=1024**2) # Medium bin +large = rmm.DeviceBuffer(size=1024**3) # Large bin +``` + +## BinningMemoryResource + +The `BinningMemoryResource` routes allocations to different memory resources based on size. 
+ +### Configuration + +```python +import rmm + +# Create resources for different size ranges +small_mr = rmm.mr.FixedSizeMemoryResource( + rmm.mr.CudaMemoryResource(), + block_size=256 # 256 bytes +) +large_mr = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), + initial_pool_size=2**30 +) + +# Bin allocations by size +binning_mr = rmm.mr.BinningMemoryResource( + upstream=large_mr, # Default for allocations not in bins +) + +# Add bins: allocations of size <= threshold go to this resource +binning_mr.add_bin(256, small_mr) # <= 256 bytes -> small_mr +binning_mr.add_bin(1024, None) # <= 1 KiB -> upstream (large_mr) +# Anything > 1 KiB goes to upstream (large_mr) + +rmm.mr.set_current_device_resource(binning_mr) +``` + +### How It Works + +Allocations are routed based on size: +``` +Allocation size <= bin1_threshold -> bin1_resource +Allocation size <= bin2_threshold -> bin2_resource +... +Allocation size > largest_threshold -> upstream +``` + +### Best Practices for Binning + +#### 1. Profile Allocation Sizes + +Before configuring bins, understand your allocation patterns: + +```python +import rmm + +# Enable statistics to see allocation sizes +base = rmm.mr.CudaMemoryResource() +stats_mr = rmm.mr.StatisticsResourceAdaptor(base) +rmm.mr.set_current_device_resource(stats_mr) + +# Run workload +# ... + +# Analyze allocation patterns +stats = rmm.statistics.get_statistics() +print(stats) +``` + +#### 2. 
Optimize for Common Sizes + +Configure bins to match your most common allocation sizes: + +```python +import rmm + +# Based on profiling, we know: +# - Many small allocations (< 1 KiB) +# - Medium allocations (1 KiB - 1 MiB) +# - Large allocations (> 1 MiB) + +# Fixed-size resource for small allocations +small_mr = rmm.mr.FixedSizeMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), + block_size=1024 # 1 KiB +) + +# Pool for medium allocations +medium_mr = rmm.mr.PoolMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), + initial_pool_size=2**28 # 256 MiB +) + +# Pool for large allocations +large_mr = rmm.mr.PoolMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), + initial_pool_size=2**30 # 1 GiB +) + +# Configure binning +binning_mr = rmm.mr.BinningMemoryResource(upstream=large_mr) +binning_mr.add_bin(1024, small_mr) # <= 1 KiB +binning_mr.add_bin(1024**2, medium_mr) # <= 1 MiB +# > 1 MiB goes to large_mr + +rmm.mr.set_current_device_resource(binning_mr) +``` + +#### 3. Consider Using ArenaMemoryResource Instead + +For many use cases, `ArenaMemoryResource` provides similar benefits with simpler configuration: + +```python +# Simpler: Arena handles size-binning automatically +arena = rmm.mr.ArenaMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), + arena_size=2**28 +) +rmm.mr.set_current_device_resource(arena) +``` + +### Example: PyTorch with Binning + +From issue #1958, here's a practical example for PyTorch workloads: + +```python +import rmm +import torch +from rmm.allocators.torch import rmm_torch_allocator + +# Use managed memory as base (for larger-than-VRAM scenarios) +upstream = rmm.mr.ManagedMemoryResource() + +# Create a pool wrapping managed memory +pool = rmm.mr.PoolMemoryResource( + upstream, + initial_pool_size=2**20, # 1 MiB + maximum_pool_size=int(80 * 2**30) # 80 GiB max +) + +# Fixed-size resource for small allocations +fixed_mr = rmm.mr.FixedSizeMemoryResource(pool, block_size=1024) # 1 KiB blocks + +# Binning resource +binning_mr = 
rmm.mr.BinningMemoryResource(upstream=pool) + +# Add bins for common PyTorch tensor sizes +binning_mr.add_bin(256 * 1024, fixed_mr) # <= 256 KiB +binning_mr.add_bin(512 * 1024, None) # <= 512 KiB -> pool +binning_mr.add_bin(1024 * 1024, None) # <= 1 MiB -> pool +binning_mr.add_bin(2 * 1024 * 1024, None) # <= 2 MiB -> pool +binning_mr.add_bin(4 * 1024 * 1024, None) # <= 4 MiB -> pool +# > 4 MiB goes to pool + +rmm.mr.set_current_device_resource(binning_mr) + +# Configure PyTorch +torch.cuda.memory.change_current_allocator(rmm_torch_allocator) +``` + +**Note:** For production PyTorch workloads, prefer `CudaAsyncMemoryResource` unless you specifically need managed memory for larger-than-VRAM scenarios. + +## Choosing Between Pool Allocators + +| Resource | Best For | Fragmentation Handling | Complexity | +|----------|----------|------------------------|------------| +| **CudaAsyncMemoryResource** | General purpose, multi-stream apps | Excellent (virtual addressing) | Low | +| **PoolMemoryResource** | Simple pooling needs | Fair (coalescing) | Low | +| **ArenaMemoryResource** | Diverse allocation sizes | Good (size binning) | Medium | +| **BinningMemoryResource** | Custom size-based routing | Depends on configuration | High | + +## Debugging Pool Issues + +### Enable Logging + +```python +import rmm + +arena = rmm.mr.ArenaMemoryResource( + rmm.mr.CudaMemoryResource(), + arena_size=2**28, + dump_log_on_failure=True # Log on allocation failure +) +rmm.mr.set_current_device_resource(arena) +``` + +### Track Statistics + +```python +import rmm + +pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource(), 2**30) +stats_mr = rmm.mr.StatisticsResourceAdaptor(pool) +rmm.mr.set_current_device_resource(stats_mr) + +# Run workload +buffer = rmm.DeviceBuffer(size=1000000) + +# Check usage +stats = rmm.statistics.get_statistics() +print(f"Current bytes: {stats.current_bytes:,}") +print(f"Peak bytes: {stats.peak_bytes:,}") +print(f"Total allocations: {stats.total_count}") 
+``` + +### Profile with Nsight Systems + +```bash +nsys profile -o output python your_script.py +``` + +Look for: +- Allocation frequency and sizes +- Memory usage over time +- Fragmentation indicators + +## Summary + +- **For most cases**: Use `CudaAsyncMemoryResource` (driver-managed pool) +- **For simple pooling**: Use `PoolMemoryResource` wrapping `CudaAsyncMemoryResource` +- **For fragmentation issues**: Try `ArenaMemoryResource` +- **For size-based routing**: Use `BinningMemoryResource` (or `ArenaMemoryResource`) +- **Always profile**: Use statistics and Nsight Systems to understand allocation patterns +- **Set appropriate pool sizes**: Too small causes growth overhead, too large wastes memory + +## See Also + +- [Choosing a Memory Resource](choosing_memory_resources.md) - High-level guidance on selecting resources +- [Stream-Ordered Allocation](stream_ordered_allocation.md) - Understanding async allocation diff --git a/docs/user_guide/stream_ordered_allocation.md b/docs/user_guide/stream_ordered_allocation.md new file mode 100644 index 000000000..3a38a051c --- /dev/null +++ b/docs/user_guide/stream_ordered_allocation.md @@ -0,0 +1,325 @@ +# Stream-Ordered Memory Allocation + +RMM provides **stream-ordered memory allocation**, which means that memory allocations and deallocations are ordered with respect to operations on a CUDA stream. This is a fundamental concept for achieving optimal performance in asynchronous CUDA applications. + +## What is Stream-Ordered Allocation? + +In stream-ordered allocation: + +1. **Allocations are asynchronous**: Calling `allocate()` schedules the allocation on a stream and returns immediately +2. **Memory is available in stream order**: The allocated memory is guaranteed to be available for use by operations scheduled after the allocation on the same stream, with no explicit stream synchronization required +3. 
**Deallocations are also stream-ordered**: Memory is not actually freed until all prior operations on the stream complete + +This allows memory operations to be interleaved with kernel launches and other CUDA operations without explicit synchronization. + +## Why Stream-Ordered Allocation Matters + +Traditional memory allocation (e.g., `cudaMalloc`) is **synchronous** - it blocks until the allocation completes. This creates bubbles in the execution pipeline where the CPU waits for GPU operations to complete. + +Stream-ordered allocation enables: +- **Overlapping compute and memory operations**: Allocations can be scheduled while kernels are running +- **Reduced synchronization overhead**: No need to synchronize the stream before allocating +- **Better multi-stream performance**: Different streams can allocate independently + +## How It Works + +Consider the following example of allocating memory from a stream-ordered memory resource. + +C++: + +```cpp +#include <rmm/cuda_stream_view.hpp> +#include <rmm/device_buffer.hpp> + +rmm::cuda_stream_view stream; +auto buffer = rmm::device_buffer(1000, stream); +``` + +Python: + +```python +import rmm + +# Allocate on a specific stream +stream = rmm.cuda_stream() +buffer = rmm.DeviceBuffer(size=1000, stream=stream) +``` + +The following happens: + +1. The allocation request is **scheduled** on `stream` +2. The function returns immediately (asynchronous) +3. The memory is **guaranteed to be available** for operations enqueued on `stream` after the allocation +4. 
You can use the buffer's pointer (`buffer.data()` in C++, `buffer.ptr` in Python) immediately in subsequent operations on `stream` + +## Key Semantics + +### Safe to Use the Pointer Immediately + +**You can use the returned pointer in stream-ordered operations without synchronization:** + +```python +import rmm +import cupy as cp + +stream = rmm.cuda_stream() + +# Allocate memory on the stream +buffer = rmm.DeviceBuffer(size=1000, stream=stream) + +# Use the pointer immediately in a CuPy operation on the same stream +# This is SAFE - no synchronization needed +with stream: + array = cp.ndarray(shape=(250,), dtype=cp.float32, + memptr=cp.cuda.MemoryPointer( + cp.cuda.UnownedMemory(buffer.ptr, buffer.size, buffer), + 0)) + # Kernel launches on this stream will see the allocated memory + array[:] = 42 +``` + +The allocation is guaranteed to complete before the kernel that uses it, as long as both are on the same stream. + +### Deallocations Are Also Stream-Ordered + +When you deallocate (e.g., a buffer goes out of scope), the deallocation is also stream-ordered: + +```python +import rmm + +stream = rmm.cuda_stream() + +# Allocate +buffer = rmm.DeviceBuffer(size=1000, stream=stream) + +# Schedule some work on the stream +# ... kernels using buffer.ptr ... 
+ +# When buffer is destroyed, deallocation is scheduled on the stream +# The memory won't actually be freed until all prior work completes +buffer = None # triggers deallocation +``` + +This ensures that: +- Memory is not freed while still in use by a kernel +- Deallocations don't block waiting for kernels to complete + +### Stream Synchronization + +To guarantee that an allocation has completed (for example, if you need to access it from the CPU), synchronize the stream: + +```python +import rmm + +stream = rmm.cuda_stream() +buffer = rmm.DeviceBuffer(size=1000, stream=stream) + +# Synchronize to ensure allocation completes +stream.synchronize() + +# Now safe to do CPU operations with buffer.ptr +# (though accessing GPU memory from CPU usually requires managed memory) +``` + +## Memory Resources and Stream Ordering + +### Which Resources Support Stream Ordering? + +- **`CudaAsyncMemoryResource`**: Fully stream-ordered (recommended) +- **`PoolMemoryResource`**: Can be stream-ordered when wrapping a stream-ordered upstream +- **`ArenaMemoryResource`**: Stream-ordered when wrapping a stream-ordered upstream +- **`CudaMemoryResource`**: NOT stream-ordered (synchronous `cudaMalloc`) +- **`ManagedMemoryResource`**: NOT stream-ordered (synchronous `cudaMallocManaged`) + +### Example: Pool Wrapping Async MR + +```python +import rmm + +# Create a pool that maintains stream-ordered semantics +pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaAsyncMemoryResource(), # stream-ordered upstream + initial_pool_size=2**30 +) +rmm.mr.set_current_device_resource(pool) + +# Allocations from this pool are stream-ordered +stream = rmm.cuda_stream() +buffer = rmm.DeviceBuffer(size=1000, stream=stream) +``` + +## Common Patterns + +### Pattern 1: Allocate and Use in Kernel + +```python +import rmm +from numba import cuda + +@cuda.jit +def kernel(data, n): + idx = cuda.grid(1) + if idx < n: + data[idx] = idx * 2 + +stream = rmm.cuda_stream() + +# Allocate +buffer = 
rmm.DeviceBuffer(size=1000 * 4, stream=stream) # 1000 float32s + +# Use immediately +with stream: + kernel[100, 10](cuda.as_cuda_array(buffer).view('float32'), 1000) + +# Synchronize to wait for kernel +stream.synchronize() +``` + +### Pattern 2: Allocate, Compute, Deallocate, Repeat + +```python +import rmm + +stream = rmm.cuda_stream() + +for i in range(100): + # Allocate + buffer = rmm.DeviceBuffer(size=1000000, stream=stream) + + # Use buffer in computations + # ... launch kernels on stream ... + + # Deallocate (automatic, or explicitly set buffer = None) + buffer = None + +# All allocations and deallocations are stream-ordered +# No need to synchronize between iterations +``` + +### Pattern 3: Multi-Stream Allocation + +```python +import rmm + +# Create multiple streams +streams = [rmm.cuda_stream() for _ in range(4)] + +# Allocate on different streams independently +buffers = [] +for stream in streams: + # Each allocation is independent + buffer = rmm.DeviceBuffer(size=1000000, stream=stream) + buffers.append(buffer) + + # Launch work on this stream + # ... kernels using buffer ... + +# Synchronize all streams +for stream in streams: + stream.synchronize() +``` + +## Performance Implications + +### Benefits + +1. **Reduced CPU-GPU synchronization**: No blocking on allocations +2. **Better pipeline utilization**: Memory operations overlap with compute +3. **Multi-stream scalability**: Streams can allocate independently + +### Pitfalls to Avoid + +1. **Don't mix streams**: Using memory allocated on stream A in operations on stream B requires synchronization: + + ```python + stream_a = rmm.cuda_stream() + stream_b = rmm.cuda_stream() + + # Allocate on stream A + buffer = rmm.DeviceBuffer(size=1000, stream=stream_a) + + # To use on stream B, synchronize stream A first + stream_a.synchronize() + + # Now safe to use on stream B + with stream_b: + # ... operations using buffer ... + ``` + +2. 
**Don't access from CPU without sync**: Stream-ordered allocations are asynchronous - accessing from CPU requires synchronization: + + ```python + stream = rmm.cuda_stream() + buffer = rmm.DeviceBuffer(size=1000, stream=stream) + + # BAD: May access uninitialized memory + # some_function(buffer.ptr) + + # GOOD: Synchronize first + stream.synchronize() + some_function(buffer.ptr) + ``` + +3. **Resource lifetime**: Ensure buffers live until all stream operations complete: + + ```python + stream = rmm.cuda_stream() + + def allocate_and_use(): + buffer = rmm.DeviceBuffer(size=1000, stream=stream) + # Launch kernel using buffer + kernel[...](buffer.ptr) + # BAD: buffer is deallocated when function returns + # but kernel may still be running! + + allocate_and_use() + stream.synchronize() # May crash - buffer already freed + ``` + + Fix: Keep buffer alive until synchronization: + + ```python + stream = rmm.cuda_stream() + buffer = allocate_and_use() # Return the buffer + stream.synchronize() # Now safe + buffer = None # Explicit cleanup after sync + ``` + +## C++ API + +In C++, stream-ordered allocation is the default for most RMM containers: + +```cpp +#include +#include +#include +#include + +// Set async MR as default +auto async_mr = rmm::mr::cuda_async_memory_resource{}; +rmm::mr::set_current_device_resource(&async_mr); + +// Create a stream +rmm::cuda_stream stream; + +// Allocate stream-ordered memory +rmm::device_buffer buffer(1000, stream.view()); +rmm::device_uvector vec(1000, stream.view()); + +// Use immediately in stream-ordered operations +launch_kernel<<>>(buffer.data(), vec.data()); + +// Synchronize +stream.synchronize(); +``` + +## Summary + +- Stream-ordered allocation enables asynchronous, non-blocking memory operations +- Allocated pointers can be used immediately in subsequent operations on the same stream +- Deallocations are also stream-ordered, preventing use-after-free +- `CudaAsyncMemoryResource` provides the best stream-ordered allocation 
support +- Always synchronize before accessing memory from the CPU +- Ensure buffer lifetimes extend until all stream operations complete + +For more details on choosing memory resources, see [Choosing a Memory Resource](choosing_memory_resources.md). From 2a1f4b85f16b5ed2fc93145d3578e7278ba031d3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Apr 2026 04:41:56 +0000 Subject: [PATCH 02/24] Fix user guide C++ examples to use modern resource_ref API Update all C++ code examples to use set_current_device_resource_ref() instead of set_current_device_resource(&ptr), pass resource refs by value to adaptor constructors, use get_bytes_counter/get_allocations_counter instead of fictional get_statistics(), add compute-sanitizer UM flags, fix managed_memory multi-GPU example, improve choosing_memory_resources managed memory example with PrefetchResourceAdaptor, and fix incorrect upstream= keyword args in pool_allocators.md. --- docs/user_guide/choosing_memory_resources.md | 8 ++++-- docs/user_guide/installation.md | 3 ++- docs/user_guide/introduction.md | 2 +- docs/user_guide/logging.md | 26 +++++++++++--------- docs/user_guide/managed_memory.md | 22 ++++++++++++----- docs/user_guide/pool_allocators.md | 10 ++++---- docs/user_guide/stream_ordered_allocation.md | 2 +- 7 files changed, 45 insertions(+), 28 deletions(-) diff --git a/docs/user_guide/choosing_memory_resources.md b/docs/user_guide/choosing_memory_resources.md index 55dec2064..8e09df07c 100644 --- a/docs/user_guide/choosing_memory_resources.md +++ b/docs/user_guide/choosing_memory_resources.md @@ -163,12 +163,16 @@ The `ManagedMemoryResource` uses CUDA unified memory (via `cudaMallocManaged`), ```python import rmm -rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) +# Always combine managed memory with prefetching for acceptable performance. +# Without prefetching, page faults cause significant overhead, especially +# in multi-stream workloads. 
+base = rmm.mr.ManagedMemoryResource() +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) +rmm.mr.set_current_device_resource(prefetch_mr) ``` **When to use:** - Datasets larger than available GPU memory -- Prototyping or applications where performance is not critical - Always combine with prefetching strategies (see [Managed Memory guide](managed_memory.md)) ### ArenaMemoryResource diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md index 145aa8156..ef5539955 100644 --- a/docs/user_guide/installation.md +++ b/docs/user_guide/installation.md @@ -149,11 +149,12 @@ Create a test file `test_rmm.cpp`: ```cpp #include #include +#include #include int main() { auto mr = rmm::mr::cuda_memory_resource{}; - rmm::mr::set_current_device_resource(&mr); + rmm::mr::set_current_device_resource_ref(mr); rmm::device_buffer buf(100); std::cout << "Allocated " << buf.size() << " bytes\n"; diff --git a/docs/user_guide/introduction.md b/docs/user_guide/introduction.md index 936137620..dfe3580b9 100644 --- a/docs/user_guide/introduction.md +++ b/docs/user_guide/introduction.md @@ -63,7 +63,7 @@ All containers use stream-ordered allocation and work with any memory resource. 
// Use CUDA async memory pool auto async_mr = rmm::mr::cuda_async_memory_resource{}; -rmm::mr::set_current_device_resource(&async_mr); +rmm::mr::set_current_device_resource_ref(async_mr); // Allocate device memory asynchronously rmm::cuda_stream stream; diff --git a/docs/user_guide/logging.md b/docs/user_guide/logging.md index e097f34cb..f612a2f6d 100644 --- a/docs/user_guide/logging.md +++ b/docs/user_guide/logging.md @@ -49,10 +49,10 @@ int main() { auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; // Wrap with logging adaptor - auto log_mr = rmm::mr::logging_resource_adaptor{&cuda_mr, "memory_log.csv"}; + auto log_mr = rmm::mr::logging_resource_adaptor{cuda_mr, "memory_log.csv"}; // Set as current resource - rmm::mr::set_current_device_resource(&log_mr); + rmm::mr::set_current_device_resource_ref(log_mr); // All allocations logged to CSV rmm::cuda_stream stream; @@ -177,8 +177,8 @@ class Statistics: int main() { auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; - auto stats_mr = rmm::mr::statistics_resource_adaptor{&cuda_mr}; - rmm::mr::set_current_device_resource(&stats_mr); + auto stats_mr = rmm::mr::statistics_resource_adaptor{cuda_mr}; + rmm::mr::set_current_device_resource_ref(stats_mr); // Allocate rmm::cuda_stream stream; @@ -186,9 +186,11 @@ int main() { rmm::device_buffer buffer2(2048, stream.view()); // Get statistics - auto stats = stats_mr.get_statistics(); - std::cout << "Allocated bytes: " << stats.allocated_bytes << "\n"; - std::cout << "Allocation count: " << stats.num_allocations << "\n"; + auto bytes = stats_mr.get_bytes_counter(); + auto allocs = stats_mr.get_allocations_counter(); + std::cout << "Current bytes: " << bytes.value << "\n"; + std::cout << "Peak bytes: " << bytes.peak << "\n"; + std::cout << "Allocation count: " << allocs.value << "\n"; return 0; } @@ -469,18 +471,18 @@ int main() { // Build resource stack auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; - auto stats_mr = 
rmm::mr::statistics_resource_adaptor{&cuda_mr}; - auto log_mr = rmm::mr::logging_resource_adaptor{&stats_mr, "events.csv"}; + auto stats_mr = rmm::mr::statistics_resource_adaptor{cuda_mr}; + auto log_mr = rmm::mr::logging_resource_adaptor{stats_mr, "events.csv"}; - rmm::mr::set_current_device_resource(&log_mr); + rmm::mr::set_current_device_resource_ref(log_mr); // Now all logging is active rmm::cuda_stream stream; rmm::device_buffer buffer(1024, stream.view()); // Get statistics - auto stats = stats_mr.get_statistics(); - std::cout << "Peak bytes: " << stats.peak_bytes << "\n"; + auto bytes = stats_mr.get_bytes_counter(); + std::cout << "Peak bytes: " << bytes.peak << "\n"; return 0; } diff --git a/docs/user_guide/managed_memory.md b/docs/user_guide/managed_memory.md index 24a819042..a33265fa6 100644 --- a/docs/user_guide/managed_memory.md +++ b/docs/user_guide/managed_memory.md @@ -40,7 +40,7 @@ buffer = rmm.DeviceBuffer(size=1000000) #include auto managed_mr = rmm::mr::managed_memory_resource{}; -rmm::mr::set_current_device_resource(&managed_mr); +rmm::mr::set_current_device_resource_ref(managed_mr); // Allocations use managed memory rmm::device_buffer buffer(1000000); @@ -134,7 +134,7 @@ buffer.prefetch(device=0, stream=stream) # Prefetch to device 0 #include auto managed_mr = rmm::mr::managed_memory_resource{}; -rmm::mr::set_current_device_resource(&managed_mr); +rmm::mr::set_current_device_resource_ref(managed_mr); rmm::cuda_stream stream; rmm::device_buffer buffer(1000000, stream.view()); @@ -257,6 +257,15 @@ Use [NVIDIA Nsight™ Systems](https://developer.nvidia.com/nsight-systems) to v nsys profile -o output python your_script.py ``` +When using `compute-sanitizer` with managed memory, you may need to enable page fault tracking: + +```bash +compute-sanitizer --tool memcheck \ + --cuda-um-cpu-page-faults=true \ + --cuda-um-gpu-page-faults=true \ + python your_script.py +``` + ## Managed Memory Limitations ### 1. 
Not Stream-Ordered @@ -300,13 +309,14 @@ When using managed memory with multiple GPUs: ```python import rmm +from cuda.bindings import runtime as cudart # Set up managed memory on each device for device_id in [0, 1]: - with cuda.Device(device_id): - base = rmm.mr.ManagedMemoryResource() - prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) - rmm.mr.set_per_device_resource(device_id, prefetch_mr) + cudart.cudaSetDevice(device_id) + base = rmm.mr.ManagedMemoryResource() + prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) + rmm.mr.set_per_device_resource(device_id, prefetch_mr) # Prefetch to specific devices buffer = rmm.DeviceBuffer(size=1000000) diff --git a/docs/user_guide/pool_allocators.md b/docs/user_guide/pool_allocators.md index 714fbe0bc..9aa168029 100644 --- a/docs/user_guide/pool_allocators.md +++ b/docs/user_guide/pool_allocators.md @@ -51,7 +51,7 @@ The `PoolMemoryResource` wraps an upstream memory resource and maintains a pool import rmm pool = rmm.mr.PoolMemoryResource( - upstream=rmm.mr.CudaMemoryResource(), # or CudaAsyncMemoryResource + rmm.mr.CudaMemoryResource(), # or CudaAsyncMemoryResource initial_pool_size=2**30, # 1 GiB - initial allocation maximum_pool_size=2**32 # 4 GiB - max the pool can grow to ) @@ -191,7 +191,7 @@ The `ArenaMemoryResource` divides memory into size-binned arenas to reduce fragm import rmm arena = rmm.mr.ArenaMemoryResource( - upstream=rmm.mr.CudaMemoryResource(), + rmm.mr.CudaMemoryResource(), arena_size=2**28, # 256 MiB per arena dump_log_on_failure=False ) @@ -250,7 +250,7 @@ large_mr = rmm.mr.PoolMemoryResource( # Bin allocations by size binning_mr = rmm.mr.BinningMemoryResource( - upstream=large_mr, # Default for allocations not in bins + large_mr, # Default for allocations not in bins ) # Add bins: allocations of size <= threshold go to this resource @@ -324,7 +324,7 @@ large_mr = rmm.mr.PoolMemoryResource( ) # Configure binning -binning_mr = rmm.mr.BinningMemoryResource(upstream=large_mr) +binning_mr = 
rmm.mr.BinningMemoryResource(large_mr) binning_mr.add_bin(1024, small_mr) # <= 1 KiB binning_mr.add_bin(1024**2, medium_mr) # <= 1 MiB # > 1 MiB goes to large_mr @@ -368,7 +368,7 @@ pool = rmm.mr.PoolMemoryResource( fixed_mr = rmm.mr.FixedSizeMemoryResource(pool, block_size=1024) # 1 KiB blocks # Binning resource -binning_mr = rmm.mr.BinningMemoryResource(upstream=pool) +binning_mr = rmm.mr.BinningMemoryResource(pool) # Add bins for common PyTorch tensor sizes binning_mr.add_bin(256 * 1024, fixed_mr) # <= 256 KiB diff --git a/docs/user_guide/stream_ordered_allocation.md b/docs/user_guide/stream_ordered_allocation.md index 3a38a051c..35d84ee16 100644 --- a/docs/user_guide/stream_ordered_allocation.md +++ b/docs/user_guide/stream_ordered_allocation.md @@ -297,7 +297,7 @@ In C++, stream-ordered allocation is the default for most RMM containers: // Set async MR as default auto async_mr = rmm::mr::cuda_async_memory_resource{}; -rmm::mr::set_current_device_resource(&async_mr); +rmm::mr::set_current_device_resource_ref(async_mr); // Create a stream rmm::cuda_stream stream; From 0466c810fd0ac38a311d19b439486697b15ccf29 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 2 Apr 2026 05:09:54 +0000 Subject: [PATCH 03/24] Remove autodoc section for deleted rmm.pylibrmm.cuda_stream module --- docs/python/pylibrmm.md | 9 --------- 1 file changed, 9 deletions(-) diff --git a/docs/python/pylibrmm.md b/docs/python/pylibrmm.md index 216680fb1..0400ac077 100644 --- a/docs/python/pylibrmm.md +++ b/docs/python/pylibrmm.md @@ -23,12 +23,3 @@ The stream classes are available only through `rmm.pylibrmm` and provide low-lev :undoc-members: :show-inheritance: ``` - -### rmm.pylibrmm.cuda_stream - -```{eval-rst} -.. 
automodule:: rmm.pylibrmm.cuda_stream - :members: - :undoc-members: - :show-inheritance: -``` From 5a2bbaa5752d75bed3de8d6e2b44056930f10bb2 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 14:40:27 +0000 Subject: [PATCH 04/24] Align user guide with 26.06 migration: compiled lib, stream args, value-type resources --- docs/user_guide/guide.md | 9 ++++----- docs/user_guide/installation.md | 4 ++-- docs/user_guide/managed_memory.md | 4 +++- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index 6ab79fdc2..3cd9d149d 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -475,24 +475,23 @@ tensor = torch.zeros(1000, device='cuda') #include #include #include -#include #include int num_devices; cudaGetDeviceCount(&num_devices); -// Store resources to maintain lifetime -std::vector> resources; +// Store resources to maintain lifetime (resources are copyable value types) +std::vector resources; for (int i = 0; i < num_devices; ++i) { // Set device BEFORE creating resource cudaSetDevice(i); // Create resource for this device - resources.push_back(std::make_unique()); + resources.emplace_back(); // Set as per-device resource ref - rmm::mr::set_per_device_resource_ref(rmm::cuda_device_id{i}, *resources.back()); + rmm::mr::set_per_device_resource_ref(rmm::cuda_device_id{i}, resources.back()); } // Use device 0 diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md index ef5539955..1b1e4236c 100644 --- a/docs/user_guide/installation.md +++ b/docs/user_guide/installation.md @@ -112,7 +112,7 @@ To use RMM in your own CMake project, add the following to your `CMakeLists.txt` ```cmake find_package(rmm REQUIRED) -# Link your target with RMM (header-only, pulls in dependencies) +# Link your target with RMM target_link_libraries(your_target PRIVATE rmm::rmm) ``` @@ -156,7 +156,7 @@ int main() { auto mr = rmm::mr::cuda_memory_resource{}; 
rmm::mr::set_current_device_resource_ref(mr); - rmm::device_buffer buf(100); + rmm::device_buffer buf(100, rmm::cuda_stream_view{}); std::cout << "Allocated " << buf.size() << " bytes\n"; return 0; diff --git a/docs/user_guide/managed_memory.md b/docs/user_guide/managed_memory.md index a33265fa6..5d4c7529e 100644 --- a/docs/user_guide/managed_memory.md +++ b/docs/user_guide/managed_memory.md @@ -38,12 +38,14 @@ buffer = rmm.DeviceBuffer(size=1000000) ```cpp #include +#include auto managed_mr = rmm::mr::managed_memory_resource{}; rmm::mr::set_current_device_resource_ref(managed_mr); // Allocations use managed memory -rmm::device_buffer buffer(1000000); +rmm::cuda_stream stream; +rmm::device_buffer buffer(1000000, stream.view()); ``` ## Performance Considerations From b8d0e98c7f601189316580ff539dbef2f45e9d09 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 19:40:25 +0000 Subject: [PATCH 05/24] Revise choosing_memory_resources page for performance-first framing Lead with performance rationale, link Unified Memory and oversubscription to CUDA Programming Guide, trim CudaMemoryResource to minimal description, remove redundant Performance Considerations section, add logging adaptor to compositions and best practices. --- docs/user_guide/choosing_memory_resources.md | 141 +++++++------------ 1 file changed, 50 insertions(+), 91 deletions(-) diff --git a/docs/user_guide/choosing_memory_resources.md b/docs/user_guide/choosing_memory_resources.md index 8e09df07c..56afa67fc 100644 --- a/docs/user_guide/choosing_memory_resources.md +++ b/docs/user_guide/choosing_memory_resources.md @@ -2,11 +2,11 @@ One of the most common questions when using RMM is: "Which memory resource should I use?" -This guide provides recommendations for selecting the appropriate memory resource based on your application's needs. +This guide recommends memory resources based on optimal allocation performance for common workloads. 
## Recommended Defaults -For most applications, use the CUDA async memory pool. +For most applications, the CUDA async memory pool provides the best allocation performance with no tuning required. `````{tabs} ````{code-tab} c++ @@ -24,7 +24,7 @@ rmm.mr.set_current_device_resource(mr) ```` ````` -For applications exceeding GPU memory limits, use a pooled managed memory resource with prefetching. Note: managed memory is not supported on WSL2 systems. +For applications that require GPU memory oversubscription (allocating more memory than physically available on the GPU), use a pooled managed memory resource with prefetching. This uses [CUDA Unified Memory](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) (`cudaMallocManaged`) to enable automatic page migration between CPU and GPU at the cost of slower allocation performance. Coupling the managed memory "base" allocator with adaptors for pool allocation and prefetching to device on allocation recovers some of the performance lost to the overhead of managed allocations. Note: Managed memory has [limited support on WSL2](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html#unified-memory-on-windows-wsl-and-tegra). `````{tabs} ````{code-tab} c++ @@ -62,17 +62,17 @@ rmm.mr.set_current_device_resource(mr) ## Memory Resource Considerations -It is usually best to use resources that allow the CUDA driver to manage pool suballocation via `cudaMallocFromPoolAsync`. +Resources that use the CUDA driver's pool suballocation (`cudaMallocFromPoolAsync`) provide the best performance because the driver can manage virtual address space efficiently, avoid fragmentation, and share memory across libraries without synchronization overhead. ### CudaAsyncMemoryResource The `CudaAsyncMemoryResource` uses CUDA's driver-managed memory pool (via `cudaMallocAsync`). This is the **recommended default** for most applications. 
**Advantages:** -- **Driver-managed pool**: Uses efficient suballocation with virtual addressing to avoid fragmentation -- **Cross-library sharing**: The pool can be shared across multiple applications and libraries, even those not using RMM directly -- **Stream-ordered semantics**: Allocations and deallocations are stream-ordered by default -- **Performance**: Similar or better performance compared to RMM's pool implementations +- **Fastest allocation performance**: Driver-managed suballocation with virtual addressing eliminates fragmentation and minimizes latency +- **Cross-library sharing**: The pool is shared across all libraries on the device, even those not using RMM directly +- **Stream-ordered semantics**: Allocations and deallocations are stream-ordered by default, avoiding pipeline stalls in multi-stream workloads +- **Zero configuration**: No pool sizes to tune — the driver manages growth automatically **When to use:** - Default choice for GPU-accelerated applications @@ -82,98 +82,73 @@ The `CudaAsyncMemoryResource` uses CUDA's driver-managed memory pool (via `cudaM ### CudaMemoryResource -The `CudaMemoryResource` uses `cudaMalloc` directly for each allocation, with no pooling. - -**Advantages:** -- Simple and predictable -- No fragmentation concerns -- Memory is immediately returned to the system on deallocation - -**Disadvantages:** -- Slower than pooled allocators due to synchronization overhead - -**Example:** -```python -import rmm - -rmm.mr.set_current_device_resource(rmm.mr.CudaMemoryResource()) -``` +The `CudaMemoryResource` uses the legacy `cudaMalloc`/`cudaFree` APIs directly with no pooling or stream-ordering support. It is generally not recommended. 
**When to use:** -- Simple applications with infrequent allocations -- Debugging memory issues -- Testing or benchmarking baseline performance +- Debugging memory issues (to isolate allocator-related problems) +- Benchmarking baseline allocation overhead ### PoolMemoryResource -The `PoolMemoryResource` maintains a pool of memory allocated from an upstream resource. +The `PoolMemoryResource` maintains a pool of memory allocated from an upstream resource. It provides fast suballocation but requires manual tuning for pool sizes and does not match the performance of `CudaAsyncMemoryResource` in multi-stream workloads. **Advantages:** - Fast suballocation from pre-allocated pool -- Configurable initial and maximum pool sizes +- Configurable initial and maximum pool sizes for explicit memory budgeting **Disadvantages:** -- Can suffer from fragmentation (unlike async MR) -- Pool is not shared across applications -- Requires careful tuning of pool sizes +- **Slower than async MR** in multi-stream workloads due to internal locking +- Can suffer from fragmentation (async MR reduces this with virtual addressing) +- Pool cannot be shared across CUDA applications unless all applications are using RMM +- May require tuning of pool size for optimal performance + +**When to use:** +- Explicit memory budgeting with fixed pool sizes +- Wrapping non-CUDA memory sources (e.g., managed memory) +- Prefer `CudaAsyncMemoryResource` for new code unless you need explicit pool size control + +**Note**: If using `PoolMemoryResource`, prefer wrapping `CudaAsyncMemoryResource` as the upstream rather than `CudaMemoryResource`: **Example:** ```python import rmm pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaMemoryResource(), # upstream resource - initial_pool_size=2**30, # 1 GiB - maximum_pool_size=2**32 # 4 GiB + rmm.mr.CudaAsyncMemoryResource(), # upstream resource + initial_pool_size=2**32, # 4 GiB + maximum_pool_size=2**34 # 16 GiB ) rmm.mr.set_current_device_resource(pool) ``` -**When to 
use:** -- Legacy applications (prefer `CudaAsyncMemoryResource` for new code) -- Specific tuning requirements not met by async MR -- Wrapping non-CUDA memory sources - -**Important**: If using `PoolMemoryResource`, prefer wrapping `CudaAsyncMemoryResource` as the upstream rather than `CudaMemoryResource`: - -```python -# Better: Pool wrapping async MR -pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), - initial_pool_size=2**30 -) -``` - -This combines the benefits of both: fast suballocation from RMM's pool and the driver's virtual addressing capabilities. - ### ManagedMemoryResource -The `ManagedMemoryResource` uses CUDA unified memory (via `cudaMallocManaged`), allowing memory to be accessible from both CPU and GPU. +The `ManagedMemoryResource` allocates [CUDA Unified Memory](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) via `cudaMallocManaged`. Unified Memory creates a single address space accessible from both CPU and GPU, with the CUDA driver migrating pages between processors on demand. This enables [GPU memory oversubscription](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) — allocating more memory than physically available on the GPU — but generally comes with a performance cost. 
**Advantages:** -- Enables working with datasets larger than GPU memory +- Enables GPU memory oversubscription for datasets larger than GPU memory - Automatic page migration between CPU and GPU -- Simplifies memory management for host/device code **Disadvantages:** -- Performance overhead due to page faults and migration -- Requires careful prefetching for optimal performance +- **Slower than device memory** due to page faults and migration overhead, especially in multi-stream workloads (see [Performance Tuning](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html#performance-tuning) in the CUDA Programming Guide) +- Requires prefetching to achieve acceptable performance (see [Managed Memory guide](managed_memory.md)) **Example:** ```python import rmm -# Always combine managed memory with prefetching for acceptable performance. -# Without prefetching, page faults cause significant overhead, especially -# in multi-stream workloads. +# Always combine managed memory with a pool and prefetching for acceptable +# performance. Without prefetching, page faults cause significant overhead, +# especially in multi-stream workloads. 
base = rmm.mr.ManagedMemoryResource() -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) +pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) rmm.mr.set_current_device_resource(prefetch_mr) ``` **When to use:** - Datasets larger than available GPU memory -- Always combine with prefetching strategies (see [Managed Memory guide](managed_memory.md)) +- Always combine with a pool and prefetching (see [Managed Memory guide](managed_memory.md)) ### ArenaMemoryResource @@ -230,12 +205,22 @@ rmm.mr.set_current_device_resource(prefetch) ```python import rmm -# Track allocation statistics +# Track allocation statistics (counts, peak, and total bytes) base = rmm.mr.CudaAsyncMemoryResource() stats = rmm.mr.StatisticsResourceAdaptor(base) rmm.mr.set_current_device_resource(stats) ``` +**Allocation logging:** +```python +import rmm + +# Log every allocation and deallocation to a file +base = rmm.mr.CudaAsyncMemoryResource() +logged = rmm.mr.LoggingResourceAdaptor(base, log_file_name="allocations.csv") +rmm.mr.set_current_device_resource(logged) +``` + ## Multi-Library Applications When using RMM with multiple GPU libraries (e.g., cuDF, PyTorch, CuPy), `CudaAsyncMemoryResource` is especially important because: @@ -259,31 +244,9 @@ torch.cuda.memory.change_current_allocator(rmm_torch_allocator) With this setup, both PyTorch and any other RMM-using code (like cuDF) will share the same driver-managed pool. -## Performance Considerations - -### Async MR vs. 
Pool MR - -In most cases, `CudaAsyncMemoryResource` provides similar or better performance than `PoolMemoryResource`: - -- Both use pooling for fast suballocation -- Async MR uses virtual addressing to avoid fragmentation -- Async MR shares memory across applications - -**When Pool MR might be faster:** -- Very specific allocation patterns that align well with pool design -- Custom upstream resources (not CUDA memory) - -### Multi-stream Applications - -For applications using multiple CUDA streams or threads: - -- `CudaAsyncMemoryResource` is **strongly recommended** -- Pool allocators can create "pipeline bubbles" where streams wait for allocations -- The async MR handles stream synchronization efficiently - ## Best Practices -1. **Set the memory resource before any allocations**: Once memory is allocated, changing the resource can lead to crashes +1. **Set the memory resource before any allocations**: Changing the resource after allocations have been made can lead to crashes. ```python import rmm @@ -292,11 +255,7 @@ For applications using multiple CUDA streams or threads: rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) ``` -2. **Prefer async MR by default**: Unless you have specific requirements, start with `CudaAsyncMemoryResource` - -3. **Use statistics for tuning**: If you need to understand allocation patterns, wrap with `StatisticsResourceAdaptor` - -4. **Don't over-engineer**: Start simple, profile, and optimize only if needed +2. **Use adaptors for diagnostics**: Wrap with `StatisticsResourceAdaptor` to track allocation counts and peak usage, or `LoggingResourceAdaptor` to log every allocation and deallocation (see [Logging and Profiling](logging.md)). 
## See Also From c00a8c863402e162def5fe0f506350644450deef Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 20:31:30 +0000 Subject: [PATCH 06/24] Replace incomplete Available Resources table with links to choosing guide and API reference --- docs/user_guide/guide.md | 51 +--------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index 3cd9d149d..4ea2def07 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -75,56 +75,7 @@ rmm.mr.set_current_device_resource(async_mr) ### Available Resources -RMM provides several memory resource implementations: - -| Resource | Description | Use Case | -|----------|-------------|----------| -| `CudaAsyncMemoryResource` | Uses `cudaMallocAsync` (driver-managed pool) | **Recommended default** | -| `CudaMemoryResource` | Uses `cudaMalloc`/`cudaFree` | Simple, no pooling | -| `ManagedMemoryResource` | Uses `cudaMallocManaged` (unified memory) | Datasets larger than GPU memory | -| `PoolMemoryResource` | Coalescing pool over upstream resource | Custom pool configuration | -| `ArenaMemoryResource` | Size-binned arenas | Mixed allocation sizes | - -`````{tabs} -````{code-tab} c++ -#include -#include -#include -#include - -// CudaMemoryResource - uses cudaMalloc/cudaFree -auto cuda_mr = rmm::mr::cuda_memory_resource{}; - -// CudaAsyncMemoryResource - uses cudaMallocAsync (recommended) -auto async_mr = rmm::mr::cuda_async_memory_resource{}; - -// ManagedMemoryResource - uses cudaMallocManaged -auto managed_mr = rmm::mr::managed_memory_resource{}; - -// PoolMemoryResource - coalescing pool with 1 GiB initial size -rmm::mr::pool_memory_resource pool_mr{cuda_mr, 1ULL << 30}; -```` -````{code-tab} python -import rmm - -# CudaMemoryResource - uses cudaMalloc/cudaFree -cuda_mr = rmm.mr.CudaMemoryResource() - -# CudaAsyncMemoryResource - uses cudaMallocAsync (recommended) -async_mr = rmm.mr.CudaAsyncMemoryResource() - -# 
ManagedMemoryResource - uses cudaMallocManaged -managed_mr = rmm.mr.ManagedMemoryResource() - -# PoolMemoryResource - coalescing pool with 1 GiB initial size -pool_mr = rmm.mr.PoolMemoryResource( - rmm.mr.CudaMemoryResource(), - initial_pool_size=2**30 # 1 GiB -) -```` -````` - -See [Choosing a Memory Resource](choosing_memory_resources.md) for detailed guidance. +RMM provides base memory resources (e.g., `CudaAsyncMemoryResource`, `ManagedMemoryResource`) and resource adaptors (e.g., `PoolMemoryResource`, `StatisticsResourceAdaptor`) that wrap an upstream resource to add functionality. See [Choosing a Memory Resource](choosing_memory_resources.md) for recommendations and the [C++ API Reference](../cpp/memory_resources/index.md) for the full list. ### Per-Device Resources From 91fb555b22902dba54ea06f43a88b05e3a0b657f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 20:32:27 +0000 Subject: [PATCH 07/24] Add Python API reference link to Available Resources section --- docs/user_guide/guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index 4ea2def07..b6c042d21 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -75,7 +75,7 @@ rmm.mr.set_current_device_resource(async_mr) ### Available Resources -RMM provides base memory resources (e.g., `CudaAsyncMemoryResource`, `ManagedMemoryResource`) and resource adaptors (e.g., `PoolMemoryResource`, `StatisticsResourceAdaptor`) that wrap an upstream resource to add functionality. See [Choosing a Memory Resource](choosing_memory_resources.md) for recommendations and the [C++ API Reference](../cpp/memory_resources/index.md) for the full list. +RMM provides base memory resources (e.g., `CudaAsyncMemoryResource`, `ManagedMemoryResource`) and resource adaptors (e.g., `PoolMemoryResource`, `StatisticsResourceAdaptor`) that wrap an upstream resource to add functionality. 
See [Choosing a Memory Resource](choosing_memory_resources.md) for recommendations and the API references ([C++](../cpp/memory_resources/index.md), [Python](../python/index.md)) for the full list. ### Per-Device Resources From 03de6a405a3c8327580d8c0ea0150e9815404f5d Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 20:36:00 +0000 Subject: [PATCH 08/24] Merge Per-Device Resources into Multi-Device Usage, add Python API link --- docs/user_guide/guide.md | 50 ++-------------------------------------- 1 file changed, 2 insertions(+), 48 deletions(-) diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index b6c042d21..b941ebb9b 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -77,34 +77,6 @@ rmm.mr.set_current_device_resource(async_mr) RMM provides base memory resources (e.g., `CudaAsyncMemoryResource`, `ManagedMemoryResource`) and resource adaptors (e.g., `PoolMemoryResource`, `StatisticsResourceAdaptor`) that wrap an upstream resource to add functionality. See [Choosing a Memory Resource](choosing_memory_resources.md) for recommendations and the API references ([C++](../cpp/memory_resources/index.md), [Python](../python/index.md)) for the full list. -### Per-Device Resources - -For multi-GPU systems, each device can have its own resource: - -`````{tabs} -````{code-tab} c++ -#include -#include - -// Get per-device resource ref -rmm::device_async_resource_ref mr0 = rmm::mr::get_per_device_resource_ref(rmm::cuda_device_id{0}); - -// Set per-device resource ref -rmm::mr::cuda_async_memory_resource async_mr; -rmm::mr::set_per_device_resource_ref(rmm::cuda_device_id{0}, async_mr); -```` -````{code-tab} python -import rmm - -# Get per-device resource -mr0 = rmm.mr.get_per_device_resource(0) - -# Set per-device resource -async_mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_per_device_resource(0, async_mr) -```` -````` - ## Containers RMM provides RAII containers that automatically manage device memory lifetime. 
@@ -421,6 +393,8 @@ tensor = torch.zeros(1000, device='cuda') ## Multi-Device Usage +For multi-GPU systems, each device can have its own memory resource. Use `set_per_device_resource_ref` (C++) or `set_per_device_resource` (Python) to configure each device before allocating memory on it: + `````{tabs} ````{code-tab} c++ #include @@ -471,23 +445,3 @@ for device_id in range(num_devices): buffer = rmm.DeviceBuffer(size=1024) # Uses device 0's resource ```` ````` - -## Best Practices - -1. **Use `CudaAsyncMemoryResource` by default** - best performance for most workloads - -2. **Set resources before any allocations** - changing resources after allocation can cause crashes - -3. **Maintain resource lifetime** - resources must outlive any allocations from them - -4. **Use RAII containers** - prefer `device_buffer` over raw pointers - -5. **Profile and measure** - use statistics and logging to understand allocation patterns - -## See Also - -- [Choosing a Memory Resource](choosing_memory_resources.md) -- [Stream-Ordered Allocation](stream_ordered_allocation.md) -- [Managed Memory and Prefetching](managed_memory.md) -- [Pool Allocators](pool_allocators.md) -- [Logging and Profiling](logging.md) From 4b7edb6bb81bb8fe81e7605e4c07bfd98a22cd7f Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 21:01:48 +0000 Subject: [PATCH 09/24] Replace hardcoded system requirements with link to RAPIDS Platform Support --- docs/user_guide/installation.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md index 1b1e4236c..d4add3abc 100644 --- a/docs/user_guide/installation.md +++ b/docs/user_guide/installation.md @@ -4,10 +4,7 @@ This guide covers installing RMM. 
For general RAPIDS installation instructions, ## System Requirements -- **Operating System**: Linux or Windows Subsystem for Linux 2 (WSL2) -- **Python**: 3.10, 3.11, 3.12, or 3.13 -- **CUDA**: 12.2 or later -- **GPU**: Volta architecture or newer (Compute Capability 7.0+) +See [RAPIDS Platform Support](https://docs.rapids.ai/platform-support/) for supported operating systems, CUDA versions, GPU architectures, and Python versions. ## Installing with conda From 6ea60a83a17ba53770bffa92197522e0a671c23b Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 21:11:35 +0000 Subject: [PATCH 10/24] Consolidate duplicate conda environment setup in build-from-source instructions --- docs/user_guide/installation.md | 40 +++++---------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md index d4add3abc..f451681eb 100644 --- a/docs/user_guide/installation.md +++ b/docs/user_guide/installation.md @@ -4,7 +4,7 @@ This guide covers installing RMM. For general RAPIDS installation instructions, ## System Requirements -See [RAPIDS Platform Support](https://docs.rapids.ai/platform-support/) for supported operating systems, CUDA versions, GPU architectures, and Python versions. +See the [RAPIDS Platform Support](https://docs.rapids.ai/platform-support/) for supported operating systems, CUDA versions, GPU architectures, and Python versions for each release. ## Installing with conda @@ -30,7 +30,7 @@ Nightly builds are created from the `main` branch and may contain unreleased fea ## Installing with pip -RMM can also be installed using pip, but requires that CUDA is already installed on your system. +RMM can also be installed using pip. The CUDA driver must already be installed on your system. ```bash pip install rmm-cu13 # For CUDA 13 @@ -42,48 +42,20 @@ pip install rmm-cu12 # For CUDA 12 Building from source gives you the latest features and allows you to customize the build. 
-### Development Environment +### Clone and Create Development Environment -For a complete development environment, you can create an environment with all dependencies: +The conda environment files in `conda/environments/` pin all build prerequisites (compiler, CUDA toolkit, CMake, etc.) to known-good versions: ```bash -# Clone the repository git clone https://github.com/rapidsai/rmm.git cd rmm # Create environment for CUDA 13 -conda env create --name rmm_env --file conda/environments/all_cuda-130_arch-$(uname -m).yaml - -# Activate the environment -conda activate rmm_env -``` - -### Prerequisites - -- **GCC**: 13 or later -- **nvcc**: CUDA 12.2 or later -- **CMake**: 3.30.4 or later - -### Build Steps - -#### Clone the Repository - -```bash -git clone https://github.com/rapidsai/rmm.git -cd rmm -``` - -#### Create Conda Development Environment - -```bash -# For CUDA 13 -conda env create --name rmm_dev --file conda/environments/all_cuda-130_arch-$(uname -m).yaml - -# Activate the environment +conda env create --name rmm_dev --file conda/environments/all_cuda-131_arch-$(uname -m).yaml conda activate rmm_dev ``` -#### Build Using build.sh +### Build Using build.sh RMM provides a convenience script `build.sh` that handles the build process. The `build.sh` script is meant to be used with the developer conda environment above, which installs all prerequisites. 
From fc681f1833c7d1e1dc470f3505ad91c2ac3658fb Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 16:12:19 -0500 Subject: [PATCH 11/24] Use 26.06 --- docs/user_guide/installation.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md index f451681eb..800872a25 100644 --- a/docs/user_guide/installation.md +++ b/docs/user_guide/installation.md @@ -100,7 +100,7 @@ include(CPM) CPMAddPackage( NAME rmm - VERSION 26.02 + VERSION 26.06 GITHUB_REPOSITORY rapidsai/rmm GIT_TAG main SOURCE_SUBDIR cpp From d4d8ea6e748b8f44679ac53eccdd37b1a9a8c363 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 21:17:44 +0000 Subject: [PATCH 12/24] Match scope of C++/Python basic examples, add allocation to CuPy integration --- docs/user_guide/introduction.md | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/docs/user_guide/introduction.md b/docs/user_guide/introduction.md index dfe3580b9..19cac18f9 100644 --- a/docs/user_guide/introduction.md +++ b/docs/user_guide/introduction.md @@ -75,20 +75,13 @@ stream.synchronize(); ```python import rmm -import cupy as cp -# Create a CUDA async memory resource +# Use CUDA async memory pool mr = rmm.mr.CudaAsyncMemoryResource() - -# Set the current device memory resource rmm.mr.set_current_device_resource(mr) -# Allocating device memory uses the current device resource by default +# Allocate device memory buffer = rmm.DeviceBuffer(size=1024) - -# Use the current device resource with CuPy -cp.cuda.set_allocator(rmm.allocators.cupy.rmm_cupy_allocator) -array = cp.zeros(1000) # Now uses RMM for allocation ``` ## Integration with GPU Libraries @@ -121,6 +114,9 @@ from rmm.allocators.cupy import rmm_cupy_allocator mr = rmm.mr.CudaAsyncMemoryResource() rmm.mr.set_current_device_resource(mr) cupy.cuda.set_allocator(rmm_cupy_allocator) + +# CuPy allocations now use RMM +array = cupy.zeros(1000) ``` ### Numba From 
5a5169661c328832497906f96a32e15463455259 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Fri, 3 Apr 2026 21:27:31 +0000 Subject: [PATCH 13/24] Rename base to base_mr in logging.md examples --- docs/user_guide/logging.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/docs/user_guide/logging.md b/docs/user_guide/logging.md index f612a2f6d..8f77bf034 100644 --- a/docs/user_guide/logging.md +++ b/docs/user_guide/logging.md @@ -18,8 +18,8 @@ Enable logging by wrapping your memory resource with `LoggingResourceAdaptor`: import rmm # Wrap the current resource with logging adaptor -base = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="memory_log.csv") +base_mr = rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="memory_log.csv") rmm.mr.set_current_device_resource(log_mr) # Allocations are now logged @@ -431,8 +431,8 @@ Use multiple logging features together: import rmm # Enable memory event logging by wrapping with adaptor -base = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="events.csv") +base_mr = rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="events.csv") rmm.mr.set_current_device_resource(log_mr) # Enable statistics and profiling @@ -496,8 +496,8 @@ int main() { import rmm # Enable detailed logging -base = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="oom_debug.csv") +base_mr = rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="oom_debug.csv") rmm.mr.set_current_device_resource(log_mr) rmm.set_logging_level("debug") rmm.statistics.enable_statistics() From c44bbbf357680d5ef3181d74226a519ae4fde9b8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sat, 11 Apr 2026 22:53:21 +0000 Subject: [PATCH 14/24] Delete pool_allocators.md, add pool-doesn't-shrink note to 
choosing guide --- docs/user_guide/choosing_memory_resources.md | 5 +- docs/user_guide/index.md | 1 - docs/user_guide/pool_allocators.md | 455 ------------------- 3 files changed, 2 insertions(+), 459 deletions(-) delete mode 100644 docs/user_guide/pool_allocators.md diff --git a/docs/user_guide/choosing_memory_resources.md b/docs/user_guide/choosing_memory_resources.md index 56afa67fc..385601ddf 100644 --- a/docs/user_guide/choosing_memory_resources.md +++ b/docs/user_guide/choosing_memory_resources.md @@ -107,14 +107,14 @@ The `PoolMemoryResource` maintains a pool of memory allocated from an upstream r - Wrapping non-CUDA memory sources (e.g., managed memory) - Prefer `CudaAsyncMemoryResource` for new code unless you need explicit pool size control -**Note**: If using `PoolMemoryResource`, prefer wrapping `CudaAsyncMemoryResource` as the upstream rather than `CudaMemoryResource`: +**Note**: `PoolMemoryResource` does not return memory to the upstream resource on deallocation. Once the pool grows, that memory stays allocated until the resource is destroyed. Set `maximum_pool_size` to limit growth. 
**Example:** ```python import rmm pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), # upstream resource + rmm.mr.CudaMemoryResource(), initial_pool_size=2**32, # 4 GiB maximum_pool_size=2**34 # 16 GiB ) @@ -259,6 +259,5 @@ With this setup, both PyTorch and any other RMM-using code (like cuDF) will shar ## See Also -- [Pool Allocators](pool_allocators.md) - Detailed guide on pool and arena allocators - [Managed Memory](managed_memory.md) - Guide to using managed memory and prefetching - [Stream-Ordered Allocation](stream_ordered_allocation.md) - Understanding stream-ordered semantics diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md index bafb4d1e6..801a19d21 100644 --- a/docs/user_guide/index.md +++ b/docs/user_guide/index.md @@ -9,6 +9,5 @@ guide choosing_memory_resources stream_ordered_allocation managed_memory -pool_allocators logging ``` diff --git a/docs/user_guide/pool_allocators.md b/docs/user_guide/pool_allocators.md deleted file mode 100644 index 9aa168029..000000000 --- a/docs/user_guide/pool_allocators.md +++ /dev/null @@ -1,455 +0,0 @@ -# Pool Memory Allocators - -Pool allocators maintain a "pool" of pre-allocated memory to enable fast suballocation without repeatedly calling the underlying memory allocation API. RMM provides several pool-based memory resources, each with different characteristics and use cases. - -## Why Use Pool Allocators? - -Direct allocation (e.g., `cudaMalloc`) has overhead: -- Requires driver synchronization -- Can be slow for small, frequent allocations -- Forces serialization of allocation requests - -Pool allocators address this by: -- Pre-allocating large blocks of memory -- Suballocating from the pool without driver calls -- Reusing freed memory for new allocations - -## RMM's Pool Allocators - -RMM provides three main pool-like allocators: - -1. **`CudaAsyncMemoryResource`**: Driver-managed pool (recommended default) -2. **`PoolMemoryResource`**: RMM-managed coalescing pool -3. 
**`ArenaMemoryResource`**: Size-binned arena pool - -## CudaAsyncMemoryResource (Recommended) - -The `CudaAsyncMemoryResource` uses CUDA's driver-managed memory pool via `cudaMallocAsync`. - -**Advantages:** -- Virtual address space management (avoids fragmentation) -- Shared across all applications using the same GPU -- Stream-ordered allocation -- No manual tuning of pool sizes - -**Example:** -```python -import rmm - -rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) -``` - -**When to use:** Default choice for most applications. See [Choosing a Memory Resource](choosing_memory_resources.md) for details. - -## PoolMemoryResource - -The `PoolMemoryResource` wraps an upstream memory resource and maintains a pool using a coalescing best-fit allocator. - -### Configuration - -```python -import rmm - -pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaMemoryResource(), # or CudaAsyncMemoryResource - initial_pool_size=2**30, # 1 GiB - initial allocation - maximum_pool_size=2**32 # 4 GiB - max the pool can grow to -) -rmm.mr.set_current_device_resource(pool) -``` - -### Parameters - -- **`upstream`**: The underlying memory resource to allocate from - - Use `CudaAsyncMemoryResource()` for best results - - `CudaMemoryResource()` for basic CUDA memory - - Can be any memory resource (including another pool!) - -- **`initial_pool_size`**: Size of the initial allocation - - Larger values reduce early-stage growth overhead - - Should be based on your typical memory usage - - Use string notation: `"1GiB"`, `"512MiB"`, etc. - - Or use powers of 2: `2**30` (1 GiB) - -- **`maximum_pool_size`**: Maximum size the pool can grow to - - Acts as a limit on total GPU memory usage - - `None` means no limit (pool can grow until GPU memory is exhausted) - - Useful for multi-tenant or multi-process scenarios - -### How It Works - -1. **Initial allocation**: On first use, allocates `initial_pool_size` from upstream -2. 
**Suballocation**: Subsequent allocations are served from the pool -3. **Growth**: If pool is exhausted, allocates more from upstream -4. **Coalescing**: Adjacent freed blocks are merged to reduce fragmentation -5. **Shrinking**: The pool does **not** automatically return memory to upstream - -### Best Practices - -#### 1. Choose Appropriate Pool Sizes - -**Initial pool size:** -- Profile your application to understand memory usage -- Set initial size to ~80% of typical peak usage -- Too small: frequent growth overhead -- Too large: wastes memory, longer startup - -**Example:** -```python -import rmm - -# For an application that typically uses 2 GiB -pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), - initial_pool_size=int(1.6 * 2**30), # 1.6 GiB - maximum_pool_size=int(4 * 2**30) # 4 GiB max -) -rmm.mr.set_current_device_resource(pool) -``` - -#### 2. Prefer Async MR as Upstream - -Wrapping `CudaAsyncMemoryResource` combines benefits: - -```python -# Good: Pool wrapping async MR -pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), - initial_pool_size=2**30 -) -``` - -This gives: -- Fast suballocation from RMM pool -- Driver's virtual addressing for fragmentation resistance -- Shared memory pool across libraries - -#### 3. Avoid Double Pooling - -Don't wrap a pool in another pool: - -```python -# Bad: Double pooling -inner_pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), 2**30) -outer_pool = rmm.mr.PoolMemoryResource(inner_pool, 2**30) # Wasteful! -``` - -#### 4. Monitor Fragmentation - -Pool allocators can suffer from fragmentation: - -```python -import rmm - -# Enable statistics to monitor fragmentation -pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource(), 2**30) -stats_mr = rmm.mr.StatisticsResourceAdaptor(pool) -rmm.mr.set_current_device_resource(stats_mr) - -# Run workload -# ... 
- -# Check statistics -stats = rmm.statistics.get_statistics() -print(f"Peak bytes: {stats.peak_bytes}") -print(f"Current bytes: {stats.current_bytes}") -``` - -If `peak_bytes` is much larger than needed, fragmentation may be occurring. - -### Common Issues - -#### Issue 1: Out of Memory (OOM) Before Max Pool Size - -**Symptom:** OOM errors even though allocated memory is less than `maximum_pool_size` - -**Cause:** Fragmentation. The pool has free memory, but not in contiguous blocks. - -**Solutions:** -1. Use `ArenaMemoryResource` instead (better fragmentation characteristics) -2. Use `CudaAsyncMemoryResource` (virtual addressing prevents fragmentation) -3. Adjust allocation patterns to reduce fragmentation - -#### Issue 2: Pool Doesn't Shrink - -**Symptom:** Memory remains allocated even after deallocations - -**Cause:** By design, pools don't return memory to the upstream resource. - -**Solutions:** -1. Destroy and recreate the pool (not recommended for long-running applications) -2. Set appropriate `maximum_pool_size` to limit growth -3. Use `CudaAsyncMemoryResource` if memory should be returned to the system - -## ArenaMemoryResource - -The `ArenaMemoryResource` divides memory into size-binned arenas to reduce fragmentation. - -### Configuration - -```python -import rmm - -arena = rmm.mr.ArenaMemoryResource( - rmm.mr.CudaMemoryResource(), - arena_size=2**28, # 256 MiB per arena - dump_log_on_failure=False -) -rmm.mr.set_current_device_resource(arena) -``` - -### How It Works - -1. Allocates memory in fixed-size "arenas" -2. Each arena is divided into size-binned "superblocks" -3. Allocations are served from the appropriate bin -4. 
Reduces fragmentation by isolating allocation sizes - -### When to Use - -- Applications with diverse allocation sizes -- Long-running services with complex allocation patterns -- When `PoolMemoryResource` suffers from fragmentation - -### Example: Mixed Allocation Sizes - -```python -import rmm - -# Application allocates small (KB), medium (MB), and large (GB) buffers -arena = rmm.mr.ArenaMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), - arena_size=2**28 # 256 MiB arenas -) -rmm.mr.set_current_device_resource(arena) - -# Allocations are binned by size -small = rmm.DeviceBuffer(size=1024) # Small bin -medium = rmm.DeviceBuffer(size=1024**2) # Medium bin -large = rmm.DeviceBuffer(size=1024**3) # Large bin -``` - -## BinningMemoryResource - -The `BinningMemoryResource` routes allocations to different memory resources based on size. - -### Configuration - -```python -import rmm - -# Create resources for different size ranges -small_mr = rmm.mr.FixedSizeMemoryResource( - rmm.mr.CudaMemoryResource(), - block_size=256 # 256 bytes -) -large_mr = rmm.mr.PoolMemoryResource( - rmm.mr.CudaMemoryResource(), - initial_pool_size=2**30 -) - -# Bin allocations by size -binning_mr = rmm.mr.BinningMemoryResource( - large_mr, # Default for allocations not in bins -) - -# Add bins: allocations of size <= threshold go to this resource -binning_mr.add_bin(256, small_mr) # <= 256 bytes -> small_mr -binning_mr.add_bin(1024, None) # <= 1 KiB -> upstream (large_mr) -# Anything > 1 KiB goes to upstream (large_mr) - -rmm.mr.set_current_device_resource(binning_mr) -``` - -### How It Works - -Allocations are routed based on size: -``` -Allocation size <= bin1_threshold -> bin1_resource -Allocation size <= bin2_threshold -> bin2_resource -... -Allocation size > largest_threshold -> upstream -``` - -### Best Practices for Binning - -#### 1. 
Profile Allocation Sizes - -Before configuring bins, understand your allocation patterns: - -```python -import rmm - -# Enable statistics to see allocation sizes -base = rmm.mr.CudaMemoryResource() -stats_mr = rmm.mr.StatisticsResourceAdaptor(base) -rmm.mr.set_current_device_resource(stats_mr) - -# Run workload -# ... - -# Analyze allocation patterns -stats = rmm.statistics.get_statistics() -print(stats) -``` - -#### 2. Optimize for Common Sizes - -Configure bins to match your most common allocation sizes: - -```python -import rmm - -# Based on profiling, we know: -# - Many small allocations (< 1 KiB) -# - Medium allocations (1 KiB - 1 MiB) -# - Large allocations (> 1 MiB) - -# Fixed-size resource for small allocations -small_mr = rmm.mr.FixedSizeMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), - block_size=1024 # 1 KiB -) - -# Pool for medium allocations -medium_mr = rmm.mr.PoolMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), - initial_pool_size=2**28 # 256 MiB -) - -# Pool for large allocations -large_mr = rmm.mr.PoolMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), - initial_pool_size=2**30 # 1 GiB -) - -# Configure binning -binning_mr = rmm.mr.BinningMemoryResource(large_mr) -binning_mr.add_bin(1024, small_mr) # <= 1 KiB -binning_mr.add_bin(1024**2, medium_mr) # <= 1 MiB -# > 1 MiB goes to large_mr - -rmm.mr.set_current_device_resource(binning_mr) -``` - -#### 3. 
Consider Using ArenaMemoryResource Instead - -For many use cases, `ArenaMemoryResource` provides similar benefits with simpler configuration: - -```python -# Simpler: Arena handles size-binning automatically -arena = rmm.mr.ArenaMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), - arena_size=2**28 -) -rmm.mr.set_current_device_resource(arena) -``` - -### Example: PyTorch with Binning - -From issue #1958, here's a practical example for PyTorch workloads: - -```python -import rmm -import torch -from rmm.allocators.torch import rmm_torch_allocator - -# Use managed memory as base (for larger-than-VRAM scenarios) -upstream = rmm.mr.ManagedMemoryResource() - -# Create a pool wrapping managed memory -pool = rmm.mr.PoolMemoryResource( - upstream, - initial_pool_size=2**20, # 1 MiB - maximum_pool_size=int(80 * 2**30) # 80 GiB max -) - -# Fixed-size resource for small allocations -fixed_mr = rmm.mr.FixedSizeMemoryResource(pool, block_size=1024) # 1 KiB blocks - -# Binning resource -binning_mr = rmm.mr.BinningMemoryResource(pool) - -# Add bins for common PyTorch tensor sizes -binning_mr.add_bin(256 * 1024, fixed_mr) # <= 256 KiB -binning_mr.add_bin(512 * 1024, None) # <= 512 KiB -> pool -binning_mr.add_bin(1024 * 1024, None) # <= 1 MiB -> pool -binning_mr.add_bin(2 * 1024 * 1024, None) # <= 2 MiB -> pool -binning_mr.add_bin(4 * 1024 * 1024, None) # <= 4 MiB -> pool -# > 4 MiB goes to pool - -rmm.mr.set_current_device_resource(binning_mr) - -# Configure PyTorch -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) -``` - -**Note:** For production PyTorch workloads, prefer `CudaAsyncMemoryResource` unless you specifically need managed memory for larger-than-VRAM scenarios. 
- -## Choosing Between Pool Allocators - -| Resource | Best For | Fragmentation Handling | Complexity | -|----------|----------|------------------------|------------| -| **CudaAsyncMemoryResource** | General purpose, multi-stream apps | Excellent (virtual addressing) | Low | -| **PoolMemoryResource** | Simple pooling needs | Fair (coalescing) | Low | -| **ArenaMemoryResource** | Diverse allocation sizes | Good (size binning) | Medium | -| **BinningMemoryResource** | Custom size-based routing | Depends on configuration | High | - -## Debugging Pool Issues - -### Enable Logging - -```python -import rmm - -arena = rmm.mr.ArenaMemoryResource( - rmm.mr.CudaMemoryResource(), - arena_size=2**28, - dump_log_on_failure=True # Log on allocation failure -) -rmm.mr.set_current_device_resource(arena) -``` - -### Track Statistics - -```python -import rmm - -pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource(), 2**30) -stats_mr = rmm.mr.StatisticsResourceAdaptor(pool) -rmm.mr.set_current_device_resource(stats_mr) - -# Run workload -buffer = rmm.DeviceBuffer(size=1000000) - -# Check usage -stats = rmm.statistics.get_statistics() -print(f"Current bytes: {stats.current_bytes:,}") -print(f"Peak bytes: {stats.peak_bytes:,}") -print(f"Total allocations: {stats.total_count}") -``` - -### Profile with Nsight Systems - -```bash -nsys profile -o output python your_script.py -``` - -Look for: -- Allocation frequency and sizes -- Memory usage over time -- Fragmentation indicators - -## Summary - -- **For most cases**: Use `CudaAsyncMemoryResource` (driver-managed pool) -- **For simple pooling**: Use `PoolMemoryResource` wrapping `CudaAsyncMemoryResource` -- **For fragmentation issues**: Try `ArenaMemoryResource` -- **For size-based routing**: Use `BinningMemoryResource` (or `ArenaMemoryResource`) -- **Always profile**: Use statistics and Nsight Systems to understand allocation patterns -- **Set appropriate pool sizes**: Too small causes growth overhead, too large wastes memory - 
-## See Also - -- [Choosing a Memory Resource](choosing_memory_resources.md) - High-level guidance on selecting resources -- [Stream-Ordered Allocation](stream_ordered_allocation.md) - Understanding async allocation From 5c9afd8fa8b85318cb8e2bebbd4ff8d6804737b5 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 12:55:00 +0000 Subject: [PATCH 15/24] Fix CudaAsyncMR description and multi-library claims in choosing guide Clarify that CudaAsyncMR creates a custom mempool (not the default device mempool), soften performance and configuration claims, remove false cross-library sharing advantage, and rewrite multi-library section to accurately state that each library must be explicitly configured. --- docs/user_guide/choosing_memory_resources.md | 25 ++++++++++---------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/docs/user_guide/choosing_memory_resources.md b/docs/user_guide/choosing_memory_resources.md index 385601ddf..f876c6724 100644 --- a/docs/user_guide/choosing_memory_resources.md +++ b/docs/user_guide/choosing_memory_resources.md @@ -62,22 +62,23 @@ rmm.mr.set_current_device_resource(mr) ## Memory Resource Considerations -Resources that use the CUDA driver's pool suballocation (`cudaMallocFromPoolAsync`) provide the best performance because the driver can manage virtual address space efficiently, avoid fragmentation, and share memory across libraries without synchronization overhead. +Resources that use the CUDA driver's pool suballocation (`cudaMallocFromPoolAsync`) provide fast allocation performance because the driver can manage virtual address space efficiently and reduce fragmentation. ### CudaAsyncMemoryResource -The `CudaAsyncMemoryResource` uses CUDA's driver-managed memory pool (via `cudaMallocAsync`). This is the **recommended default** for most applications. +The `CudaAsyncMemoryResource` allocates from a custom CUDA memory pool using `cudaMallocFromPoolAsync`. This is the **recommended default** for most applications. 
+ +Note: This creates a *custom* mempool, not the default device mempool. A custom pool is used to enable features like Blackwell decompression engine support and custom release thresholds. **Advantages:** -- **Fastest allocation performance**: Driver-managed suballocation with virtual addressing eliminates fragmentation and minimizes latency -- **Cross-library sharing**: The pool is shared across all libraries on the device, even those not using RMM directly +- **Fast allocation**: Driver-managed pool reuses previously allocated memory +- **Reduced fragmentation**: Virtual addressing allows non-contiguous physical memory to back contiguous allocations, unlike `PoolMemoryResource` which requires contiguous free regions - **Stream-ordered semantics**: Allocations and deallocations are stream-ordered by default, avoiding pipeline stalls in multi-stream workloads -- **Zero configuration**: No pool sizes to tune — the driver manages growth automatically +- **Low configuration**: The driver manages pool growth automatically, though release threshold and maximum size may need tuning in some environments (e.g., when co-existing with libraries that allocate outside the pool) **When to use:** - Default choice for GPU-accelerated applications - Multi-stream or multi-threaded applications -- Applications using multiple GPU libraries (e.g., cuDF + PyTorch) - Most production workloads ### CudaMemoryResource @@ -223,11 +224,9 @@ rmm.mr.set_current_device_resource(logged) ## Multi-Library Applications -When using RMM with multiple GPU libraries (e.g., cuDF, PyTorch, CuPy), `CudaAsyncMemoryResource` is especially important because: +When using RMM with multiple GPU libraries (e.g., cuDF, PyTorch, CuPy), configuring each library to allocate through RMM ensures all allocations flow through the same resource. This avoids memory partitioning where each library holds its own pool, leaving less memory available for the others. -1. 
The driver-managed pool is shared automatically across all libraries -2. You don't need to configure every library to use RMM -3. Memory is not artificially partitioned between libraries +Each library must be explicitly configured to use RMM. RMM provides allocator integrations for common libraries: **Example: RMM + PyTorch** ```python @@ -235,14 +234,14 @@ import rmm import torch from rmm.allocators.torch import rmm_torch_allocator -# Use async MR as the base +# Configure RMM rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) -# Configure PyTorch to use RMM +# Configure PyTorch to allocate through RMM torch.cuda.memory.change_current_allocator(rmm_torch_allocator) ``` -With this setup, both PyTorch and any other RMM-using code (like cuDF) will share the same driver-managed pool. +With this setup, both PyTorch and any other RMM-configured library (like cuDF) allocate from the same resource. ## Best Practices From 32aefaa420a4bd62bfc20868afc7fb9daf4461b0 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 17:26:01 +0000 Subject: [PATCH 16/24] Fix Python stream API: use rmm.pylibrmm.stream.Stream, remove context managers --- docs/user_guide/managed_memory.md | 12 ++-- docs/user_guide/stream_ordered_allocation.md | 76 ++++++++++---------- 2 files changed, 46 insertions(+), 42 deletions(-) diff --git a/docs/user_guide/managed_memory.md b/docs/user_guide/managed_memory.md index 5d4c7529e..a051a6db8 100644 --- a/docs/user_guide/managed_memory.md +++ b/docs/user_guide/managed_memory.md @@ -121,7 +121,9 @@ rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) buffer = rmm.DeviceBuffer(size=1000000) # ... later, just before using on GPU ... -stream = rmm.cuda_stream() +from rmm.pylibrmm.stream import Stream + +stream = Stream() buffer.prefetch(device=0, stream=stream) # Prefetch to device 0 # Launch kernel on the same stream @@ -211,14 +213,14 @@ pool = rmm.mr.PoolMemoryResource(prefetch_mr, initial_pool_size=2**30) # Wrong! 
When manually prefetching, use the same stream as the subsequent kernel: ```python -stream = rmm.cuda_stream() +from rmm.pylibrmm.stream import Stream + +stream = Stream() # Prefetch on stream buffer.prefetch(device=0, stream=stream) -# Use on the same stream -with stream: - # ... operations using buffer ... +# Launch kernel on the same stream to avoid page faults ``` ### 3. Prefetch Size Considerations diff --git a/docs/user_guide/stream_ordered_allocation.md b/docs/user_guide/stream_ordered_allocation.md index 35d84ee16..083096750 100644 --- a/docs/user_guide/stream_ordered_allocation.md +++ b/docs/user_guide/stream_ordered_allocation.md @@ -41,7 +41,9 @@ Python: import rmm # Allocate on a specific stream -stream = rmm.cuda_stream() +from rmm.pylibrmm.stream import Stream + +stream = Stream() buffer = rmm.DeviceBuffer(size=1000, stream=stream) ``` @@ -60,22 +62,16 @@ The following happens: ```python import rmm -import cupy as cp +from rmm.pylibrmm.stream import Stream -stream = rmm.cuda_stream() +stream = Stream() # Allocate memory on the stream buffer = rmm.DeviceBuffer(size=1000, stream=stream) -# Use the pointer immediately in a CuPy operation on the same stream -# This is SAFE - no synchronization needed -with stream: - array = cp.ndarray(shape=(250,), dtype=cp.float32, - memptr=cp.cuda.MemoryPointer( - cp.cuda.UnownedMemory(buffer.ptr, buffer.size, buffer), - 0)) - # Kernel launches on this stream will see the allocated memory - array[:] = 42 +# The pointer (buffer.ptr) is available immediately and can be passed to +# stream-ordered operations (e.g., kernel launches) on the same stream +# without synchronization. ``` The allocation is guaranteed to complete before the kernel that uses it, as long as both are on the same stream. 
@@ -86,8 +82,9 @@ When you deallocate (e.g., a buffer goes out of scope), the deallocation is also ```python import rmm +from rmm.pylibrmm.stream import Stream -stream = rmm.cuda_stream() +stream = Stream() # Allocate buffer = rmm.DeviceBuffer(size=1000, stream=stream) @@ -110,8 +107,9 @@ To guarantee that an allocation has completed (for example, if you need to acces ```python import rmm +from rmm.pylibrmm.stream import Stream -stream = rmm.cuda_stream() +stream = Stream() buffer = rmm.DeviceBuffer(size=1000, stream=stream) # Synchronize to ensure allocation completes @@ -131,20 +129,15 @@ stream.synchronize() - **`CudaMemoryResource`**: NOT stream-ordered (synchronous `cudaMalloc`) - **`ManagedMemoryResource`**: NOT stream-ordered (synchronous `cudaMallocManaged`) -### Example: Pool Wrapping Async MR +### Example ```python import rmm +from rmm.pylibrmm.stream import Stream -# Create a pool that maintains stream-ordered semantics -pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaAsyncMemoryResource(), # stream-ordered upstream - initial_pool_size=2**30 -) -rmm.mr.set_current_device_resource(pool) +rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) -# Allocations from this pool are stream-ordered -stream = rmm.cuda_stream() +stream = Stream() buffer = rmm.DeviceBuffer(size=1000, stream=stream) ``` @@ -154,6 +147,7 @@ buffer = rmm.DeviceBuffer(size=1000, stream=stream) ```python import rmm +from rmm.pylibrmm.stream import Stream from numba import cuda @cuda.jit @@ -162,14 +156,14 @@ def kernel(data, n): if idx < n: data[idx] = idx * 2 -stream = rmm.cuda_stream() +stream = Stream() # Allocate buffer = rmm.DeviceBuffer(size=1000 * 4, stream=stream) # 1000 float32s -# Use immediately -with stream: - kernel[100, 10](cuda.as_cuda_array(buffer).view('float32'), 1000) +# Launch kernel on the same stream +numba_stream = cuda.external_stream(stream.__cuda_stream__()[1]) +kernel[100, 10, numba_stream](cuda.as_cuda_array(buffer).view('float32'), 1000) # 
Synchronize to wait for kernel stream.synchronize() @@ -179,8 +173,9 @@ stream.synchronize() ```python import rmm +from rmm.pylibrmm.stream import Stream -stream = rmm.cuda_stream() +stream = Stream() for i in range(100): # Allocate @@ -200,9 +195,10 @@ for i in range(100): ```python import rmm +from rmm.pylibrmm.stream import Stream # Create multiple streams -streams = [rmm.cuda_stream() for _ in range(4)] +streams = [Stream() for _ in range(4)] # Allocate on different streams independently buffers = [] @@ -232,8 +228,10 @@ for stream in streams: 1. **Don't mix streams**: Using memory allocated on stream A in operations on stream B requires synchronization: ```python - stream_a = rmm.cuda_stream() - stream_b = rmm.cuda_stream() + from rmm.pylibrmm.stream import Stream + + stream_a = Stream() + stream_b = Stream() # Allocate on stream A buffer = rmm.DeviceBuffer(size=1000, stream=stream_a) @@ -241,15 +239,15 @@ for stream in streams: # To use on stream B, synchronize stream A first stream_a.synchronize() - # Now safe to use on stream B - with stream_b: - # ... operations using buffer ... + # Now safe to use buffer in operations on stream B ``` 2. **Don't access from CPU without sync**: Stream-ordered allocations are asynchronous - accessing from CPU requires synchronization: ```python - stream = rmm.cuda_stream() + from rmm.pylibrmm.stream import Stream + + stream = Stream() buffer = rmm.DeviceBuffer(size=1000, stream=stream) # BAD: May access uninitialized memory @@ -263,7 +261,9 @@ for stream in streams: 3. 
**Resource lifetime**: Ensure buffers live until all stream operations complete: ```python - stream = rmm.cuda_stream() + from rmm.pylibrmm.stream import Stream + + stream = Stream() def allocate_and_use(): buffer = rmm.DeviceBuffer(size=1000, stream=stream) @@ -279,7 +279,9 @@ for stream in streams: Fix: Keep buffer alive until synchronization: ```python - stream = rmm.cuda_stream() + from rmm.pylibrmm.stream import Stream + + stream = Stream() buffer = allocate_and_use() # Return the buffer stream.synchronize() # Now safe buffer = None # Explicit cleanup after sync From e97b71348b134edb8514a87c9a74551f9f49c77a Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 18:28:11 +0000 Subject: [PATCH 17/24] Fix stream-ordered allocation factual errors: pointer validity, resource stream-safety, CPU access --- docs/user_guide/stream_ordered_allocation.md | 35 +++++++++----------- 1 file changed, 16 insertions(+), 19 deletions(-) diff --git a/docs/user_guide/stream_ordered_allocation.md b/docs/user_guide/stream_ordered_allocation.md index 083096750..dbcb8a43a 100644 --- a/docs/user_guide/stream_ordered_allocation.md +++ b/docs/user_guide/stream_ordered_allocation.md @@ -6,8 +6,8 @@ RMM provides **stream-ordered memory allocation**, which means that memory alloc In stream-ordered allocation: -1. **Allocations are asynchronous**: Calling `allocate()` schedules the allocation on a stream and returns immediately -2. **Memory is available after stream synchronization**: The allocated memory is guaranteed to be available for use by operations scheduled after the allocation on the same stream +1. **Allocations are asynchronous**: Calling `allocate()` schedules the allocation on a stream and returns a pointer immediately +2. 
**The pointer is usable immediately**: The returned pointer can be stored and used for any operations that are stream-ordered after the allocation (e.g., kernel launches on the same stream, copy operations on the same stream, or operations on another stream that has been synchronized with the allocating stream using CUDA events) 3. **Deallocations are also stream-ordered**: Memory is not actually freed until all prior operations on the stream complete This allows memory operations to be interleaved with kernel launches and other CUDA operations without explicit synchronization. @@ -103,7 +103,7 @@ This ensures that: ### Stream Synchronization -To guarantee that an allocation has completed (for example, if you need to access it from the CPU), synchronize the stream: +The pointer returned by a stream-ordered allocation is available on the CPU immediately — it can be stored, compared, or passed to other API calls without synchronization. Synchronization is only needed before accessing the *contents* of the GPU memory from the CPU: ```python import rmm @@ -112,11 +112,11 @@ from rmm.pylibrmm.stream import Stream stream = Stream() buffer = rmm.DeviceBuffer(size=1000, stream=stream) -# Synchronize to ensure allocation completes -stream.synchronize() +# buffer.ptr is available immediately (no sync needed to use the pointer) +print(f"Pointer: {buffer.ptr}") # OK -# Now safe to do CPU operations with buffer.ptr -# (though accessing GPU memory from CPU usually requires managed memory) +# To read GPU memory contents from the CPU, synchronize first +stream.synchronize() ``` ## Memory Resources and Stream Ordering @@ -124,10 +124,10 @@ stream.synchronize() ### Which Resources Support Stream Ordering? 
- **`CudaAsyncMemoryResource`**: Fully stream-ordered (recommended) -- **`PoolMemoryResource`**: Can be stream-ordered when wrapping a stream-ordered upstream -- **`ArenaMemoryResource`**: Stream-ordered when wrapping a stream-ordered upstream -- **`CudaMemoryResource`**: NOT stream-ordered (synchronous `cudaMalloc`) -- **`ManagedMemoryResource`**: NOT stream-ordered (synchronous `cudaMallocManaged`) +- **`PoolMemoryResource`**: Internally stream-safe — suballocations are mutex-protected, independent of upstream +- **`ArenaMemoryResource`**: Internally stream-safe — uses per-stream arenas, independent of upstream +- **`CudaMemoryResource`**: NOT stream-ordered (`cudaMalloc` is synchronous) +- **`ManagedMemoryResource`**: NOT stream-ordered (`cudaMallocManaged` is synchronous) ### Example @@ -242,7 +242,7 @@ for stream in streams: # Now safe to use buffer in operations on stream B ``` -2. **Don't access from CPU without sync**: Stream-ordered allocations are asynchronous - accessing from CPU requires synchronization: +2. **Synchronize before reading GPU memory from the CPU**: The pointer is available immediately, but the memory contents must not be read from the CPU until all preceding work on the stream has completed: ```python from rmm.pylibrmm.stream import Stream @@ -250,12 +250,9 @@ for stream in streams: stream = Stream() buffer = rmm.DeviceBuffer(size=1000, stream=stream) - # BAD: May access uninitialized memory - # some_function(buffer.ptr) - - # GOOD: Synchronize first + # buffer.ptr is usable immediately (e.g., pass to a kernel) + # Synchronize the stream before reading memory contents from the CPU. stream.synchronize() - some_function(buffer.ptr) ``` 3. 
**Resource lifetime**: Ensure buffers live until all stream operations complete: @@ -318,10 +315,10 @@ stream.synchronize(); ## Summary - Stream-ordered allocation enables asynchronous, non-blocking memory operations -- Allocated pointers can be used immediately in subsequent operations on the same stream +- Allocated pointers are available immediately and can be used in operations on the same stream - Deallocations are also stream-ordered, preventing use-after-free - `CudaAsyncMemoryResource` provides the best stream-ordered allocation support -- Always synchronize before accessing memory from the CPU +- Synchronize before reading GPU memory contents from the CPU - Ensure buffer lifetimes extend until all stream operations complete For more details on choosing memory resources, see [Choosing a Memory Resource](choosing_memory_resources.md). From b999401b75545a816c1f7662ef6152b054a24d00 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 19:09:20 +0000 Subject: [PATCH 18/24] Rework guide.md examples to use explicit resource passing --- docs/user_guide/guide.md | 115 +++++++++++++++++++++------------------ 1 file changed, 62 insertions(+), 53 deletions(-) diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index b941ebb9b..dc46a1a3c 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -7,18 +7,16 @@ This guide covers using RMM in C++ and Python applications, including memory res `````{tabs} ````{code-tab} c++ #include -#include #include #include int main() { - // Use async MR (recommended) + // Create a memory resource rmm::mr::cuda_async_memory_resource async_mr; - rmm::mr::set_current_device_resource_ref(async_mr); - // Allocate device memory + // Allocate device memory using the resource rmm::cuda_stream stream; - rmm::device_buffer buffer(1024, stream.view()); + rmm::device_buffer buffer(1024, stream.view(), async_mr); std::cout << "Allocated " << buffer.size() << " bytes\n"; @@ -28,12 +26,11 @@ int main() { 
````{code-tab} python import rmm -# Use async MR (recommended) +# Create a memory resource mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) -# Allocate device memory -buffer = rmm.DeviceBuffer(size=1024) +# Allocate device memory using the resource +buffer = rmm.DeviceBuffer(size=1024, mr=mr) print(f"Allocated {buffer.size} bytes at {hex(buffer.ptr)}") ```` @@ -43,31 +40,49 @@ print(f"Allocated {buffer.size} bytes at {hex(buffer.ptr)}") Memory resources control how device memory is allocated. RMM provides several resource types optimized for different use cases. -### Setting the Current Resource +### Explicit Resource Passing + +The preferred way to use a memory resource is to pass it explicitly when allocating memory. This makes it clear which resource handles each allocation: + +`````{tabs} +````{code-tab} c++ +rmm::mr::cuda_async_memory_resource async_mr; +rmm::cuda_stream stream; + +// Pass the resource explicitly +rmm::device_buffer buffer(1024, stream.view(), async_mr); +```` +````{code-tab} python +mr = rmm.mr.CudaAsyncMemoryResource() + +# Pass the resource explicitly +buffer = rmm.DeviceBuffer(size=1024, mr=mr) +```` +````` + +### Setting the Current Device Resource -The current device resource is used by default for all allocations: +RMM also provides a global "current device resource" that is used when no resource is passed explicitly: `````{tabs} ````{code-tab} c++ #include #include -// Get current device resource ref -rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); - -// Set current device resource ref rmm::mr::cuda_async_memory_resource async_mr; rmm::mr::set_current_device_resource_ref(async_mr); + +// Allocations that don't specify a resource use the current device resource +rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); ```` ````{code-tab} python import rmm -# Get current device resource -mr = rmm.mr.get_current_device_resource() - -# Set current device 
resource async_mr = rmm.mr.CudaAsyncMemoryResource() rmm.mr.set_current_device_resource(async_mr) + +# Allocations that don't specify a resource use the current device resource +mr = rmm.mr.get_current_device_resource() ```` ````` @@ -184,15 +199,13 @@ Adaptors wrap resources to add functionality like statistics tracking and loggin `````{tabs} ````{code-tab} c++ #include -#include rmm::mr::cuda_async_memory_resource cuda_mr; rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; -rmm::mr::set_current_device_resource_ref(stats_mr); -// Allocate +// Allocate using the statistics-wrapped resource rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view()); +rmm::device_buffer buffer(1024, stream.view(), stats_mr); // Get statistics auto bytes = stats_mr.get_bytes_counter(); @@ -206,10 +219,9 @@ import rmm # Wrap base resource with statistics adaptor cuda_mr = rmm.mr.CudaAsyncMemoryResource() stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) -rmm.mr.set_current_device_resource(stats_mr) -# Allocate -buffer = rmm.DeviceBuffer(size=1024) +# Allocate using the statistics-wrapped resource +buffer = rmm.DeviceBuffer(size=1024, mr=stats_mr) # Get statistics stats = stats_mr.allocation_counts @@ -224,25 +236,22 @@ print(f"Total bytes: {stats.total_bytes}") `````{tabs} ````{code-tab} c++ #include -#include rmm::mr::cuda_async_memory_resource cuda_mr; rmm::mr::logging_resource_adaptor log_mr{cuda_mr, "allocations.csv"}; -rmm::mr::set_current_device_resource_ref(log_mr); -// All allocations logged to CSV -rmm::device_buffer buffer(1024, rmm::cuda_stream_default); +// Allocations through log_mr are logged to CSV +rmm::cuda_stream stream; +rmm::device_buffer buffer(1024, stream.view(), log_mr); ```` ````{code-tab} python import rmm -# Wrap the current resource with logging adaptor -base = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="allocations.csv") -rmm.mr.set_current_device_resource(log_mr) +base_mr = 
rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="allocations.csv") -# All allocations logged to CSV -buffer = rmm.DeviceBuffer(size=1024) +# Allocations through log_mr are logged to CSV +buffer = rmm.DeviceBuffer(size=1024, mr=log_mr) ```` ````` @@ -256,14 +265,13 @@ Adaptors can be stacked to combine functionality: `````{tabs} ````{code-tab} c++ -#include +#include #include #include #include -#include // Base resource -rmm::mr::cuda_async_memory_resource cuda_mr; +rmm::mr::cuda_memory_resource cuda_mr; // Add pool rmm::mr::pool_memory_resource pool_mr{cuda_mr, 1ULL << 30}; @@ -274,14 +282,15 @@ rmm::mr::statistics_resource_adaptor stats_mr{pool_mr}; // Add logging rmm::mr::logging_resource_adaptor log_mr{stats_mr, "log.csv"}; -// Set as current -rmm::mr::set_current_device_resource_ref(log_mr); +// Use log_mr for allocations — all allocations are pooled, tracked, and logged +rmm::cuda_stream stream; +rmm::device_buffer buffer(1024, stream.view(), log_mr); ```` ````{code-tab} python import rmm # Base resource -cuda_mr = rmm.mr.CudaAsyncMemoryResource() +cuda_mr = rmm.mr.CudaMemoryResource() # Add pool pool_mr = rmm.mr.PoolMemoryResource(cuda_mr, initial_pool_size=2**30) @@ -292,8 +301,8 @@ stats_mr = rmm.mr.StatisticsResourceAdaptor(pool_mr) # Add logging log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv") -# Set as current -rmm.mr.set_current_device_resource(log_mr) +# Use log_mr for allocations — all allocations are pooled, tracked, and logged +buffer = rmm.DeviceBuffer(size=1024, mr=log_mr) ```` ````` @@ -303,31 +312,31 @@ Order matters: outer adaptors see all allocations from inner resources. ### Thrust (C++) -Use `rmm::exec_policy` to make Thrust algorithms use RMM for temporary storage: +Use `rmm::exec_policy_nosync` to make Thrust algorithms use RMM for temporary storage. 
Passing the resource explicitly makes it clear which resource handles temporaries: ```cpp #include +#include #include #include #include +rmm::mr::cuda_async_memory_resource mr; rmm::cuda_stream stream; -rmm::device_uvector vec(1000, stream.view()); +rmm::device_uvector vec(1000, stream.view(), mr); // Fill with descending values -thrust::sequence(rmm::exec_policy(stream.view()), +thrust::sequence(rmm::exec_policy_nosync(stream.view(), mr), vec.begin(), vec.end(), vec.size() - 1, -1); -// Sort using current device resource for temporary storage -thrust::sort(rmm::exec_policy(stream.view()), vec.begin(), vec.end()); - -// Or use a specific memory resource for temporary storage -rmm::mr::cuda_async_memory_resource custom_mr; -thrust::sort(rmm::exec_policy(stream.view(), custom_mr), vec.begin(), vec.end()); +// Sort — temporaries allocated from mr +thrust::sort(rmm::exec_policy_nosync(stream.view(), mr), vec.begin(), vec.end()); stream.synchronize(); ``` +`exec_policy_nosync` allows the Thrust backend to skip stream synchronizations that are not required for correctness, improving performance. Stream-ordered applications using RMM should always prefer `exec_policy_nosync`. If stream synchronizations are required, the application should insert them explicitly before reading device data from the host. 
+ ### CuPy (Python) Configure CuPy to use RMM for all device memory allocations: From f767142fc97f9d71a204cf7695fcc61629553a00 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 19:53:41 +0000 Subject: [PATCH 19/24] Restructure logging.md with tabbed code blocks and explicit resource passing --- docs/user_guide/logging.md | 331 ++++++++++++++----------------------- 1 file changed, 128 insertions(+), 203 deletions(-) diff --git a/docs/user_guide/logging.md b/docs/user_guide/logging.md index 8f77bf034..287bf55f4 100644 --- a/docs/user_guide/logging.md +++ b/docs/user_guide/logging.md @@ -10,63 +10,39 @@ Memory event logging writes details of every allocation and deallocation to a CS - Profiling memory usage - Replaying workloads for benchmarking -### Python: Using Memory Event Logging +### Using the Logging Adaptor -Enable logging by wrapping your memory resource with `LoggingResourceAdaptor`: +Wrap any memory resource with the logging adaptor to record allocations and deallocations to a CSV file: -```python -import rmm - -# Wrap the current resource with logging adaptor -base_mr = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="memory_log.csv") -rmm.mr.set_current_device_resource(log_mr) - -# Allocations are now logged -buffer1 = rmm.DeviceBuffer(size=1024) -buffer2 = rmm.DeviceBuffer(size=2048) - -# All allocations/deallocations written to memory_log.csv -``` - -If `log_file_name` is not provided, the environment variable `RMM_LOG_FILE` is used: - -```bash -export RMM_LOG_FILE="allocations.csv" -python script.py -``` - -### C++: Using logging_resource_adaptor - -Wrap any memory resource with `logging_resource_adaptor`: - -```cpp +`````{tabs} +````{code-tab} c++ #include #include -int main() { - // Create upstream resource - auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; +rmm::mr::cuda_async_memory_resource cuda_mr; +rmm::mr::logging_resource_adaptor log_mr{cuda_mr, "memory_log.csv"}; - // Wrap 
with logging adaptor - auto log_mr = rmm::mr::logging_resource_adaptor{cuda_mr, "memory_log.csv"}; - - // Set as current resource - rmm::mr::set_current_device_resource_ref(log_mr); +// Allocations through log_mr are logged to CSV +rmm::cuda_stream stream; +rmm::device_buffer buf1(1024, stream.view(), log_mr); +rmm::device_buffer buf2(2048, stream.view(), log_mr); +```` +````{code-tab} python +import rmm - // All allocations logged to CSV - rmm::cuda_stream stream; - rmm::device_buffer buffer(1024, stream.view()); +base_mr = rmm.mr.CudaAsyncMemoryResource() +log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="memory_log.csv") - return 0; -} -``` +# Allocations through log_mr are logged to CSV +buf1 = rmm.DeviceBuffer(size=1024, mr=log_mr) +buf2 = rmm.DeviceBuffer(size=2048, mr=log_mr) +```` +````` -If filename is not provided, `RMM_LOG_FILE` environment variable is checked: +If no filename is provided, the `RMM_LOG_FILE` environment variable is used: ```bash export RMM_LOG_FILE="allocations.csv" -./my_app ``` ### CSV Log Format @@ -135,9 +111,50 @@ This replays the allocation pattern from the log, useful for: ## Memory Statistics -RMM provides statistics tracking for allocations using `statistics_resource_adaptor`. +RMM provides statistics tracking for allocations using `statistics_resource_adaptor`. The adaptor tracks current, peak, and total allocation bytes and counts. 
-### Python: Enabling Statistics +### Using the Statistics Adaptor + +`````{tabs} +````{code-tab} c++ +#include +#include +#include + +rmm::mr::cuda_async_memory_resource cuda_mr; +rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; + +// Allocate using the statistics-wrapped resource +rmm::cuda_stream stream; +rmm::device_buffer buf1(1024, stream.view(), stats_mr); +rmm::device_buffer buf2(2048, stream.view(), stats_mr); + +// Get statistics +auto bytes = stats_mr.get_bytes_counter(); +auto allocs = stats_mr.get_allocations_counter(); +std::cout << "Current bytes: " << bytes.value << "\n"; +std::cout << "Peak bytes: " << bytes.peak << "\n"; +std::cout << "Allocation count: " << allocs.value << "\n"; +```` +````{code-tab} python +import rmm + +cuda_mr = rmm.mr.CudaAsyncMemoryResource() +stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) + +# Allocate using the statistics-wrapped resource +buf1 = rmm.DeviceBuffer(size=1024, mr=stats_mr) +buf2 = rmm.DeviceBuffer(size=2048, mr=stats_mr) + +# Get statistics +stats = stats_mr.allocation_counts +print(f"Current bytes: {stats.current_bytes}") +print(f"Peak bytes: {stats.peak_bytes}") +print(f"Total allocations: {stats.total_count}") +```` +````` + +Python also provides a convenience API for enabling statistics globally: ```python import rmm @@ -149,53 +166,12 @@ rmm.statistics.enable_statistics() with rmm.statistics.statistics(): buffer = rmm.DeviceBuffer(size=1024) - # Get current statistics stats = rmm.statistics.get_statistics() print(f"Current bytes: {stats.current_bytes}") print(f"Peak bytes: {stats.peak_bytes}") print(f"Total allocations: {stats.total_count}") ``` -Available statistics: - -```python -class Statistics: - current_bytes: int # Currently allocated bytes - current_count: int # Number of active allocations - peak_bytes: int # Peak bytes allocated - peak_count: int # Peak number of allocations - total_bytes: int # Total bytes ever allocated - total_count: int # Total number of allocations -``` - -### 
C++: Using statistics_resource_adaptor - -```cpp -#include -#include -#include - -int main() { - auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; - auto stats_mr = rmm::mr::statistics_resource_adaptor{cuda_mr}; - rmm::mr::set_current_device_resource_ref(stats_mr); - - // Allocate - rmm::cuda_stream stream; - rmm::device_buffer buffer1(1024, stream.view()); - rmm::device_buffer buffer2(2048, stream.view()); - - // Get statistics - auto bytes = stats_mr.get_bytes_counter(); - auto allocs = stats_mr.get_allocations_counter(); - std::cout << "Current bytes: " << bytes.value << "\n"; - std::cout << "Peak bytes: " << bytes.peak << "\n"; - std::cout << "Allocation count: " << allocs.value << "\n"; - - return 0; -} -``` - ### Tracking Memory Growth Monitor memory usage over time: @@ -227,13 +203,11 @@ buffers.extend([rmm.DeviceBuffer(size=2*1024*1024) for _ in range(5)]) checkpoint("After 5x2MB allocations") ``` -## Memory Profiling +## Memory Profiling (Python) The memory profiler tracks allocations by function/code block. -### Python: Using the Profiler - -#### Profiling Functions +### Profiling Functions ```python import rmm @@ -255,23 +229,9 @@ process_data(1000000) print(rmm.statistics.default_profiler_records.report()) ``` -Output: -``` -Memory Profiling -================ - -Legends: - ncalls - number of times the function or code block was called - memory_peak - peak memory allocated in function or code block (in bytes) - memory_total - total memory allocated in function or code block (in bytes) +The report shows the number of calls, peak memory, and total memory for each profiled function. 
-Ordered by: memory_peak - -ncalls memory_peak memory_total filename:lineno(function) - 1 1,000,016 1,000,016 script.py:5(process_data) -``` - -#### Profiling Code Blocks +### Profiling Code Blocks ```python import rmm @@ -290,14 +250,7 @@ with rmm.statistics.profiler(name="processing"): print(rmm.statistics.default_profiler_records.report()) ``` -Output: -``` -ncalls memory_peak memory_total filename:lineno(function) - 1 1,000,016 1,000,016 data loading - 1 1,000,032 1,000,032 processing -``` - -#### Nested Profiling +### Nested Profiling ```python import rmm @@ -315,12 +268,7 @@ with rmm.statistics.profiler(name="outer"): print(rmm.statistics.default_profiler_records.report()) ``` -Output shows both nested and total allocations: -``` -ncalls memory_peak memory_total filename:lineno(function) - 1 3,520 3,520 outer - 1 2,016 2,016 inner -``` +The report includes entries for both the outer and inner profiling scopes. ### Custom Profiler Records @@ -375,35 +323,23 @@ Available levels (increasing verbosity): - `DEBUG` - Detailed debug info - `TRACE` - Very verbose tracing -#### Runtime Log Level (Python) +#### Runtime Log Level Even with verbose logging compiled in, you must enable it at runtime: -```python -import rmm - -# Enable all logging down to TRACE level -rmm.set_logging_level("trace") - -# Now you'll see TRACE and DEBUG messages -``` - -Available Python levels: `"trace"`, `"debug"`, `"info"`, `"warn"`, `"error"`, `"critical"`, `"off"` - -#### Runtime Log Level (C++) - -```cpp +`````{tabs} +````{code-tab} c++ #include -int main() { - // Enable all logging down to TRACE level - rmm::default_logger().set_level(rapids_logger::level_enum::trace); - - // Your code here +rmm::default_logger().set_level(rapids_logger::level_enum::trace); +```` +````{code-tab} python +import rmm - return 0; -} -``` +# Available levels: "trace", "debug", "info", "warn", "error", "critical", "off" +rmm.set_logging_level("trace") +```` +````` ### What Gets Logged @@ -425,68 +361,60 @@ 
Example debug output: ## Combining Logging Features -Use multiple logging features together: +Multiple logging features can be composed together by stacking adaptors: -```python -import rmm +`````{tabs} +````{code-tab} c++ +#include +#include +#include +#include -# Enable memory event logging by wrapping with adaptor -base_mr = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="events.csv") -rmm.mr.set_current_device_resource(log_mr) +// Set debug log level +rmm::default_logger().set_level(rapids_logger::level_enum::debug); -# Enable statistics and profiling -rmm.statistics.enable_statistics() +// Build resource stack: statistics + logging +rmm::mr::cuda_async_memory_resource cuda_mr; +rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; +rmm::mr::logging_resource_adaptor log_mr{stats_mr, "events.csv"}; + +// All allocations through log_mr are tracked and logged +rmm::cuda_stream stream; +rmm::device_buffer buffer(1024, stream.view(), log_mr); + +// Get statistics +auto bytes = stats_mr.get_bytes_counter(); +std::cout << "Peak bytes: " << bytes.peak << "\n"; +```` +````{code-tab} python +import rmm # Set debug log level rmm.set_logging_level("debug") -# Now all logging is active -@rmm.statistics.profiler() -def my_function(): - buffer = rmm.DeviceBuffer(size=1024) - return buffer +# Build resource stack: statistics + logging +cuda_mr = rmm.mr.CudaAsyncMemoryResource() +stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) +log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="events.csv") -my_function() +# All allocations through log_mr are tracked and logged +buffer = rmm.DeviceBuffer(size=1024, mr=log_mr) # Get statistics -stats = rmm.statistics.get_statistics() +stats = stats_mr.allocation_counts print(f"Peak bytes: {stats.peak_bytes}") -# View profiler report -print(rmm.statistics.default_profiler_records.report()) -``` - -C++ equivalent: - -```cpp -#include -#include -#include -#include - -int main() { 
- // Set debug log level - rmm::default_logger().set_level(rapids_logger::level_enum::debug); - - // Build resource stack - auto cuda_mr = rmm::mr::cuda_async_memory_resource{}; - auto stats_mr = rmm::mr::statistics_resource_adaptor{cuda_mr}; - auto log_mr = rmm::mr::logging_resource_adaptor{stats_mr, "events.csv"}; - - rmm::mr::set_current_device_resource_ref(log_mr); - - // Now all logging is active - rmm::cuda_stream stream; - rmm::device_buffer buffer(1024, stream.view()); +# Profiling can also be used alongside event logging +rmm.statistics.enable_statistics() - // Get statistics - auto bytes = stats_mr.get_bytes_counter(); - std::cout << "Peak bytes: " << bytes.peak << "\n"; +@rmm.statistics.profiler() +def my_function(): + return rmm.DeviceBuffer(size=1024, mr=log_mr) - return 0; -} -``` +my_function() +print(rmm.statistics.default_profiler_records.report()) +```` +````` ## Use Cases @@ -497,16 +425,15 @@ import rmm # Enable detailed logging base_mr = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="oom_debug.csv") -rmm.mr.set_current_device_resource(log_mr) +stats_mr = rmm.mr.StatisticsResourceAdaptor(base_mr) +log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="oom_debug.csv") rmm.set_logging_level("debug") -rmm.statistics.enable_statistics() # Run problematic code try: - large_buffer = rmm.DeviceBuffer(size=100 * 2**30) # 100 GiB + large_buffer = rmm.DeviceBuffer(size=100 * 2**30, mr=log_mr) # 100 GiB except MemoryError as e: - stats = rmm.statistics.get_statistics() + stats = stats_mr.allocation_counts print(f"Peak before OOM: {stats.peak_bytes / 2**30:.2f} GiB") print(f"Check oom_debug.csv for allocation history") raise @@ -549,12 +476,10 @@ import rmm import time def benchmark_allocations(mr_name, mr): - rmm.mr.set_current_device_resource(mr) - start = time.time() buffers = [] for _ in range(1000): - buffers.append(rmm.DeviceBuffer(size=1024)) + buffers.append(rmm.DeviceBuffer(size=1024, 
mr=mr)) end = time.time() print(f"{mr_name}: {(end - start) * 1000:.2f} ms for 1000 allocations") @@ -562,8 +487,8 @@ def benchmark_allocations(mr_name, mr): # Compare resources benchmark_allocations("CudaMemoryResource", rmm.mr.CudaMemoryResource()) benchmark_allocations("CudaAsyncMemoryResource", rmm.mr.CudaAsyncMemoryResource()) -pool = rmm.mr.PoolMemoryResource(rmm.mr.CudaAsyncMemoryResource(), initial_pool_size=2**20) -benchmark_allocations("PoolMemoryResource", pool) +pool_mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), initial_pool_size=2**20) +benchmark_allocations("PoolMemoryResource", pool_mr) ``` ## Best Practices From 66296b450b8d9443495dec34e2cb90b2efbecbf8 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 20:33:23 +0000 Subject: [PATCH 20/24] Reduce managed_memory.md: remove CUDA-general content, explicit resource passing, improve tone --- docs/user_guide/managed_memory.md | 326 +++++------------------------- 1 file changed, 48 insertions(+), 278 deletions(-) diff --git a/docs/user_guide/managed_memory.md b/docs/user_guide/managed_memory.md index a051a6db8..60b604ca6 100644 --- a/docs/user_guide/managed_memory.md +++ b/docs/user_guide/managed_memory.md @@ -1,267 +1,98 @@ # Managed Memory and Prefetching -CUDA Managed Memory (also called Unified Memory) allows memory to be accessed from both CPU and GPU, with automatic page migration managed by the CUDA driver. RMM provides `ManagedMemoryResource` to leverage this capability. +CUDA Managed Memory (also called Unified Memory) provides a single address space accessible from both CPU and GPU. The CUDA driver migrates pages between host and device memory on demand, which means you can work with datasets larger than GPU memory or share data between host and device code without explicit copies. -## What is Managed Memory? +RMM's `ManagedMemoryResource` allocates managed memory via `cudaMallocManaged`. 
For background on how Unified Memory works at the driver level, see the [CUDA Programming Guide: Unified Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#unified-memory-programming). -Managed memory creates a single address space accessible from both CPU and GPU: +The main trade-off is performance: on-demand page migration introduces latency from page faults. For production workloads, combining managed memory with prefetching (described below) is essential to avoid this overhead. -- Allocations can be accessed using the same pointer from host or device code -- The CUDA driver automatically migrates pages between CPU and GPU as needed -- Enables working with datasets **larger than GPU memory** +## Prefetching -## When to Use Managed Memory +Without prefetching, the first GPU access to a managed allocation triggers a page fault that stalls execution while the driver migrates data from host memory. If the working set exceeds GPU memory, pages get evicted and re-faulted repeatedly, which can degrade performance severely. The [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#performance-tuning) covers page fault behavior and tuning in detail. -Managed memory is ideal for: +Prefetching migrates data to the GPU ahead of time so that kernels find it already resident. RMM supports two approaches. -1. **Datasets larger than GPU memory**: When your data doesn't fit in VRAM -2. **Prototyping**: Simplifies development by removing explicit memory transfers -3. **CPU-GPU interoperability**: When you need to access the same data from both host and device +### Prefetch on Allocate (Eager) -**Important**: Managed memory has performance implications. Always combine with prefetching for production workloads. - -## Basic Usage - -### Python +`PrefetchResourceAdaptor` wraps another resource and prefetches each allocation to the current device as soon as it's made. 
This works well when data is used on the GPU shortly after allocation, such as when copying or writing to the new allocation: ```python import rmm -# Use managed memory as the default resource -rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) - -# Allocations now use managed memory -buffer = rmm.DeviceBuffer(size=1000000) -``` - -### C++ - -```cpp -#include -#include - -auto managed_mr = rmm::mr::managed_memory_resource{}; -rmm::mr::set_current_device_resource_ref(managed_mr); +managed_mr = rmm.mr.ManagedMemoryResource() +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(managed_mr) -// Allocations use managed memory -rmm::cuda_stream stream; -rmm::device_buffer buffer(1000000, stream.view()); +# This allocation is prefetched to the GPU automatically +buffer = rmm.DeviceBuffer(size=1000000, mr=prefetch_mr) ``` -## Performance Considerations - -### Page Faults and Migration - -When the GPU accesses managed memory that is not resident on the GPU, a **page fault** occurs: - -1. GPU execution pauses -2. The driver migrates the page from CPU to GPU -3. GPU execution resumes - -These page faults can significantly impact performance, especially for: -- First-touch access patterns -- Random memory access -- Large datasets that don't fit in GPU memory - -### The Prefetching Solution - -**Prefetching** explicitly migrates data to the GPU before it's accessed, eliminating page faults. - -## Prefetching Strategies - -There are two main strategies for prefetching: - -### 1. Prefetch on Allocate (Eager Prefetching) - -Automatically prefetch memory to the GPU when it's allocated. This is useful when you know the data will be used on the GPU immediately after allocation. - -**Implementation: Use `PrefetchResourceAdaptor`** +Adding a pool between the managed resource and the prefetch adaptor avoids calling `cudaMallocManaged` on every allocation. 
The pool grabs large chunks of managed memory upfront, and the prefetch adaptor ensures each suballocation is migrated to the GPU before use. Non-allocating adaptors like logging or statistics can safely wrap the prefetch adaptor on the outside: ```python import rmm -# Wrap managed memory with prefetch adaptor -base = rmm.mr.ManagedMemoryResource() -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) -rmm.mr.set_current_device_resource(prefetch_mr) - -# Every allocation is automatically prefetched to the GPU -buffer = rmm.DeviceBuffer(size=1000000) -# Buffer is already on the GPU, no page faults on first access -``` +managed_mr = rmm.mr.ManagedMemoryResource() +pool_mr = rmm.mr.PoolMemoryResource(managed_mr, initial_pool_size=2**30) +prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool_mr) -**With a pool:** +# Logging and statistics don't allocate, so they can go on the outside +stats_mr = rmm.mr.StatisticsResourceAdaptor(prefetch_mr) +log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv") -```python -import rmm - -# Combine managed memory, pool, and prefetching -base = rmm.mr.ManagedMemoryResource() -pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) -rmm.mr.set_current_device_resource(prefetch_mr) +buffer = rmm.DeviceBuffer(size=1000000, mr=log_mr) ``` -**When to use:** -- Allocations are immediately used on the GPU -- You want automatic prefetching without code changes - -### 2. Prefetch on Access (Lazy Prefetching) - -Explicitly prefetch data just before it's used in a kernel. This gives finer control and can optimize for specific access patterns. - -**Implementation: Manual prefetch calls** - -```python -import rmm - -rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) - -# Allocate managed memory (not prefetched yet) -buffer = rmm.DeviceBuffer(size=1000000) +### Prefetch on Access (Lazy) -# ... later, just before using on GPU ... 
-from rmm.pylibrmm.stream import Stream - -stream = Stream() -buffer.prefetch(device=0, stream=stream) # Prefetch to device 0 - -# Launch kernel on the same stream -# ... kernel will not page fault ... -``` - -**In C++:** +When you need control over exactly when data moves to the GPU — for instance because the allocation happens long before the kernel that consumes it — you can prefetch manually: -```cpp +`````{tabs} +````{code-tab} c++ #include #include #include -auto managed_mr = rmm::mr::managed_memory_resource{}; -rmm::mr::set_current_device_resource_ref(managed_mr); - +rmm::mr::managed_memory_resource managed_mr; rmm::cuda_stream stream; -rmm::device_buffer buffer(1000000, stream.view()); +rmm::device_buffer buffer(1000000, stream.view(), managed_mr); -// Prefetch before using +// Prefetch to the current device on this stream rmm::prefetch(buffer.data(), buffer.size(), rmm::get_current_cuda_device(), stream.view()); -// Launch kernel +// Kernel on the same stream finds the data already resident launch_kernel<<>>(buffer.data()); -``` - -**When to use:** -- You need fine-grained control over when data is prefetched -- Access patterns are complex or dynamic -- You're optimizing for specific workload characteristics - -## Practical Example: PyTorch with Larger-Than-VRAM Models - -Here's how to use managed memory with PyTorch to work with models or data larger than GPU memory: - -```python +```` +````{code-tab} python import rmm -import torch -from rmm.allocators.torch import rmm_torch_allocator - -# Use managed memory with prefetching -base = rmm.mr.ManagedMemoryResource() -pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30, maximum_pool_size=2**34) -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) -rmm.mr.set_current_device_resource(prefetch_mr) - -# Configure PyTorch to use RMM -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - -# Now you can work with larger-than-VRAM data -# Example: Large tensor that doesn't fit in VRAM 
-large_tensor = torch.randn(100000, 100000, device='cuda') # ~40 GB - -# Operations will automatically page as needed -result = large_tensor @ large_tensor.T -``` - -**What happens:** -1. RMM allocates managed memory for tensors -2. The prefetch adaptor prefetches to GPU on allocation -3. If memory exceeds GPU capacity, pages migrate between CPU and GPU -4. Performance is better than without prefetching - -## Prefetching Best Practices - -### 1. Prefetch Adaptor Should Be Outermost - -When composing memory resources, always make the prefetch adaptor the outermost layer: - -```python -# Correct: Prefetch is outermost -base = rmm.mr.ManagedMemoryResource() -pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) -stats = rmm.mr.StatisticsResourceAdaptor(pool) -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(stats) # Outermost -rmm.mr.set_current_device_resource(prefetch_mr) - -# Incorrect: Prefetch is not outermost -base = rmm.mr.ManagedMemoryResource() -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) -pool = rmm.mr.PoolMemoryResource(prefetch_mr, initial_pool_size=2**30) # Wrong! -``` - -### 2. Prefetch on the Correct Stream - -When manually prefetching, use the same stream as the subsequent kernel: - -```python from rmm.pylibrmm.stream import Stream -stream = Stream() +managed_mr = rmm.mr.ManagedMemoryResource() +buffer = rmm.DeviceBuffer(size=1000000, mr=managed_mr) -# Prefetch on stream +# Prefetch to device 0 on this stream +stream = Stream() buffer.prefetch(device=0, stream=stream) -# Launch kernel on the same stream to avoid page faults -``` - -### 3. Prefetch Size Considerations +# Kernel on the same stream finds the data already resident +```` +````` -Prefetching is most effective when: -- The prefetch size is large enough to amortize the migration cost -- Data is used shortly after prefetching -- Access patterns are predictable +## Prefetching Best Practices -### 4. 
Profile and Measure +### Stream ordering -Always profile to verify that prefetching improves performance: +When prefetching manually, issue the prefetch on the same stream as the kernel that will consume the data. Stream ordering then guarantees the migration completes before the kernel begins executing. -```python -import rmm -import time - -# Without prefetching -rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) -buffer = rmm.DeviceBuffer(size=10**9) -start = time.time() -# ... run workload ... -print(f"Without prefetch: {time.time() - start:.2f}s") - -# With prefetching -base = rmm.mr.ManagedMemoryResource() -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) -rmm.mr.set_current_device_resource(prefetch_mr) -buffer = rmm.DeviceBuffer(size=10**9) -start = time.time() -# ... run workload ... -print(f"With prefetch: {time.time() - start:.2f}s") -``` +### Profiling -Use [NVIDIA Nsight™ Systems](https://developer.nvidia.com/nsight-systems) to visualize page faults and data migration: +[NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems) can visualize page faults and data migration to help you decide where prefetching is needed: ```bash nsys profile -o output python your_script.py ``` -When using `compute-sanitizer` with managed memory, you may need to enable page fault tracking: +When using `compute-sanitizer` with managed memory, enable page fault tracking: ```bash compute-sanitizer --tool memcheck \ @@ -270,76 +101,15 @@ compute-sanitizer --tool memcheck \ python your_script.py ``` -## Managed Memory Limitations - -### 1. Not Stream-Ordered - -`ManagedMemoryResource` uses `cudaMallocManaged`, which is **synchronous**. Allocations block until complete, unlike stream-ordered resources. - -For better performance in multi-stream applications, use `CudaAsyncMemoryResource` instead. - -### 2.
Performance Overhead - -Even with prefetching, managed memory has overhead compared to explicit memory management: -- Page fault handling -- Driver page migration -- Potential CPU-GPU transfer latency - -For performance-critical code with data that fits in GPU memory, prefer `CudaAsyncMemoryResource`. - -### 3. PCIe Bandwidth Limitation - -If your workload constantly migrates data between CPU and GPU, you're limited by PCIe bandwidth: -- PCIe Gen3 x16: ~12 GB/s -- PCIe Gen4 x16: ~24 GB/s -- PCIe Gen5 x16: ~48 GB/s - -For such workloads, consider: -- Algorithmic changes to reduce data movement -- Using system memory as a staging area -- Streaming data in smaller chunks - -## Comparison: Prefetch Strategies - -| Strategy | Advantages | Disadvantages | Use Case | -|----------|-----------|---------------|----------| -| **PrefetchResourceAdaptor** | Automatic, no code changes | Prefetches everything, even if not needed | General-purpose, allocate-and-use patterns | -| **Manual prefetch** | Fine-grained control, can optimize specific patterns | Requires code changes | Complex access patterns, performance tuning | -| **No prefetching** | Simple | High page fault overhead | Prototyping only, not for production | - -## Multi-GPU Considerations - -When using managed memory with multiple GPUs: - -```python -import rmm -from cuda.bindings import runtime as cudart - -# Set up managed memory on each device -for device_id in [0, 1]: - cudart.cudaSetDevice(device_id) - base = rmm.mr.ManagedMemoryResource() - prefetch_mr = rmm.mr.PrefetchResourceAdaptor(base) - rmm.mr.set_per_device_resource(device_id, prefetch_mr) - -# Prefetch to specific devices -buffer = rmm.DeviceBuffer(size=1000000) -buffer.prefetch(device=0, stream=stream_0) # Prefetch to GPU 0 -buffer.prefetch(device=1, stream=stream_1) # Prefetch to GPU 1 -``` - -## Summary +## Limitations -- Managed memory enables larger-than-VRAM workloads and simplifies CPU-GPU interoperability -- Always use prefetching in production to 
avoid page fault overhead -- Use `PrefetchResourceAdaptor` for automatic, eager prefetching -- Use manual `prefetch()` calls for fine-grained control -- Profile with Nsight Systems to measure page fault overhead -- For best performance with data that fits in VRAM, use `CudaAsyncMemoryResource` instead +- **Not stream-ordered**: `ManagedMemoryResource` uses `cudaMallocManaged`, which is synchronous — the call blocks until the allocation is complete. For multi-stream applications where allocation latency matters, prefer `CudaAsyncMemoryResource`. +- **Migration overhead**: Even with prefetching, managed memory carries overhead from driver-managed page migration. If your data fits comfortably in GPU memory, `CudaAsyncMemoryResource` avoids this cost entirely. +- **Interconnect bandwidth**: Workloads that constantly migrate data between CPU and GPU are bounded by the throughput of the CPU-GPU interconnect (PCIe, NVLink-C2C, etc.). ## See Also - [Choosing a Memory Resource](choosing_memory_resources.md) - When to use managed memory vs. 
other resources - [Stream-Ordered Allocation](stream_ordered_allocation.md) - Understanding asynchronous allocation semantics +- [CUDA Programming Guide: Unified Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#unified-memory-programming) - [NVIDIA Developer Blog: Unified Memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) -- [NVIDIA Developer Blog: Memory Oversubscription](https://developer.nvidia.com/blog/improving-gpu-memory-oversubscription-performance/) From 4f14530836b49b49b1a6463f5d6f643531ec6d6e Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 12 Apr 2026 21:48:53 +0000 Subject: [PATCH 21/24] Reduce stream_ordered_allocation.md: explicit resources, cross-stream event examples, cuda.core streams --- docs/user_guide/stream_ordered_allocation.md | 400 +++++++------------ 1 file changed, 154 insertions(+), 246 deletions(-) diff --git a/docs/user_guide/stream_ordered_allocation.md b/docs/user_guide/stream_ordered_allocation.md index dbcb8a43a..f5f93505f 100644 --- a/docs/user_guide/stream_ordered_allocation.md +++ b/docs/user_guide/stream_ordered_allocation.md @@ -1,324 +1,232 @@ # Stream-Ordered Memory Allocation -RMM provides **stream-ordered memory allocation**, which means that memory allocations and deallocations are ordered with respect to operations on a CUDA stream. This is a fundamental concept for achieving optimal performance in asynchronous CUDA applications. - -## What is Stream-Ordered Allocation? - -In stream-ordered allocation: - -1. **Allocations are asynchronous**: Calling `allocate()` schedules the allocation on a stream and returns a pointer immediately -2. **The pointer is usable immediately**: The returned pointer can be stored and used for any operations that are stream-ordered after the allocation (e.g., kernel launches on the same stream, copy operations on the same stream, or operations on another stream that has been synchronized with the allocating stream using CUDA events) -3. 
**Deallocations are also stream-ordered**: Memory is not actually freed until all prior operations on the stream complete - -This allows memory operations to be interleaved with kernel launches and other CUDA operations without explicit synchronization. - -## Why Stream-Ordered Allocation Matters - -Traditional memory allocation (e.g., `cudaMalloc`) is **synchronous** - it blocks until the allocation completes. This creates bubbles in the execution pipeline where the CPU waits for GPU operations to complete. - -Stream-ordered allocation enables: -- **Overlapping compute and memory operations**: Allocations can be scheduled while kernels are running -- **Reduced synchronization overhead**: No need to synchronize the stream before allocating -- **Better multi-stream performance**: Different streams can allocate independently +RMM containers and memory resources are stream-ordered: allocations and deallocations are enqueued on a CUDA stream rather than blocking the CPU. This lets memory operations overlap with kernel execution and avoids the synchronization cost of `cudaMalloc`/`cudaFree`. For background on CUDA streams and asynchronous execution, see the [CUDA Programming Guide: Asynchronous Concurrent Execution](https://docs.nvidia.com/cuda/cuda-programming-guide/02-basics/asynchronous-execution.html#what-is-asynchronous-concurrent-execution). ## How It Works -Consider the following example of allocating memory from a stream-ordered memory resource. +When you allocate from a stream-ordered resource, the call returns a pointer immediately. The pointer value is available on the CPU right away — you can store it, pass it to kernel launch arguments, or hand it to another API. 
The memory backing the pointer becomes available for GPU operations enqueued on the same stream after the allocation: -C++: - -```cpp +`````{tabs} +````{code-tab} c++ #include #include -rmm::cuda_stream_view stream; -auto buffer = rmm::device_buffer(1000, stream); -``` - -Python: - -```python -import rmm - -# Allocate on a specific stream -from rmm.pylibrmm.stream import Stream - -stream = Stream() -buffer = rmm.DeviceBuffer(size=1000, stream=stream) -``` - -The following happens: - -1. The allocation request is **scheduled** on `stream` -2. The function returns immediately (asynchronous) -3. The memory is **guaranteed to be available** for operations enqueued on `stream` after the allocation -4. You can use `buffer.data()` (the pointer) immediately in subsequent stream operations - -## Key Semantics - -### Safe to Use the Pointer Immediately - -**You can use the returned pointer in stream-ordered operations without synchronization:** +rmm::mr::cuda_async_memory_resource mr; +rmm::cuda_stream stream; +rmm::device_buffer buffer(1000, stream.view(), mr); -```python +// buffer.data() is usable immediately in stream-ordered operations +launch_kernel<<<grid, block, 0, stream.value()>>>(buffer.data()); +```` +````{code-tab} python import rmm from rmm.pylibrmm.stream import Stream +mr = rmm.mr.CudaAsyncMemoryResource() stream = Stream() +buffer = rmm.DeviceBuffer(size=1000, stream=stream, mr=mr) -# Allocate memory on the stream -buffer = rmm.DeviceBuffer(size=1000, stream=stream) - -# The pointer (buffer.ptr) is available immediately and can be passed to -# stream-ordered operations (e.g., kernel launches) on the same stream -# without synchronization. -``` - -The allocation is guaranteed to complete before the kernel that uses it, as long as both are on the same stream.
- -### Deallocations Are Also Stream-Ordered - -When you deallocate (e.g., a buffer goes out of scope), the deallocation is also stream-ordered: - -```python -import rmm -from rmm.pylibrmm.stream import Stream +# buffer.ptr is usable immediately in stream-ordered operations +```` +````` -stream = Stream() +Deallocations are also stream-ordered: when a buffer is destroyed, the deallocation is enqueued on the stream, so the memory is not actually freed until all prior work on that stream completes. -# Allocate -buffer = rmm.DeviceBuffer(size=1000, stream=stream) +## When to Synchronize -# Schedule some work on the stream -# ... kernels using buffer.ptr ... +### Reading results on the host -# When buffer is destroyed, deallocation is scheduled on the stream -# The memory won't actually be freed until all prior work completes -buffer = None # triggers deallocation -``` +The pointer returned by a stream-ordered allocation is a CPU value — you can store it or pass it to other APIs without synchronization. However, the stream must be synchronized before the CPU reads data that was written by GPU operations on that stream. The most common case is a device-to-host copy followed by a sync: -This ensures that: -- Memory is not freed while still in use by a kernel -- Deallocations don't block waiting for kernels to complete +`````{tabs} +````{code-tab} c++ +rmm::mr::cuda_async_memory_resource mr; +rmm::cuda_stream stream; +rmm::device_buffer d_buf(1000 * sizeof(float), stream.view(), mr); -### Stream Synchronization +// Launch kernel that writes to d_buf on stream ... -The pointer returned by a stream-ordered allocation is available on the CPU immediately — it can be stored, compared, or passed to other API calls without synchronization. 
Synchronization is only needed before accessing the *contents* of the GPU memory from the CPU: +// Copy results to host on the same stream +std::vector<float> h_buf(1000); +cudaMemcpyAsync(h_buf.data(), d_buf.data(), d_buf.size(), + cudaMemcpyDeviceToHost, stream.value()); -```python +// Synchronize before reading h_buf on the CPU +stream.synchronize(); +```` +````{code-tab} python import rmm from rmm.pylibrmm.stream import Stream +mr = rmm.mr.CudaAsyncMemoryResource() stream = Stream() -buffer = rmm.DeviceBuffer(size=1000, stream=stream) +d_buf = rmm.DeviceBuffer(size=1000, stream=stream, mr=mr) -# buffer.ptr is available immediately (no sync needed to use the pointer) -print(f"Pointer: {buffer.ptr}") # OK +# ... GPU work writes to d_buf on stream ... -# To read GPU memory contents from the CPU, synchronize first +# Async copy to host on the same stream, then sync before reading +h_buf = bytearray(d_buf.size) +d_buf.copy_to_host(h_buf, stream) stream.synchronize() -``` +```` +````` -## Memory Resources and Stream Ordering +### Cross-stream usage -### Which Resources Support Stream Ordering? +Memory allocated on one stream can only be safely used on a different stream after the allocation is known to have completed. The simplest approach is to synchronize the allocating stream, but that stalls the CPU.
A lighter-weight alternative is to record a CUDA event on the allocating stream and have the consuming stream wait on it: -- **`CudaAsyncMemoryResource`**: Fully stream-ordered (recommended) -- **`PoolMemoryResource`**: Internally stream-safe — suballocations are mutex-protected, independent of upstream -- **`ArenaMemoryResource`**: Internally stream-safe — uses per-stream arenas, independent of upstream -- **`CudaMemoryResource`**: NOT stream-ordered (`cudaMalloc` is synchronous) -- **`ManagedMemoryResource`**: NOT stream-ordered (`cudaMallocManaged` is synchronous) +`````{tabs} +````{code-tab} c++ +#include +#include -### Example +rmm::mr::cuda_async_memory_resource mr; +rmm::cuda_stream stream_a; +rmm::cuda_stream stream_b; -```python -import rmm -from rmm.pylibrmm.stream import Stream - -rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) +rmm::device_buffer buffer(1000, stream_a.view(), mr); -stream = Stream() -buffer = rmm.DeviceBuffer(size=1000, stream=stream) -``` +// Record an event after the allocation on stream_a +cudaEvent_t event; +cudaEventCreateWithFlags(&event, cudaEventDisableTiming); +cudaEventRecord(event, stream_a.value()); -## Common Patterns +// stream_b waits for the event — no CPU synchronization needed +cudaStreamWaitEvent(stream_b.value(), event); -### Pattern 1: Allocate and Use in Kernel +// Now safe to use buffer.data() in operations on stream_b +launch_kernel<<<grid, block, 0, stream_b.value()>>>(buffer.data()); -```python +cudaEventDestroy(event); +```` +````{code-tab} python import rmm from rmm.pylibrmm.stream import Stream -from numba import cuda +from cuda.core import Device -@cuda.jit -def kernel(data, n): - idx = cuda.grid(1) - if idx < n: - data[idx] = idx * 2 +dev = Device() +dev.set_current() -stream = Stream() +mr = rmm.mr.CudaAsyncMemoryResource() +stream_a = dev.create_stream() +stream_b = dev.create_stream() -# Allocate -buffer = rmm.DeviceBuffer(size=1000 * 4, stream=stream) # 1000 float32s +buffer = rmm.DeviceBuffer(size=1000, 
stream=Stream(obj=stream_a), mr=mr) -# Launch kernel on the same stream -numba_stream = cuda.external_stream(stream.__cuda_stream__()[1]) -kernel[100, 10, numba_stream](cuda.as_cuda_array(buffer).view('float32'), 1000) +# Record an event after the allocation on stream_a +alloc_event = dev.create_event(options={"enable_timing": False}) +stream_a.record(alloc_event) -# Synchronize to wait for kernel -stream.synchronize() -``` +# stream_b waits for the event — no CPU synchronization needed +stream_b.wait(alloc_event) -### Pattern 2: Allocate, Compute, Deallocate, Repeat +# Now safe to use buffer.ptr in operations on stream_b +```` +````` -```python -import rmm -from rmm.pylibrmm.stream import Stream +### Buffer lifetime across streams -stream = Stream() +If a buffer is allocated and used on the same stream, deallocation is safe — stream ordering guarantees prior work completes first. The problem arises when a buffer is used on a *different* stream from the one it will be deallocated on. In that case, you need to ensure the consuming stream's work finishes before the buffer is destroyed. The same event pattern works here — record an event on the consuming stream and have the deallocating stream wait on it: -for i in range(100): - # Allocate - buffer = rmm.DeviceBuffer(size=1000000, stream=stream) +`````{tabs} +````{code-tab} c++ +rmm::mr::cuda_async_memory_resource mr; +rmm::cuda_stream stream_a; +rmm::cuda_stream stream_b; - # Use buffer in computations - # ... launch kernels on stream ... 
+rmm::device_buffer buffer(1000, stream_a.view(), mr); - # Deallocate (automatic, or explicitly set buffer = None) - buffer = None +// Make stream_b wait for the allocation on stream_a +cudaEvent_t alloc_event; +cudaEventCreateWithFlags(&alloc_event, cudaEventDisableTiming); +cudaEventRecord(alloc_event, stream_a.value()); +cudaStreamWaitEvent(stream_b.value(), alloc_event); -# All allocations and deallocations are stream-ordered -# No need to synchronize between iterations -``` +// Use buffer on stream_b +launch_kernel<<<grid, block, 0, stream_b.value()>>>(buffer.data()); -### Pattern 3: Multi-Stream Allocation +// Before destroying buffer, make stream_a wait for stream_b's work +cudaEvent_t done_event; +cudaEventCreateWithFlags(&done_event, cudaEventDisableTiming); +cudaEventRecord(done_event, stream_b.value()); +cudaStreamWaitEvent(stream_a.value(), done_event); -```python +// Now safe to destroy buffer — deallocation on stream_a is ordered after the kernel on stream_b +buffer = rmm::device_buffer{}; + +cudaEventDestroy(alloc_event); +cudaEventDestroy(done_event); +```` +````{code-tab} python import rmm from rmm.pylibrmm.stream import Stream +from cuda.core import Device -# Create multiple streams -streams = [Stream() for _ in range(4)] - -# Allocate on different streams independently -buffers = [] -for stream in streams: - # Each allocation is independent - buffer = rmm.DeviceBuffer(size=1000000, stream=stream) - buffers.append(buffer) - - # Launch work on this stream - # ... kernels using buffer ... - -# Synchronize all streams -for stream in streams: - stream.synchronize() -``` - -## Performance Implications - -### Benefits - -1. **Reduced CPU-GPU synchronization**: No blocking on allocations -2. **Better pipeline utilization**: Memory operations overlap with compute -3. **Multi-stream scalability**: Streams can allocate independently - -### Pitfalls to Avoid - -1.
**Don't mix streams**: Using memory allocated on stream A in operations on stream B requires synchronization: - - ```python - from rmm.pylibrmm.stream import Stream +dev = Device() +dev.set_current() - stream_a = Stream() - stream_b = Stream() +mr = rmm.mr.CudaAsyncMemoryResource() +stream_a = dev.create_stream() +stream_b = dev.create_stream() - # Allocate on stream A - buffer = rmm.DeviceBuffer(size=1000, stream=stream_a) +buffer = rmm.DeviceBuffer(size=1000, stream=Stream(obj=stream_a), mr=mr) - # To use on stream B, synchronize stream A first - stream_a.synchronize() +# Make stream_b wait for the allocation on stream_a +alloc_event = dev.create_event(options={"enable_timing": False}) +stream_a.record(alloc_event) +stream_b.wait(alloc_event) - # Now safe to use buffer in operations on stream B - ``` +# Use buffer on stream_b ... -2. **Synchronize before reading GPU memory from the CPU**: The pointer is available immediately, but the memory contents are not readable from the CPU until the stream catches up: +# Before destroying buffer, make stream_a wait for stream_b's work +done_event = dev.create_event(options={"enable_timing": False}) +stream_b.record(done_event) +stream_a.wait(done_event) - ```python - from rmm.pylibrmm.stream import Stream +# Now safe to destroy buffer +del buffer +```` +````` - stream = Stream() - buffer = rmm.DeviceBuffer(size=1000, stream=stream) +## Which Resources Support Stream Ordering? - # buffer.ptr is usable immediately (e.g., pass to a kernel) - # Synchronize the stream before reading memory contents from the CPU. - stream.synchronize() - ``` - -3. **Resource lifetime**: Ensure buffers live until all stream operations complete: - - ```python - from rmm.pylibrmm.stream import Stream - - stream = Stream() - - def allocate_and_use(): - buffer = rmm.DeviceBuffer(size=1000, stream=stream) - # Launch kernel using buffer - kernel[...](buffer.ptr) - # BAD: buffer is deallocated when function returns - # but kernel may still be running! 
- - allocate_and_use() - stream.synchronize() # May crash - buffer already freed - ``` - - Fix: Keep buffer alive until synchronization: - - ```python - from rmm.pylibrmm.stream import Stream +- **`CudaAsyncMemoryResource`**: Fully stream-ordered (recommended) +- **`PoolMemoryResource`**: Internally stream-safe — suballocations are mutex-protected, independent of upstream +- **`ArenaMemoryResource`**: Internally stream-safe — uses per-stream arenas, independent of upstream +- **`CudaMemoryResource`**: NOT stream-ordered (`cudaMalloc` is synchronous) +- **`ManagedMemoryResource`**: NOT stream-ordered (`cudaMallocManaged` is synchronous) - stream = Stream() - buffer = allocate_and_use() # Return the buffer - stream.synchronize() # Now safe - buffer = None # Explicit cleanup after sync - ``` +## Example: Numba Kernel with RMM Stream -## C++ API +This example allocates an RMM buffer and launches a Numba kernel on the same stream, so the allocation is guaranteed to complete before the kernel accesses the memory: -In C++, stream-ordered allocation is the default for most RMM containers: +```python +import rmm +from rmm.pylibrmm.stream import Stream +from cuda.core import Device +from numba import cuda -```cpp -#include -#include -#include -#include +dev = Device() +dev.set_current() -// Set async MR as default -auto async_mr = rmm::mr::cuda_async_memory_resource{}; -rmm::mr::set_current_device_resource_ref(async_mr); +@cuda.jit +def kernel(data, n): + idx = cuda.grid(1) + if idx < n: + data[idx] = idx * 2 -// Create a stream -rmm::cuda_stream stream; +mr = rmm.mr.CudaAsyncMemoryResource() +stream = dev.create_stream() -// Allocate stream-ordered memory -rmm::device_buffer buffer(1000, stream.view()); -rmm::device_uvector vec(1000, stream.view()); +buffer = rmm.DeviceBuffer(size=1000 * 4, stream=Stream(obj=stream), mr=mr) -// Use immediately in stream-ordered operations -launch_kernel<<>>(buffer.data(), vec.data()); +numba_stream = cuda.external_stream(int(stream.handle)) 
+kernel[100, 10, numba_stream](cuda.as_cuda_array(buffer).view('float32'), 1000) -// Synchronize -stream.synchronize(); +stream.sync() ``` -## Summary - -- Stream-ordered allocation enables asynchronous, non-blocking memory operations -- Allocated pointers are available immediately and can be used in operations on the same stream -- Deallocations are also stream-ordered, preventing use-after-free -- `CudaAsyncMemoryResource` provides the best stream-ordered allocation support -- Synchronize before reading GPU memory contents from the CPU -- Ensure buffer lifetimes extend until all stream operations complete +## See Also -For more details on choosing memory resources, see [Choosing a Memory Resource](choosing_memory_resources.md). +- [Choosing a Memory Resource](choosing_memory_resources.md) - Which resources support stream ordering +- [CUDA Programming Guide: Asynchronous Concurrent Execution](https://docs.nvidia.com/cuda/cuda-programming-guide/02-basics/asynchronous-execution.html#what-is-asynchronous-concurrent-execution) From 704cb53cd458cd3c5024c04790d90455b2a93190 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 13 Apr 2026 12:51:50 +0000 Subject: [PATCH 22/24] Polish introduction.md: reduce bold, fix CCCL statement, explicit resources, tabbed example --- docs/user_guide/introduction.md | 138 ++++++-------------------------- 1 file changed, 23 insertions(+), 115 deletions(-) diff --git a/docs/user_guide/introduction.md b/docs/user_guide/introduction.md index 19cac18f9..9ce382e40 100644 --- a/docs/user_guide/introduction.md +++ b/docs/user_guide/introduction.md @@ -1,142 +1,50 @@ # Introduction to RMM -**RMM (RAPIDS Memory Manager)** is a library for allocating and managing GPU memory in C++ and Python. It provides a flexible interface for customizing how device memory is allocated, along with efficient implementations and containers. +RMM (RAPIDS Memory Manager) is a C++ and Python library for GPU memory allocation. 
It provides a common interface — the **memory resource** — that lets you swap allocation strategies at runtime without recompiling, and a set of containers that manage device memory lifetime automatically. -## Purpose +GPU applications often benefit from customizing how memory is allocated. For example, pooling reduces the overhead of frequent small allocations, managed memory enables working with datasets larger than GPU memory, and pinned host memory speeds up CPU-GPU transfers compared to pageable host memory. RMM provides these and other features as interchangeable memory resources, so you can experiment with different strategies and measure their impact on your workload. -Achieving optimal performance in GPU-accelerated applications frequently requires customizing memory allocation strategies. For example: +RMM provides integrations with GPU libraries including cuDF, cuML, cuGraph, PyTorch, and CuPy, enabling uniform memory handling across your application. -- Using **memory pools** to reduce the overhead of dynamic allocation -- Using **managed memory** to work with datasets larger than GPU memory -- Using **pinned host memory** for faster asynchronous CPU ↔ GPU transfers -- Customizing allocation strategies for specific workload patterns +## Key Concepts -RMM provides a unified interface, called a **memory resource**, which is a building block for GPU-accelerated applications. +### Memory Resources -Memory resources provide a **minimal-overhead abstraction** over memory allocation that is **pluggable at runtime**, making it possible to debug, measure performance, and optimize a CUDA application without recompiling. -Memory resources aim to serve the needs of a wide range of applications, from data science and machine learning to high-performance simulation. +A memory resource is an object that knows how to allocate and deallocate memory. 
The choice of resource determines the kind of memory (device, host, managed, pinned) and the allocation strategy (pooled, stream-ordered, etc.). RMM's resources implement the `cuda::mr::resource` concept defined by [CCCL](https://github.com/NVIDIA/cccl) (CUDA Core Compute Libraries), so they interoperate directly with any library that accepts CCCL resources. -RMM's memory resources leverage CUDA features like **stream-ordered** (asynchronous) pipeline parallelism, **managed** memory (also known as unified virtual memory, UVM), and **pinned** memory, making it easier to write complex workflows that optimally use both device and host memory. -The integrations provided in RMM allow memory resources to benefit memory management across libraries frequently used together, such as **PyTorch** and **RAPIDS**. +For most applications, the CUDA async memory resource (`rmm::mr::cuda_async_memory_resource` in C++, `rmm.mr.CudaAsyncMemoryResource` in Python) is a good starting point — it uses a CUDA driver-managed pool and supports stream-ordered (asynchronous) allocations. See [Choosing a Memory Resource](choosing_memory_resources.md) for guidance on when to use other resources. -## Key Features +### Resource Adaptors -RMM is built around three main concepts. +Resource adaptors wrap an existing resource to add functionality. For example, `StatisticsResourceAdaptor` tracks allocation statistics, and `LoggingResourceAdaptor` logs allocations to a CSV file. Adaptors are composable — you can stack several to get combined functionality. See [Logging and Profiling](logging.md) for details. -### 1. Memory Resources +### Containers -Memory resources provide a common abstraction for device memory allocation. -The API of RMM's memory resources is based on the CCCL memory resource design to facilitate interoperability. 
+RMM provides [RAII](https://en.cppreference.com/w/cpp/language/raii.html) containers that manage device memory lifetime, avoiding common problems like memory leaks or improper stream ordering: -The choice of resource determines the underlying type of memory and thus its accessibility from host or device. -For example, the `cuda_async_memory_resource` uses a pool of memory managed by the CUDA driver. -This resource is recommended for most applications, because of its performance and support for asynchrous (stream-ordered) allocations. See [Stream-Ordered Allocation](stream_ordered_allocation.md) for details. -As another example, the `managed_memory_resource` provides unified memory for CPU+GPU, and is recommended for applications exceeding the available GPU memory. +- C++: `device_buffer` (untyped), `device_uvector` (typed, uninitialized), `device_scalar` (single element) +- Python: `DeviceBuffer` (untyped) -See [Choosing a Memory Resource](choosing_memory_resources.md) for guidance on the available memory resources, performance considerations, and how they fit into efficient CUDA application design strategies. -[NVIDIA Nsight™ Systems](https://developer.nvidia.com/nsight-systems) can be used to profile memory resource performance. - -### 2. Resource Adaptors - -Resource adaptors wrap and add functionality to existing resources. -For example, the `statistics_resource_adaptor` can be used to track allocation statistics. -The `logging_resource_adaptor` logs allocations to a CSV file. -Adaptors are composable - wrap multiple adaptors for combined functionality. - -### 3. Containers - -RMM provides [RAII](https://en.cppreference.com/w/cpp/language/raii.html) container classes that manage memory lifetime. -Using these containers avoids common problems with performing raw allocation such as memory leaks or improper stream ordering. 
-- `device_buffer`: Untyped device memory -- `device_uvector`: Typed, uninitialized vector of device memory (trivially copyable types) -- `device_scalar`: Single typed element - -All containers use stream-ordered allocation and work with any memory resource. +All containers accept a stream and a memory resource, and use stream-ordered allocation. ## Basic Example -### C++ - -```cpp +`````{tabs} +````{code-tab} c++ #include #include -// Use CUDA async memory pool -auto async_mr = rmm::mr::cuda_async_memory_resource{}; -rmm::mr::set_current_device_resource_ref(async_mr); - -// Allocate device memory asynchronously +rmm::mr::cuda_async_memory_resource mr; rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view()); -stream.synchronize(); -``` - -### Python - -```python -import rmm - -# Use CUDA async memory pool -mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) - -# Allocate device memory -buffer = rmm.DeviceBuffer(size=1024) -``` - -## Integration with GPU Libraries - -RMM integrates seamlessly with popular GPU libraries: - -### PyTorch - -Set the PyTorch allocator to use the current device resource: - -```python -import rmm -import torch -from rmm.allocators.torch import rmm_torch_allocator - -mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) -``` - -### CuPy - -Set the CuPy allocator to use the current device resource: - -```python -import rmm -import cupy -from rmm.allocators.cupy import rmm_cupy_allocator - -mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) -cupy.cuda.set_allocator(rmm_cupy_allocator) - -# CuPy allocations now use RMM -array = cupy.zeros(1000) -``` - -### Numba - -When launching a script: -```bash -NUMBA_CUDA_MEMORY_MANAGER=rmm.allocators.numba python script.py -``` - -Or from Python: - -```python +rmm::device_buffer buffer(1024, stream.view(), mr); +```` +````{code-tab} python import 
rmm -from numba import cuda -from rmm.allocators.numba import RMMNumbaManager mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) -cuda.set_memory_manager(RMMNumbaManager) -``` +buffer = rmm.DeviceBuffer(size=1024, mr=mr) +```` +````` ## Resources and Support From 0a634e0f93aeaf2d9ad176ae8f6810888307a7dc Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Mon, 13 Apr 2026 15:51:32 +0000 Subject: [PATCH 23/24] Add Sphinx cross-references, explicit resource passing, and minor fixes across user guide - Add {cpp:class} and {py:class} Sphinx cross-references to exact classes throughout all user guide files (C++ listed first) - Use rmm::align_down in pool size calculation - Use explicit resource passing in choosing guide defaults and compositions - Readable section headers in choosing guide (e.g. CUDA Async Memory Resource) - Reorder and revise choosing guide: remove Arena, improve Pool and Managed descriptions with Features/Caution structure - Add cuda-version explanation and nightly stability note in installation.md - Rewrite logging.md intro with specific adaptor cross-references --- docs/user_guide/choosing_memory_resources.md | 133 +++++++------------ docs/user_guide/guide.md | 16 +-- docs/user_guide/installation.md | 4 +- docs/user_guide/introduction.md | 10 +- docs/user_guide/logging.md | 6 +- docs/user_guide/managed_memory.md | 4 +- docs/user_guide/stream_ordered_allocation.md | 12 +- 7 files changed, 78 insertions(+), 107 deletions(-) diff --git a/docs/user_guide/choosing_memory_resources.md b/docs/user_guide/choosing_memory_resources.md index f876c6724..64c27611b 100644 --- a/docs/user_guide/choosing_memory_resources.md +++ b/docs/user_guide/choosing_memory_resources.md @@ -2,7 +2,7 @@ One of the most common questions when using RMM is: "Which memory resource should I use?" -This guide recommends memory resources based on optimal allocation performance for common workloads. 
+This guide recommends memory resources based on optimal allocation performance for common workloads. See the API references for the full list of available resources. ## Recommended Defaults @@ -11,16 +11,17 @@ For most applications, the CUDA async memory pool provides the best allocation p `````{tabs} ````{code-tab} c++ #include -#include +#include rmm::mr::cuda_async_memory_resource mr; -rmm::mr::set_current_device_resource_ref(mr); +rmm::cuda_stream stream; +rmm::device_buffer buffer(1024, stream.view(), mr); ```` ````{code-tab} python import rmm mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) +buffer = rmm.DeviceBuffer(size=1024, mr=mr) ```` ````` @@ -31,17 +32,16 @@ For applications that require GPU memory oversubscription (allocating more memor #include #include #include -#include +#include #include // Use 80% of GPU memory, rounded down to nearest 256 bytes auto [free_memory, total_memory] = rmm::available_device_memory(); -std::size_t pool_size = (static_cast(total_memory * 0.8) / 256) * 256; +auto pool_size = rmm::align_down(static_cast(total_memory * 0.8), 256); rmm::mr::managed_memory_resource managed_mr; rmm::mr::pool_memory_resource pool_mr{managed_mr, pool_size}; rmm::mr::prefetch_resource_adaptor prefetch_mr{pool_mr}; -rmm::mr::set_current_device_resource_ref(prefetch_mr); ```` ````{code-tab} python import rmm @@ -56,7 +56,6 @@ mr = rmm.mr.PrefetchResourceAdaptor( initial_pool_size=pool_size, ) ) -rmm.mr.set_current_device_resource(mr) ```` ````` @@ -64,13 +63,13 @@ rmm.mr.set_current_device_resource(mr) Resources that use the CUDA driver's pool suballocation (`cudaMallocFromPoolAsync`) provide fast allocation performance because the driver can manage virtual address space efficiently and reduce fragmentation. -### CudaAsyncMemoryResource +### CUDA Async Memory Resource -The `CudaAsyncMemoryResource` allocates from a custom CUDA memory pool using `cudaMallocFromPoolAsync`. 
This is the **recommended default** for most applications. +{cpp:class}`~rmm::mr::cuda_async_memory_resource` (C++) / {py:class}`~rmm.mr.CudaAsyncMemoryResource` (Python) allocates from a custom CUDA memory pool using `cudaMallocFromPoolAsync`. This is the **recommended default** for most applications. Note: This creates a *custom* mempool, not the default device mempool. A custom pool is used to enable features like Blackwell decompression engine support and custom release thresholds. -**Advantages:** +**Features:** - **Fast allocation**: Driver-managed pool reuses previously allocated memory - **Reduced fragmentation**: Virtual addressing allows non-contiguous physical memory to back contiguous allocations, unlike `PoolMemoryResource` which requires contiguous free regions - **Stream-ordered semantics**: Allocations and deallocations are stream-ordered by default, avoiding pipeline stalls in multi-stream workloads @@ -81,105 +80,71 @@ Note: This creates a *custom* mempool, not the default device mempool. A custom - Multi-stream or multi-threaded applications - Most production workloads -### CudaMemoryResource +### CUDA Memory Resource -The `CudaMemoryResource` uses the legacy `cudaMalloc`/`cudaFree` APIs directly with no pooling or stream-ordering support. It is generally not recommended. +{cpp:class}`~rmm::mr::cuda_memory_resource` (C++) / {py:class}`~rmm.mr.CudaMemoryResource` (Python) uses the legacy `cudaMalloc`/`cudaFree` APIs directly with no pooling or stream-ordering support. It is generally not recommended. **When to use:** - Debugging memory issues (to isolate allocator-related problems) - Benchmarking baseline allocation overhead -### PoolMemoryResource +### Managed Memory Resource -The `PoolMemoryResource` maintains a pool of memory allocated from an upstream resource. It provides fast suballocation but requires manual tuning for pool sizes and does not match the performance of `CudaAsyncMemoryResource` in multi-stream workloads. 
+{cpp:class}`~rmm::mr::managed_memory_resource` (C++) / {py:class}`~rmm.mr.ManagedMemoryResource` (Python) allocates [CUDA Unified Memory](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) via `cudaMallocManaged`. Unified Memory creates a single address space accessible from both CPU and GPU, with the CUDA driver migrating pages between processors on demand. This enables [GPU memory oversubscription](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) — allocating more memory than physically available on the GPU — but generally comes with a performance cost. -**Advantages:** -- Fast suballocation from pre-allocated pool -- Configurable initial and maximum pool sizes for explicit memory budgeting - -**Disadvantages:** -- **Slower than async MR** in multi-stream workloads due to internal locking -- Can suffer from fragmentation (async MR reduces this with virtual addressing) -- Pool cannot be shared across CUDA applications unless all applications are using RMM -- May require tuning of pool size for optimal performance - -**When to use:** -- Explicit memory budgeting with fixed pool sizes -- Wrapping non-CUDA memory sources (e.g., managed memory) -- Prefer `CudaAsyncMemoryResource` for new code unless you need explicit pool size control - -**Note**: `PoolMemoryResource` does not return memory to the upstream resource on deallocation. Once the pool grows, that memory stays allocated until the resource is destroyed. Set `maximum_pool_size` to limit growth. - -**Example:** -```python -import rmm - -pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaMemoryResource(), - initial_pool_size=2**32, # 4 GiB - maximum_pool_size=2**34 # 16 GiB -) -rmm.mr.set_current_device_resource(pool) -``` - -### ManagedMemoryResource - -The `ManagedMemoryResource` allocates [CUDA Unified Memory](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) via `cudaMallocManaged`. 
Unified Memory creates a single address space accessible from both CPU and GPU, with the CUDA driver migrating pages between processors on demand. This enables [GPU memory oversubscription](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) — allocating more memory than physically available on the GPU — but generally comes with a performance cost. - -**Advantages:** +**Features:** - Enables GPU memory oversubscription for datasets larger than GPU memory - Automatic page migration between CPU and GPU -**Disadvantages:** -- **Slower than device memory** due to page faults and migration overhead, especially in multi-stream workloads (see [Performance Tuning](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html#performance-tuning) in the CUDA Programming Guide) -- Requires prefetching to achieve acceptable performance (see [Managed Memory guide](managed_memory.md)) +**Caution:** +By default, managed memory adds overhead for page faults and migration (see [Performance Tuning](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html#performance-tuning) in the CUDA Programming Guide). See the [Managed Memory guide](managed_memory.md) for a recommended solution with a pool and prefetching adaptor. + +**When to use:** +- Datasets larger than available GPU memory +- Typically combined with a pool and prefetching (see [Managed Memory guide](managed_memory.md)) **Example:** ```python import rmm -# Always combine managed memory with a pool and prefetching for acceptable -# performance. Without prefetching, page faults cause significant overhead, -# especially in multi-stream workloads. +# Combine managed memory with a pool and prefetching for performance. +# Without prefetching, page faults cause significant overhead. 
base = rmm.mr.ManagedMemoryResource() pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) -rmm.mr.set_current_device_resource(prefetch_mr) +buffer = rmm.DeviceBuffer(size=1024, mr=prefetch_mr) ``` -**When to use:** -- Datasets larger than available GPU memory -- Always combine with a pool and prefetching (see [Managed Memory guide](managed_memory.md)) +### Pool Memory Resource + +{cpp:class}`~rmm::mr::pool_memory_resource` (C++) / {py:class}`~rmm.mr.PoolMemoryResource` (Python) maintains a pool of memory allocated from an upstream resource, providing fast suballocation. -### ArenaMemoryResource +**Features:** +- Fast suballocation from a pre-allocated pool +- Configurable initial and maximum pool sizes for explicit memory budgeting -The `ArenaMemoryResource` divides a large allocation into size-binned arenas, reducing fragmentation. +**When to use:** +- When allocations from the upstream resource are slow. For example, initial allocations of managed memory can be slow, and the pool resource amortizes that initial cost over the lifetime of the pool (see the [Managed Memory guide](managed_memory.md)). -**Advantages:** -- Better fragmentation characteristics than basic pool -- Good for mixed allocation sizes -- Predictable performance +**Caution:** +There are pool implementations in both RMM (this memory resource) and in the CUDA driver (leveraging `cudaMallocFromPoolAsync` and `cudaMemPool_t`). +RMM's pool implementation does not handle fragmentation as well as the CUDA driver's pool. +Also, RMM's pool can be slower than the CUDA driver's pool implementation in heavy multi-stream workloads, depending on application details. -**Disadvantages:** -- More complex configuration -- May waste memory if bin sizes don't match allocation patterns +**Note**: `PoolMemoryResource` does not return memory to the upstream resource on deallocation. Once the pool grows, that memory stays allocated until the resource is destroyed. 
Set `maximum_pool_size` to limit growth. **Example:** ```python import rmm -arena = rmm.mr.ArenaMemoryResource( +pool = rmm.mr.PoolMemoryResource( rmm.mr.CudaMemoryResource(), - arena_size=2**28 # 256 MiB arenas + initial_pool_size=2**32, # 4 GiB + maximum_pool_size=2**34 # 16 GiB ) -rmm.mr.set_current_device_resource(arena) +buffer = rmm.DeviceBuffer(size=1024, mr=pool) ``` -**When to use:** -- Applications with diverse allocation sizes -- Long-running services with complex allocation patterns -- When fragmentation is observed with pool allocators - ## Composing Memory Resources Memory resources can be composed (wrapped) to combine their properties. The general pattern is: @@ -199,27 +164,27 @@ import rmm base = rmm.mr.ManagedMemoryResource() pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) prefetch = rmm.mr.PrefetchResourceAdaptor(pool) -rmm.mr.set_current_device_resource(prefetch) +buffer = rmm.DeviceBuffer(size=1024, mr=prefetch) ``` -**Statistics tracking:** +**Statistics tracking** (see [Logging and Profiling](logging.md)): ```python import rmm # Track allocation statistics (counts, peak, and total bytes) base = rmm.mr.CudaAsyncMemoryResource() -stats = rmm.mr.StatisticsResourceAdaptor(base) -rmm.mr.set_current_device_resource(stats) +stats_mr = rmm.mr.StatisticsResourceAdaptor(base) +buffer = rmm.DeviceBuffer(size=1024, mr=stats_mr) ``` -**Allocation logging:** +**Allocation logging** (see [Logging and Profiling](logging.md)): ```python import rmm # Log every allocation and deallocation to a file base = rmm.mr.CudaAsyncMemoryResource() -logged = rmm.mr.LoggingResourceAdaptor(base, log_file_name="allocations.csv") -rmm.mr.set_current_device_resource(logged) +logging_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="allocations.csv") +buffer = rmm.DeviceBuffer(size=1024, mr=logging_mr) ``` ## Multi-Library Applications @@ -254,7 +219,7 @@ With this setup, both PyTorch and any other RMM-configured library (like cuDF) a 
rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) ``` -2. **Use adaptors for diagnostics**: Wrap with `StatisticsResourceAdaptor` to track allocation counts and peak usage, or `LoggingResourceAdaptor` to log every allocation and deallocation (see [Logging and Profiling](logging.md)). +2. **Use adaptors for diagnostics**: Wrap with {cpp:class}`~rmm::mr::statistics_resource_adaptor` (C++) / {py:class}`~rmm.mr.StatisticsResourceAdaptor` (Python) to track allocation counts and peak usage, or {cpp:class}`~rmm::mr::logging_resource_adaptor` (C++) / {py:class}`~rmm.mr.LoggingResourceAdaptor` (Python) to log every allocation and deallocation (see [Logging and Profiling](logging.md)). ## See Also diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index dc46a1a3c..110f4665f 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -90,7 +90,7 @@ mr = rmm.mr.get_current_device_resource() ### Available Resources -RMM provides base memory resources (e.g., `CudaAsyncMemoryResource`, `ManagedMemoryResource`) and resource adaptors (e.g., `PoolMemoryResource`, `StatisticsResourceAdaptor`) that wrap an upstream resource to add functionality. See [Choosing a Memory Resource](choosing_memory_resources.md) for recommendations and the API references ([C++](../cpp/memory_resources/index.md), [Python](../python/index.md)) for the full list. +RMM provides base memory resources (e.g., {py:class}`~rmm.mr.CudaAsyncMemoryResource`, {py:class}`~rmm.mr.ManagedMemoryResource`) and resource adaptors (e.g., {py:class}`~rmm.mr.PoolMemoryResource`, {py:class}`~rmm.mr.StatisticsResourceAdaptor`) that wrap an upstream resource to add functionality. See [Choosing a Memory Resource](choosing_memory_resources.md) for recommendations and the API references ([C++ memory resources](../cpp/memory_resources/memory_resources.md), [C++ adaptors](../cpp/memory_resources/memory_resource_adaptors.md), [Python](../python/mr.md)) for the full list. 
## Containers @@ -98,7 +98,7 @@ RMM provides RAII containers that automatically manage device memory lifetime. ### DeviceBuffer -Untyped, uninitialized device memory: +Untyped, uninitialized device memory ({cpp:class}`C++ `, {py:class}`Python `): `````{tabs} ````{code-tab} c++ @@ -139,7 +139,7 @@ buffer2 = buffer.copy() ### device_uvector (C++) -Typed, uninitialized device vector for trivially copyable types: +Typed, uninitialized device vector for trivially copyable types ({cpp:class}`API `): ```cpp #include @@ -167,7 +167,7 @@ vec.resize(200, stream.view()); ### device_scalar (C++) -Single typed element with host-device transfer convenience: +Single typed element with host-device transfer convenience ({cpp:class}`API `): ```cpp #include @@ -312,7 +312,7 @@ Order matters: outer adaptors see all allocations from inner resources. ### Thrust (C++) -Use `rmm::exec_policy_nosync` to make Thrust algorithms use RMM for temporary storage. Passing the resource explicitly makes it clear which resource handles temporaries: +Use {cpp:class}`rmm::exec_policy_nosync` to make Thrust algorithms use RMM for temporary storage. 
Passing the resource explicitly makes it clear which resource handles temporaries: ```cpp #include @@ -339,7 +339,7 @@ stream.synchronize(); ### CuPy (Python) -Configure CuPy to use RMM for all device memory allocations: +Configure CuPy to use RMM for all device memory allocations ({py:func}`API `): ```python import rmm @@ -359,7 +359,7 @@ array = cp.zeros(1000) ### Numba (Python) -Configure Numba to use RMM for device memory in CUDA JIT-compiled functions: +Configure Numba to use RMM for device memory in CUDA JIT-compiled functions ({py:class}`API `): ```python from numba import cuda @@ -382,7 +382,7 @@ NUMBA_CUDA_MEMORY_MANAGER=rmm.allocators.numba python script.py ### PyTorch (Python) -Configure PyTorch to use RMM for CUDA tensor allocations: +Configure PyTorch to use RMM for CUDA tensor allocations ({py:func}`API `): ```python import rmm diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md index 800872a25..61c275dd9 100644 --- a/docs/user_guide/installation.md +++ b/docs/user_guide/installation.md @@ -18,6 +18,8 @@ Install the latest stable release: conda install -c rapidsai -c conda-forge rmm cuda-version=13 ``` +The `cuda-version` metapackage selects the CUDA Toolkit major version, and requires a CUDA driver to be installed from that major version or newer. + ### Nightly Builds For the latest development version, install from the nightly channel: @@ -26,7 +28,7 @@ For the latest development version, install from the nightly channel: conda install -c rapidsai-nightly -c conda-forge rmm cuda-version=13 ``` -Nightly builds are created from the `main` branch and may contain unreleased features or bug fixes. +Nightly builds are created from the `main` branch and may contain unreleased features or bug fixes. They provide no stability guarantees. 
## Installing with pip diff --git a/docs/user_guide/introduction.md b/docs/user_guide/introduction.md index 9ce382e40..88b0b4978 100644 --- a/docs/user_guide/introduction.md +++ b/docs/user_guide/introduction.md @@ -10,20 +10,20 @@ RMM provides integrations with GPU libraries including cuDF, cuML, cuGraph, PyTo ### Memory Resources -A memory resource is an object that knows how to allocate and deallocate memory. The choice of resource determines the kind of memory (device, host, managed, pinned) and the allocation strategy (pooled, stream-ordered, etc.). RMM's resources implement the `cuda::mr::resource` concept defined by [CCCL](https://github.com/NVIDIA/cccl) (CUDA Core Compute Libraries), so they interoperate directly with any library that accepts CCCL resources. +A memory resource is an object that knows how to allocate and deallocate memory. The choice of resource determines the kind of memory (device, host, managed, pinned) and the allocation strategy (pooled, stream-ordered, etc.). RMM's resources implement the `cuda::mr::resource` concept defined by [CCCL](https://github.com/NVIDIA/cccl) (CUDA Core Compute Libraries), so they interoperate directly with any library that accepts CCCL resources. See the API references for the full list ([C++ memory resources](../cpp/memory_resources/memory_resources.md), [C++ adaptors](../cpp/memory_resources/memory_resource_adaptors.md), [Python](../python/mr.md)). -For most applications, the CUDA async memory resource (`rmm::mr::cuda_async_memory_resource` in C++, `rmm.mr.CudaAsyncMemoryResource` in Python) is a good starting point — it uses a CUDA driver-managed pool and supports stream-ordered (asynchronous) allocations. See [Choosing a Memory Resource](choosing_memory_resources.md) for guidance on when to use other resources. 
+For most applications, the CUDA async memory resource ({cpp:class}`~rmm::mr::cuda_async_memory_resource` in C++, {py:class}`~rmm.mr.CudaAsyncMemoryResource` in Python) is a good starting point — it uses a CUDA driver-managed pool and supports stream-ordered (asynchronous) allocations. See [Choosing a Memory Resource](choosing_memory_resources.md) for guidance on when to use other resources. ### Resource Adaptors -Resource adaptors wrap an existing resource to add functionality. For example, `StatisticsResourceAdaptor` tracks allocation statistics, and `LoggingResourceAdaptor` logs allocations to a CSV file. Adaptors are composable — you can stack several to get combined functionality. See [Logging and Profiling](logging.md) for details. +Resource adaptors wrap an existing resource to add functionality. For example, {py:class}`~rmm.mr.StatisticsResourceAdaptor` tracks allocation statistics, and {py:class}`~rmm.mr.LoggingResourceAdaptor` logs allocations to a CSV file. Adaptors are composable — you can stack several to get combined functionality. See [Logging and Profiling](logging.md) for details and the API references for the full list ([C++](../cpp/memory_resources/memory_resource_adaptors.md), [Python](../python/mr.md)). ### Containers RMM provides [RAII](https://en.cppreference.com/w/cpp/language/raii.html) containers that manage device memory lifetime, avoiding common problems like memory leaks or improper stream ordering: -- C++: `device_buffer` (untyped), `device_uvector` (typed, uninitialized), `device_scalar` (single element) -- Python: `DeviceBuffer` (untyped) +- C++: {cpp:class}`~rmm::device_buffer` (untyped), {cpp:class}`~rmm::device_uvector` (typed, uninitialized), {cpp:class}`~rmm::device_scalar` (single element) +- Python: {py:class}`~rmm.DeviceBuffer` (untyped) All containers accept a stream and a memory resource, and use stream-ordered allocation. 
diff --git a/docs/user_guide/logging.md b/docs/user_guide/logging.md index 287bf55f4..ccac80974 100644 --- a/docs/user_guide/logging.md +++ b/docs/user_guide/logging.md @@ -1,6 +1,10 @@ # Logging and Profiling -RMM provides two types of logging: **memory event logging** for tracking allocations and deallocations, and **debug logging** for troubleshooting internal behavior. +RMM provides adaptors for tracking memory allocations and deallocations. + +{cpp:class}`~rmm::mr::logging_resource_adaptor` (C++) / {py:class}`~rmm.mr.LoggingResourceAdaptor` (Python) produces a CSV file of all allocations and deallocations with timestamps and stream IDs. + +{cpp:class}`~rmm::mr::statistics_resource_adaptor` (C++) / {py:class}`~rmm.mr.StatisticsResourceAdaptor` (Python), together with the {py:mod}`rmm.statistics` module, can be used to track allocation statistics such as peak memory and total memory. ## Memory Event Logging diff --git a/docs/user_guide/managed_memory.md b/docs/user_guide/managed_memory.md index 60b604ca6..fbaecdfbd 100644 --- a/docs/user_guide/managed_memory.md +++ b/docs/user_guide/managed_memory.md @@ -2,7 +2,7 @@ CUDA Managed Memory (also called Unified Memory) provides a single address space accessible from both CPU and GPU. The CUDA driver migrates pages between host and device memory on demand, which means you can work with datasets larger than GPU memory or share data between host and device code without explicit copies. -RMM's `ManagedMemoryResource` allocates managed memory via `cudaMallocManaged`. For background on how Unified Memory works at the driver level, see the [CUDA Programming Guide: Unified Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#unified-memory-programming). +RMM's {cpp:class}`~rmm::mr::managed_memory_resource` (C++) / {py:class}`~rmm.mr.ManagedMemoryResource` (Python) allocates managed memory via `cudaMallocManaged`. 
For background on how Unified Memory works at the driver level, see the [CUDA Programming Guide: Unified Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#unified-memory-programming). The main trade-off is performance: on-demand page migration introduces latency from page faults. For production workloads, combining managed memory with prefetching (described below) is essential to avoid this overhead. @@ -14,7 +14,7 @@ Prefetching migrates data to the GPU ahead of time so that kernels find it alrea ### Prefetch on Allocate (Eager) -`PrefetchResourceAdaptor` wraps another resource and prefetches each allocation to the current device as soon as it's made. This works well when data is used on the GPU shortly after allocation, such as when copying or writing to the new allocation: +{cpp:class}`~rmm::mr::prefetch_resource_adaptor` (C++) / {py:class}`~rmm.mr.PrefetchResourceAdaptor` (Python) wraps another resource and prefetches each allocation to the current device as soon as it's made. This works well when data is used on the GPU shortly after allocation, such as when copying or writing to the new allocation: ```python import rmm diff --git a/docs/user_guide/stream_ordered_allocation.md b/docs/user_guide/stream_ordered_allocation.md index f5f93505f..b025f3317 100644 --- a/docs/user_guide/stream_ordered_allocation.md +++ b/docs/user_guide/stream_ordered_allocation.md @@ -1,6 +1,6 @@ # Stream-Ordered Memory Allocation -RMM containers and memory resources are stream-ordered: allocations and deallocations are enqueued on a CUDA stream rather than blocking the CPU. This lets memory operations overlap with kernel execution and avoids the synchronization cost of `cudaMalloc`/`cudaFree`. For background on CUDA streams and asynchronous execution, see the [CUDA Programming Guide: Asynchronous Concurrent Execution](https://docs.nvidia.com/cuda/cuda-programming-guide/02-basics/asynchronous-execution.html#what-is-asynchronous-concurrent-execution). 
+RMM containers ({cpp:class}`~rmm::device_buffer`, {py:class}`~rmm.DeviceBuffer`) and [memory resources](../python/mr.md) are stream-ordered: allocations and deallocations are enqueued on a CUDA stream rather than blocking the CPU. This lets memory operations overlap with kernel execution and avoids the synchronization cost of `cudaMalloc`/`cudaFree`. For background on CUDA streams and asynchronous execution, see the [CUDA Programming Guide: Asynchronous Concurrent Execution](https://docs.nvidia.com/cuda/cuda-programming-guide/02-basics/asynchronous-execution.html#what-is-asynchronous-concurrent-execution). ## How It Works @@ -190,11 +190,11 @@ del buffer ## Which Resources Support Stream Ordering? -- **`CudaAsyncMemoryResource`**: Fully stream-ordered (recommended) -- **`PoolMemoryResource`**: Internally stream-safe — suballocations are mutex-protected, independent of upstream -- **`ArenaMemoryResource`**: Internally stream-safe — uses per-stream arenas, independent of upstream -- **`CudaMemoryResource`**: NOT stream-ordered (`cudaMalloc` is synchronous) -- **`ManagedMemoryResource`**: NOT stream-ordered (`cudaMallocManaged` is synchronous) +- **{py:class}`~rmm.mr.CudaAsyncMemoryResource`**: Fully stream-ordered (recommended) +- **{py:class}`~rmm.mr.PoolMemoryResource`**: Internally stream-safe — suballocations are mutex-protected, independent of upstream +- **{py:class}`~rmm.mr.ArenaMemoryResource`**: Internally stream-safe — uses per-stream arenas, independent of upstream +- **{py:class}`~rmm.mr.CudaMemoryResource`**: NOT stream-ordered (`cudaMalloc` is synchronous) +- **{py:class}`~rmm.mr.ManagedMemoryResource`**: NOT stream-ordered (`cudaMallocManaged` is synchronous) ## Example: Numba Kernel with RMM Stream From 2f9d48c0ef5c577f6ff0fa526275b33b627d5874 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Tue, 14 Apr 2026 22:54:32 +0000 Subject: [PATCH 24/24] Extract user guide code examples into runnable source files with literalinclude Replace inline code 
blocks in all user guide pages with literalinclude directives pointing to standalone source files that are compiled and tested in CI. C++ examples live in cpp/examples/docs/src/ (.cpp for host-only, .cu for files with CUDA kernels) and are built via the existing examples infrastructure. Python examples live in python/rmm/rmm/tests/examples/ and are tested via a new test_doc_examples.py that runs each script as a subprocess. Markdown files use YAML --- option blocks for all literalinclude directives to work around a MyST v5.0.0 bug where # at the start of :key: value options is treated as a comment. --- cpp/examples/README.md | 1 + cpp/examples/build.sh | 3 +- cpp/examples/docs/CMakeLists.txt | 39 ++ .../docs/src/choosing_memory_resources.cpp | 72 +++ cpp/examples/docs/src/guide.cu | 312 ++++++++++++ cpp/examples/docs/src/installation.cpp | 37 ++ cpp/examples/docs/src/introduction.cpp | 34 ++ cpp/examples/docs/src/logging.cpp | 124 +++++ cpp/examples/docs/src/managed_memory.cu | 55 +++ .../docs/src/stream_ordered_allocation.cu | 143 ++++++ docs/user_guide/choosing_memory_resources.md | 189 ++++---- docs/user_guide/guide.md | 458 +++++++----------- docs/user_guide/installation.md | 36 +- docs/user_guide/introduction.md | 29 +- docs/user_guide/logging.md | 435 ++++++----------- docs/user_guide/managed_memory.md | 79 ++- docs/user_guide/stream_ordered_allocation.md | 249 +++------- .../examples/choosing_memory_resources.py | 172 +++++++ python/rmm/rmm/tests/examples/guide.py | 257 ++++++++++ python/rmm/rmm/tests/examples/installation.py | 24 + python/rmm/rmm/tests/examples/introduction.py | 13 + python/rmm/rmm/tests/examples/logging.py | 387 +++++++++++++++ .../rmm/rmm/tests/examples/managed_memory.py | 65 +++ .../examples/stream_ordered_allocation.py | 149 ++++++ python/rmm/rmm/tests/test_doc_examples.py | 29 ++ 25 files changed, 2462 insertions(+), 929 deletions(-) create mode 100644 cpp/examples/docs/CMakeLists.txt create mode 100644 
cpp/examples/docs/src/choosing_memory_resources.cpp create mode 100644 cpp/examples/docs/src/guide.cu create mode 100644 cpp/examples/docs/src/installation.cpp create mode 100644 cpp/examples/docs/src/introduction.cpp create mode 100644 cpp/examples/docs/src/logging.cpp create mode 100644 cpp/examples/docs/src/managed_memory.cu create mode 100644 cpp/examples/docs/src/stream_ordered_allocation.cu create mode 100644 python/rmm/rmm/tests/examples/choosing_memory_resources.py create mode 100644 python/rmm/rmm/tests/examples/guide.py create mode 100644 python/rmm/rmm/tests/examples/installation.py create mode 100644 python/rmm/rmm/tests/examples/introduction.py create mode 100644 python/rmm/rmm/tests/examples/logging.py create mode 100644 python/rmm/rmm/tests/examples/managed_memory.py create mode 100644 python/rmm/rmm/tests/examples/stream_ordered_allocation.py create mode 100644 python/rmm/rmm/tests/test_doc_examples.py diff --git a/cpp/examples/README.md b/cpp/examples/README.md index 138f3ac0e..4f0624478 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -5,3 +5,4 @@ This folder contains examples to demonstrate librmm use cases. Running `build.sh Current examples: - Basic: demonstrates memory resource construction and allocating a `device_uvector` on a stream. +- Docs: code examples from the [user guide](../../docs/user_guide/), included via `literalinclude` directives. diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 0545b1e35..cfbb9f8cf 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0 # librmm examples build script @@ -58,3 +58,4 @@ build_example() { } build_example basic +build_example docs diff --git a/cpp/examples/docs/CMakeLists.txt b/cpp/examples/docs/CMakeLists.txt new file mode 100644 index 000000000..ee3ffdf12 --- /dev/null +++ b/cpp/examples/docs/CMakeLists.txt @@ -0,0 +1,39 @@ +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on + +cmake_minimum_required(VERSION 3.30.4) + +include(../set_cuda_architecture.cmake) + +# initialize CUDA architectures +rapids_cuda_init_architectures(docs_examples) + +project( + docs_examples + VERSION 0.0.1 + LANGUAGES CXX CUDA) + +include(../fetch_dependencies.cmake) + +include(rapids-cmake) +rapids_cmake_build_type("Release") + +# One executable per user guide page +set(DOCS_CUDA_EXAMPLES guide stream_ordered_allocation managed_memory) +set(DOCS_CXX_EXAMPLES introduction choosing_memory_resources logging installation) + +foreach(example ${DOCS_CUDA_EXAMPLES}) + add_executable(docs_${example} src/${example}.cu) + target_link_libraries(docs_${example} PRIVATE rmm::rmm) + target_compile_features(docs_${example} PRIVATE cxx_std_17) + install(TARGETS docs_${example} DESTINATION bin/examples/librmm) +endforeach() + +foreach(example ${DOCS_CXX_EXAMPLES}) + add_executable(docs_${example} src/${example}.cpp) + target_link_libraries(docs_${example} PRIVATE rmm::rmm) + target_compile_features(docs_${example} PRIVATE cxx_std_17) + install(TARGETS docs_${example} DESTINATION bin/examples/librmm) +endforeach() diff --git a/cpp/examples/docs/src/choosing_memory_resources.cpp b/cpp/examples/docs/src/choosing_memory_resources.cpp new file mode 100644 index 000000000..d8bb33f1d --- /dev/null +++ b/cpp/examples/docs/src/choosing_memory_resources.cpp @@ -0,0 +1,72 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/choosing_memory_resources.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +void recommended_default() +{ + // clang-format off + // [recommended-default] + #include + #include + + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), mr); + // [/recommended-default] + // clang-format on + + assert(buffer.size() == 1024); +} + +void managed_pool_prefetch() +{ + // clang-format off + // [managed-pool-prefetch] + #include + #include + #include + #include + #include + + // Use 80% of GPU memory, rounded down to nearest 256 bytes + auto [free_memory, total_memory] = rmm::available_device_memory(); + auto pool_size = rmm::align_down(static_cast(total_memory * 0.8), 256); + + rmm::mr::managed_memory_resource managed_mr; + rmm::mr::pool_memory_resource pool_mr{managed_mr, pool_size}; + rmm::mr::prefetch_resource_adaptor prefetch_mr{pool_mr}; + // [/managed-pool-prefetch] + // clang-format on + + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), prefetch_mr); + assert(buffer.size() == 1024); +} + +int main() +{ + recommended_default(); + managed_pool_prefetch(); + + std::cout << "All choosing_memory_resources examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/guide.cu b/cpp/examples/docs/src/guide.cu new file mode 100644 index 000000000..677914a74 --- /dev/null +++ b/cpp/examples/docs/src/guide.cu @@ -0,0 +1,312 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/guide.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +__global__ void trivial_kernel(int* data) { data[0] = 42; } + +void explicit_resource() +{ + // clang-format off + // [explicit-resource] + rmm::mr::cuda_async_memory_resource async_mr; + rmm::cuda_stream stream; + + // Pass the resource explicitly + rmm::device_buffer buffer(1024, stream.view(), async_mr); + // [/explicit-resource] + // clang-format on + + assert(buffer.size() == 1024); +} + +void current_device_resource() +{ + // clang-format off + // [current-device-resource] + #include + #include + + rmm::mr::cuda_async_memory_resource async_mr; + rmm::mr::set_current_device_resource_ref(async_mr); + + // Allocations that don't specify a resource use the current device resource + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); + // [/current-device-resource] + // clang-format on + + (void)mr; +} + +void device_buffer_example() +{ + // clang-format off + // [device-buffer] + #include + + rmm::cuda_stream stream; + + // Allocate 1024 bytes + rmm::device_buffer buffer(1024, stream.view()); + + // Access pointer and size + void* ptr = buffer.data(); + std::size_t size = buffer.size(); + + // Resize (may reallocate) + buffer.resize(2048, stream.view()); + + // Copy construct (deep copy) + rmm::device_buffer buffer2(buffer, stream.view()); + // [/device-buffer] + // clang-format on + + assert(buffer.size() == 2048); + assert(buffer2.size() == 2048); + (void)ptr; + (void)size; +} + +void device_uvector_example() +{ + // 
clang-format off + // [device-uvector] + #include + #include + #include + + rmm::cuda_stream stream; + + // Allocate 100 elements + rmm::device_uvector vec(100, stream.view()); + + // Access as pointer + int* ptr = vec.data(); + + // Access as iterators + auto begin = vec.begin(); + auto end = vec.end(); + + // Initialize with Thrust + thrust::fill(rmm::exec_policy(stream.view()), vec.begin(), vec.end(), 42); + + // Resize + vec.resize(200, stream.view()); + // [/device-uvector] + // clang-format on + + assert(vec.size() == 200); + (void)ptr; + (void)begin; + (void)end; +} + +void device_scalar_example() +{ + // clang-format off + // [device-scalar] + #include + + rmm::cuda_stream stream; + + // Allocate single int + rmm::device_scalar scalar(stream.view()); + + // Set value from host (async on stream) + scalar.set_value(42, stream.view()); + + // Get value to host (copies on stream, then synchronizes stream before returning) + int value = scalar.value(stream.view()); + + // Access device pointer + int* d_ptr = scalar.data(); + + // Pass to kernel + trivial_kernel<<<1, 1, 0, stream.value()>>>(scalar.data()); + // [/device-scalar] + // clang-format on + + stream.synchronize(); + assert(value == 42); + (void)d_ptr; +} + +void statistics_tracking() +{ + // clang-format off + // [statistics-tracking] + #include + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; + + // Allocate using the statistics-wrapped resource + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), stats_mr); + + // Get statistics + auto bytes = stats_mr.get_bytes_counter(); + std::cout << "Current bytes: " << bytes.value << "\n"; + std::cout << "Peak bytes: " << bytes.peak << "\n"; + std::cout << "Total bytes: " << bytes.total << "\n"; + // [/statistics-tracking] + // clang-format on +} + +void logging_example() +{ + // clang-format off + // [logging] + #include + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::logging_resource_adaptor log_mr{cuda_mr, 
"allocations.csv"}; + + // Allocations through log_mr are logged to CSV + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), log_mr); + // [/logging] + // clang-format on + + assert(buffer.size() == 1024); + std::remove("allocations.csv"); +} + +void composing_resources() +{ + // clang-format off + // [composing-resources] + #include + #include + #include + #include + + // Base resource + rmm::mr::cuda_memory_resource cuda_mr; + + // Add pool + rmm::mr::pool_memory_resource pool_mr{cuda_mr, 1ULL << 30}; + + // Add statistics + rmm::mr::statistics_resource_adaptor stats_mr{pool_mr}; + + // Add logging + rmm::mr::logging_resource_adaptor log_mr{stats_mr, "log.csv"}; + + // Use log_mr for allocations — all allocations are pooled, tracked, and logged + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), log_mr); + // [/composing-resources] + // clang-format on + + assert(buffer.size() == 1024); + std::remove("log.csv"); +} + +void thrust_example() +{ + // clang-format off + // [thrust] + #include + #include + #include + #include + #include + + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_uvector vec(1000, stream.view(), mr); + + // Fill with descending values + thrust::sequence(rmm::exec_policy_nosync(stream.view(), mr), + vec.begin(), vec.end(), vec.size() - 1, -1); + + // Sort — temporaries allocated from mr + thrust::sort(rmm::exec_policy_nosync(stream.view(), mr), vec.begin(), vec.end()); + + stream.synchronize(); + // [/thrust] + // clang-format on +} + +void multi_device_example() +{ + // clang-format off + // [multi-device] + #include + #include + #include + #include + + int num_devices; + cudaGetDeviceCount(&num_devices); + + // Store resources to maintain lifetime (resources are copyable value types) + std::vector resources; + + for (int i = 0; i < num_devices; ++i) { + // Set device BEFORE creating resource + cudaSetDevice(i); + + // Create resource for this device + 
resources.emplace_back(); + + // Set as per-device resource ref + rmm::mr::set_per_device_resource_ref(rmm::cuda_device_id{i}, resources.back()); + } + + // Use device 0 + cudaSetDevice(0); + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view()); // Uses device 0's resource + // [/multi-device] + // clang-format on + + assert(buffer.size() == 1024); +} + +int main() +{ + explicit_resource(); + current_device_resource(); + device_buffer_example(); + device_uvector_example(); + device_scalar_example(); + statistics_tracking(); + logging_example(); + composing_resources(); + thrust_example(); + multi_device_example(); + + std::cout << "All guide examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/installation.cpp b/cpp/examples/docs/src/installation.cpp new file mode 100644 index 000000000..a9dc5cfce --- /dev/null +++ b/cpp/examples/docs/src/installation.cpp @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/installation.md + +#include +#include +#include +#include + +#include + +void test_installation() +{ + // clang-format off + // [test-installation] + #include + #include + #include + #include + + auto mr = rmm::mr::cuda_memory_resource{}; + rmm::mr::set_current_device_resource_ref(mr); + + rmm::device_buffer buf(100, rmm::cuda_stream_view{}); + std::cout << "Allocated " << buf.size() << " bytes\n"; + // [/test-installation] + // clang-format on +} + +int main() +{ + test_installation(); + return 0; +} diff --git a/cpp/examples/docs/src/introduction.cpp b/cpp/examples/docs/src/introduction.cpp new file mode 100644 index 000000000..dbf399e26 --- /dev/null +++ b/cpp/examples/docs/src/introduction.cpp @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/introduction.md + +#include +#include +#include + +#include +#include + +void basic_example() +{ + // clang-format off + // [basic-example] + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), mr); + // [/basic-example] + // clang-format on + + assert(buffer.size() == 1024); +} + +int main() +{ + basic_example(); + + std::cout << "All introduction examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/logging.cpp b/cpp/examples/docs/src/logging.cpp new file mode 100644 index 000000000..ac736ac0a --- /dev/null +++ b/cpp/examples/docs/src/logging.cpp @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/logging.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. 
+ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +void logging_adaptor() +{ + // clang-format off + // [logging-adaptor] + #include + #include + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::logging_resource_adaptor log_mr{cuda_mr, "memory_log.csv"}; + + // Allocations through log_mr are logged to CSV + rmm::cuda_stream stream; + rmm::device_buffer buf1(1024, stream.view(), log_mr); + rmm::device_buffer buf2(2048, stream.view(), log_mr); + // [/logging-adaptor] + // clang-format on + + std::remove("memory_log.csv"); +} + +void statistics_adaptor() +{ + // clang-format off + // [statistics-adaptor] + #include + #include + #include + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; + + // Allocate using the statistics-wrapped resource + rmm::cuda_stream stream; + rmm::device_buffer buf1(1024, stream.view(), stats_mr); + rmm::device_buffer buf2(2048, stream.view(), stats_mr); + + // Get statistics + auto bytes = stats_mr.get_bytes_counter(); + auto allocs = stats_mr.get_allocations_counter(); + std::cout << "Current bytes: " << bytes.value << "\n"; + std::cout << "Peak bytes: " << bytes.peak << "\n"; + std::cout << "Allocation count: " << allocs.value << "\n"; + // [/statistics-adaptor] + // clang-format on +} + +void debug_log_level() +{ + // clang-format off + // [debug-log-level] + #include + + rmm::default_logger().set_level(rapids_logger::level_enum::trace); + // [/debug-log-level] + // clang-format on + + // Reset to default + rmm::default_logger().set_level(rapids_logger::level_enum::info); +} + +void combining_features() +{ + // clang-format off + // [combining-features] + #include + #include + #include + #include + + // Set debug log level + rmm::default_logger().set_level(rapids_logger::level_enum::debug); + + // Build resource stack: statistics + logging + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::statistics_resource_adaptor 
stats_mr{cuda_mr}; + rmm::mr::logging_resource_adaptor log_mr{stats_mr, "events.csv"}; + + // All allocations through log_mr are tracked and logged + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), log_mr); + + // Get statistics + auto bytes = stats_mr.get_bytes_counter(); + std::cout << "Peak bytes: " << bytes.peak << "\n"; + // [/combining-features] + // clang-format on + + // Reset to default + rmm::default_logger().set_level(rapids_logger::level_enum::info); + std::remove("events.csv"); +} + +int main() +{ + logging_adaptor(); + statistics_adaptor(); + debug_log_level(); + combining_features(); + + std::cout << "All logging examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/managed_memory.cu b/cpp/examples/docs/src/managed_memory.cu new file mode 100644 index 000000000..9adcd2568 --- /dev/null +++ b/cpp/examples/docs/src/managed_memory.cu @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/managed_memory.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. 
+ +#include +#include +#include +#include +#include + +#include +#include + +__global__ void trivial_kernel(void* data) {} + +void prefetch_on_access() +{ + dim3 grid(1), block(1); + + // clang-format off + // [prefetch-on-access] + #include + #include + #include + + rmm::mr::managed_memory_resource managed_mr; + rmm::cuda_stream stream; + rmm::device_buffer buffer(1000000, stream.view(), managed_mr); + + // Prefetch to the current device on this stream + rmm::prefetch(buffer.data(), buffer.size(), + rmm::get_current_cuda_device(), stream.view()); + + // Kernel on the same stream finds the data already resident + trivial_kernel<<>>(buffer.data()); + // [/prefetch-on-access] + // clang-format on + + stream.synchronize(); +} + +int main() +{ + prefetch_on_access(); + + std::cout << "All managed_memory examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/stream_ordered_allocation.cu b/cpp/examples/docs/src/stream_ordered_allocation.cu new file mode 100644 index 000000000..c85e1dabc --- /dev/null +++ b/cpp/examples/docs/src/stream_ordered_allocation.cu @@ -0,0 +1,143 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/stream_ordered_allocation.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. 
+ +#include +#include +#include + +#include +#include +#include +#include + +__global__ void trivial_kernel(void* data) {} + +void how_it_works() +{ + dim3 grid(1), block(1); + + // clang-format off + // [how-it-works] + #include + #include + + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_buffer buffer(1000, stream.view(), mr); + + // buffer.data() is usable immediately in stream-ordered operations + trivial_kernel<<>>(buffer.data()); + // [/how-it-works] + // clang-format on + + stream.synchronize(); +} + +void reading_results() +{ + // clang-format off + // [reading-results] + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_buffer d_buf(1000 * sizeof(float), stream.view(), mr); + + // Launch kernel that writes to d_buf on stream ... + + // Copy results to host on the same stream + std::vector h_buf(1000); + cudaMemcpyAsync(h_buf.data(), d_buf.data(), d_buf.size(), + cudaMemcpyDeviceToHost, stream.value()); + + // Synchronize before reading h_buf on the CPU + stream.synchronize(); + // [/reading-results] + // clang-format on +} + +void cross_stream() +{ + dim3 grid(1), block(1); + + // clang-format off + // [cross-stream] + #include + #include + + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream_a; + rmm::cuda_stream stream_b; + + rmm::device_buffer buffer(1000, stream_a.view(), mr); + + // Record an event after the allocation on stream_a + cudaEvent_t event; + cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + cudaEventRecord(event, stream_a.value()); + + // stream_b waits for the event — no CPU synchronization needed + cudaStreamWaitEvent(stream_b.value(), event); + + // Now safe to use buffer.data() in operations on stream_b + trivial_kernel<<>>(buffer.data()); + + cudaEventDestroy(event); + // [/cross-stream] + // clang-format on + + stream_b.synchronize(); +} + +void buffer_lifetime() +{ + dim3 grid(1), block(1); + + // clang-format off + // [buffer-lifetime] + 
rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream_a; + rmm::cuda_stream stream_b; + + rmm::device_buffer buffer(1000, stream_a.view(), mr); + + // Make stream_b wait for the allocation on stream_a + cudaEvent_t alloc_event; + cudaEventCreateWithFlags(&alloc_event, cudaEventDisableTiming); + cudaEventRecord(alloc_event, stream_a.value()); + cudaStreamWaitEvent(stream_b.value(), alloc_event); + + // Use buffer on stream_b + trivial_kernel<<>>(buffer.data()); + + // Before destroying buffer, make stream_a wait for stream_b's work + cudaEvent_t done_event; + cudaEventCreateWithFlags(&done_event, cudaEventDisableTiming); + cudaEventRecord(done_event, stream_b.value()); + cudaStreamWaitEvent(stream_a.value(), done_event); + + // Now safe to destroy buffer — deallocation on stream_a is ordered after the kernel on stream_b + buffer = rmm::device_buffer{}; + + cudaEventDestroy(alloc_event); + cudaEventDestroy(done_event); + // [/buffer-lifetime] + // clang-format on +} + +int main() +{ + how_it_works(); + reading_results(); + cross_stream(); + buffer_lifetime(); + + std::cout << "All stream_ordered_allocation examples passed.\n"; + return 0; +} diff --git a/docs/user_guide/choosing_memory_resources.md b/docs/user_guide/choosing_memory_resources.md index 64c27611b..04a8acd73 100644 --- a/docs/user_guide/choosing_memory_resources.md +++ b/docs/user_guide/choosing_memory_resources.md @@ -9,53 +9,50 @@ This guide recommends memory resources based on optimal allocation performance f For most applications, the CUDA async memory pool provides the best allocation performance with no tuning required. 
`````{tabs} -````{code-tab} c++ -#include -#include - -rmm::mr::cuda_async_memory_resource mr; -rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view(), mr); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/choosing_memory_resources.cpp +--- +language: cpp +start-after: "// [recommended-default]" +end-before: "// [/recommended-default]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -mr = rmm.mr.CudaAsyncMemoryResource() -buffer = rmm.DeviceBuffer(size=1024, mr=mr) +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [recommended-default]" +end-before: "# [/recommended-default]" +dedent: +--- +``` ```` ````` For applications that require GPU memory oversubscription (allocating more memory than physically available on the GPU), use a pooled managed memory resource with prefetching. This uses [CUDA Unified Memory](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) (`cudaMallocManaged`) to enable automatic page migration between CPU and GPU at the cost of slower allocation performance. Coupling the managed memory "base" allocator with adaptors for pool allocation and prefetching to device on allocation recovers some of the performance lost to the overhead of managed allocations. Note: Managed memory has [limited support on WSL2](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html#unified-memory-on-windows-wsl-and-tegra). 
`````{tabs} -````{code-tab} c++ -#include -#include -#include -#include -#include - -// Use 80% of GPU memory, rounded down to nearest 256 bytes -auto [free_memory, total_memory] = rmm::available_device_memory(); -auto pool_size = rmm::align_down(static_cast(total_memory * 0.8), 256); - -rmm::mr::managed_memory_resource managed_mr; -rmm::mr::pool_memory_resource pool_mr{managed_mr, pool_size}; -rmm::mr::prefetch_resource_adaptor prefetch_mr{pool_mr}; +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/choosing_memory_resources.cpp +--- +language: cpp +start-after: "// [managed-pool-prefetch]" +end-before: "// [/managed-pool-prefetch]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -# Use 80% of GPU memory, rounded down to nearest 256 bytes -free_memory, total_memory = rmm.mr.available_device_memory() -pool_size = int(total_memory * 0.8) // 256 * 256 - -mr = rmm.mr.PrefetchResourceAdaptor( - rmm.mr.PoolMemoryResource( - rmm.mr.ManagedMemoryResource(), - initial_pool_size=pool_size, - ) -) +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [managed-pool-prefetch]" +end-before: "# [/managed-pool-prefetch]" +dedent: +--- +``` ```` ````` @@ -104,15 +101,14 @@ By default, managed memory adds overhead for page faults and migration (see [Per - Typically combined with a pool and prefetching (see [Managed Memory guide](managed_memory.md)) **Example:** -```python -import rmm - -# Combine managed memory with a pool and prefetching for performance. -# Without prefetching, page faults cause significant overhead. 
-base = rmm.mr.ManagedMemoryResource() -pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) -buffer = rmm.DeviceBuffer(size=1024, mr=prefetch_mr) + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [managed-memory-example]" +end-before: "# [/managed-memory-example]" +dedent: +--- ``` ### Pool Memory Resource @@ -134,57 +130,62 @@ Also, RMM's pool can be slower than the CUDA driver's pool implementation in hea **Note**: `PoolMemoryResource` does not return memory to the upstream resource on deallocation. Once the pool grows, that memory stays allocated until the resource is destroyed. Set `maximum_pool_size` to limit growth. **Example:** -```python -import rmm - -pool = rmm.mr.PoolMemoryResource( - rmm.mr.CudaMemoryResource(), - initial_pool_size=2**32, # 4 GiB - maximum_pool_size=2**34 # 16 GiB -) -buffer = rmm.DeviceBuffer(size=1024, mr=pool) + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [pool-memory-example]" +end-before: "# [/pool-memory-example]" +dedent: +--- ``` ## Composing Memory Resources Memory resources can be composed (wrapped) to combine their properties. 
The general pattern is: -```python -# Adaptor wrapping a base resource -adaptor = rmm.mr.SomeAdaptor(base_resource) +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [composing-adaptor]" +end-before: "# [/composing-adaptor]" +dedent: +--- ``` ### Common Compositions **Prefetching with managed memory:** -```python -import rmm - -# Prefetch adaptor wrapping managed memory pool -base = rmm.mr.ManagedMemoryResource() -pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) -prefetch = rmm.mr.PrefetchResourceAdaptor(pool) -buffer = rmm.DeviceBuffer(size=1024, mr=prefetch) + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [prefetch-composition]" +end-before: "# [/prefetch-composition]" +dedent: +--- ``` **Statistics tracking** (see [Logging and Profiling](logging.md)): -```python -import rmm -# Track allocation statistics (counts, peak, and total bytes) -base = rmm.mr.CudaAsyncMemoryResource() -stats_mr = rmm.mr.StatisticsResourceAdaptor(base) -buffer = rmm.DeviceBuffer(size=1024, mr=stats_mr) +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [statistics-composition]" +end-before: "# [/statistics-composition]" +dedent: +--- ``` **Allocation logging** (see [Logging and Profiling](logging.md)): -```python -import rmm -# Log every allocation and deallocation to a file -base = rmm.mr.CudaAsyncMemoryResource() -logging_mr = rmm.mr.LoggingResourceAdaptor(base, log_file_name="allocations.csv") -buffer = rmm.DeviceBuffer(size=1024, mr=logging_mr) +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [logging-composition]" +end-before: "# [/logging-composition]" +dedent: +--- ``` ## Multi-Library Applications @@ -194,16 +195,14 @@ When using RMM with multiple GPU libraries 
(e.g., cuDF, PyTorch, CuPy), configur Each library must be explicitly configured to use RMM. RMM provides allocator integrations for common libraries: **Example: RMM + PyTorch** -```python -import rmm -import torch -from rmm.allocators.torch import rmm_torch_allocator -# Configure RMM -rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) - -# Configure PyTorch to allocate through RMM -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [multi-library-pytorch]" +end-before: "# [/multi-library-pytorch]" +dedent: +--- ``` With this setup, both PyTorch and any other RMM-configured library (like cuDF) allocate from the same resource. @@ -212,11 +211,13 @@ With this setup, both PyTorch and any other RMM-configured library (like cuDF) a 1. **Set the memory resource before any allocations**: Changing the resource after allocations have been made can lead to crashes. - ```python - import rmm - - # Do this first, before any GPU allocations - rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + ```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py + --- + language: python + start-after: "# [best-practices-set-early]" + end-before: "# [/best-practices-set-early]" + dedent: + --- ``` 2. **Use adaptors for diagnostics**: Wrap with {cpp:class}`~rmm::mr::statistics_resource_adaptor` (C++) / {py:class}`~rmm.mr.StatisticsResourceAdaptor` (Python) to track allocation counts and peak usage, or {cpp:class}`~rmm::mr::logging_resource_adaptor` (C++) / {py:class}`~rmm.mr.LoggingResourceAdaptor` (Python) to log every allocation and deallocation (see [Logging and Profiling](logging.md)). 
diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index 110f4665f..839025e40 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -45,18 +45,25 @@ Memory resources control how device memory is allocated. RMM provides several re The preferred way to use a memory resource is to pass it explicitly when allocating memory. This makes it clear which resource handles each allocation: `````{tabs} -````{code-tab} c++ -rmm::mr::cuda_async_memory_resource async_mr; -rmm::cuda_stream stream; - -// Pass the resource explicitly -rmm::device_buffer buffer(1024, stream.view(), async_mr); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [explicit-resource]" +end-before: "// [/explicit-resource]" +dedent: +--- +``` ```` -````{code-tab} python -mr = rmm.mr.CudaAsyncMemoryResource() - -# Pass the resource explicitly -buffer = rmm.DeviceBuffer(size=1024, mr=mr) +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [explicit-resource]" +end-before: "# [/explicit-resource]" +dedent: +--- +``` ```` ````` @@ -65,24 +72,25 @@ buffer = rmm.DeviceBuffer(size=1024, mr=mr) RMM also provides a global "current device resource" that is used when no resource is passed explicitly: `````{tabs} -````{code-tab} c++ -#include -#include - -rmm::mr::cuda_async_memory_resource async_mr; -rmm::mr::set_current_device_resource_ref(async_mr); - -// Allocations that don't specify a resource use the current device resource -rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [current-device-resource]" +end-before: "// [/current-device-resource]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -async_mr = rmm.mr.CudaAsyncMemoryResource() 
-rmm.mr.set_current_device_resource(async_mr) - -# Allocations that don't specify a resource use the current device resource -mr = rmm.mr.get_current_device_resource() +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [current-device-resource]" +end-before: "# [/current-device-resource]" +dedent: +--- +``` ```` ````` @@ -101,39 +109,25 @@ RMM provides RAII containers that automatically manage device memory lifetime. Untyped, uninitialized device memory ({cpp:class}`C++ `, {py:class}`Python `): `````{tabs} -````{code-tab} c++ -#include - -rmm::cuda_stream stream; - -// Allocate 1024 bytes -rmm::device_buffer buffer(1024, stream.view()); - -// Access pointer and size -void* ptr = buffer.data(); -std::size_t size = buffer.size(); - -// Resize (may reallocate) -buffer.resize(2048, stream.view()); - -// Copy construct (deep copy) -rmm::device_buffer buffer2(buffer, stream.view()); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [device-buffer]" +end-before: "// [/device-buffer]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -# Allocate 1024 bytes -buffer = rmm.DeviceBuffer(size=1024) - -# Access pointer and size -ptr = buffer.ptr -size = buffer.size - -# Resize (may reallocate) -buffer.resize(2048) - -# Copy construct (deep copy) -buffer2 = buffer.copy() +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [device-buffer]" +end-before: "# [/device-buffer]" +dedent: +--- +``` ```` ````` @@ -141,53 +135,26 @@ buffer2 = buffer.copy() Typed, uninitialized device vector for trivially copyable types ({cpp:class}`API `): -```cpp -#include -#include -#include - -rmm::cuda_stream stream; - -// Allocate 100 elements -rmm::device_uvector vec(100, stream.view()); - -// Access as pointer -int* ptr = vec.data(); - -// Access as iterators -auto 
begin = vec.begin(); -auto end = vec.end(); - -// Initialize with Thrust -thrust::fill(rmm::exec_policy(stream.view()), vec.begin(), vec.end(), 42); - -// Resize -vec.resize(200, stream.view()); +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [device-uvector]" +end-before: "// [/device-uvector]" +dedent: +--- ``` ### device_scalar (C++) Single typed element with host-device transfer convenience ({cpp:class}`API `): -```cpp -#include - -rmm::cuda_stream stream; - -// Allocate single int -rmm::device_scalar scalar(stream.view()); - -// Set value from host (async on stream) -scalar.set_value(42, stream.view()); - -// Get value to host (async on stream) -int value = scalar.value(stream.view()); - -// Access device pointer -int* d_ptr = scalar.data(); - -// Pass to kernel -launch_kernel<<<..., stream.value()>>>(scalar.data()); +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [device-scalar]" +end-before: "// [/device-scalar]" +dedent: +--- ``` ## Resource Adaptors @@ -197,61 +164,50 @@ Adaptors wrap resources to add functionality like statistics tracking and loggin ### Statistics Tracking `````{tabs} -````{code-tab} c++ -#include - -rmm::mr::cuda_async_memory_resource cuda_mr; -rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; - -// Allocate using the statistics-wrapped resource -rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view(), stats_mr); - -// Get statistics -auto bytes = stats_mr.get_bytes_counter(); -std::cout << "Current bytes: " << bytes.value << "\n"; -std::cout << "Peak bytes: " << bytes.peak << "\n"; -std::cout << "Total bytes: " << bytes.total << "\n"; +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [statistics-tracking]" +end-before: "// [/statistics-tracking]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -# Wrap base resource with statistics adaptor 
-cuda_mr = rmm.mr.CudaAsyncMemoryResource() -stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) - -# Allocate using the statistics-wrapped resource -buffer = rmm.DeviceBuffer(size=1024, mr=stats_mr) - -# Get statistics -stats = stats_mr.allocation_counts -print(f"Current bytes: {stats.current_bytes}") -print(f"Peak bytes: {stats.peak_bytes}") -print(f"Total bytes: {stats.total_bytes}") +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [statistics-tracking]" +end-before: "# [/statistics-tracking]" +dedent: +--- +``` ```` ````` ### Logging `````{tabs} -````{code-tab} c++ -#include - -rmm::mr::cuda_async_memory_resource cuda_mr; -rmm::mr::logging_resource_adaptor log_mr{cuda_mr, "allocations.csv"}; - -// Allocations through log_mr are logged to CSV -rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view(), log_mr); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [logging]" +end-before: "// [/logging]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -base_mr = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="allocations.csv") - -# Allocations through log_mr are logged to CSV -buffer = rmm.DeviceBuffer(size=1024, mr=log_mr) +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [logging]" +end-before: "# [/logging]" +dedent: +--- +``` ```` ````` @@ -264,45 +220,25 @@ See [Logging and Profiling](logging.md) for more details. 
Adaptors can be stacked to combine functionality: `````{tabs} -````{code-tab} c++ -#include -#include -#include -#include - -// Base resource -rmm::mr::cuda_memory_resource cuda_mr; - -// Add pool -rmm::mr::pool_memory_resource pool_mr{cuda_mr, 1ULL << 30}; - -// Add statistics -rmm::mr::statistics_resource_adaptor stats_mr{pool_mr}; - -// Add logging -rmm::mr::logging_resource_adaptor log_mr{stats_mr, "log.csv"}; - -// Use log_mr for allocations — all allocations are pooled, tracked, and logged -rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view(), log_mr); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [composing-resources]" +end-before: "// [/composing-resources]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -# Base resource -cuda_mr = rmm.mr.CudaMemoryResource() - -# Add pool -pool_mr = rmm.mr.PoolMemoryResource(cuda_mr, initial_pool_size=2**30) - -# Add statistics -stats_mr = rmm.mr.StatisticsResourceAdaptor(pool_mr) - -# Add logging -log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv") - -# Use log_mr for allocations — all allocations are pooled, tracked, and logged -buffer = rmm.DeviceBuffer(size=1024, mr=log_mr) +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [composing-resources]" +end-before: "# [/composing-resources]" +dedent: +--- +``` ```` ````` @@ -314,25 +250,13 @@ Order matters: outer adaptors see all allocations from inner resources. Use {cpp:class}`rmm::exec_policy_nosync` to make Thrust algorithms use RMM for temporary storage. 
Passing the resource explicitly makes it clear which resource handles temporaries: -```cpp -#include -#include -#include -#include -#include - -rmm::mr::cuda_async_memory_resource mr; -rmm::cuda_stream stream; -rmm::device_uvector vec(1000, stream.view(), mr); - -// Fill with descending values -thrust::sequence(rmm::exec_policy_nosync(stream.view(), mr), - vec.begin(), vec.end(), vec.size() - 1, -1); - -// Sort — temporaries allocated from mr -thrust::sort(rmm::exec_policy_nosync(stream.view(), mr), vec.begin(), vec.end()); - -stream.synchronize(); +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [thrust]" +end-before: "// [/thrust]" +dedent: +--- ``` `exec_policy_nosync` allows the Thrust backend to skip stream synchronizations that are not required for correctness, improving performance. Stream-ordered applications using RMM should always prefer `exec_policy_nosync`. If stream synchronizations are required, the application should insert them explicitly before reading device data from the host. 
@@ -341,37 +265,26 @@ stream.synchronize(); Configure CuPy to use RMM for all device memory allocations ({py:func}`API <rmm.allocators.cupy.rmm_cupy_allocator>`): -```python -import rmm -import cupy as cp -from rmm.allocators.cupy import rmm_cupy_allocator - -# Configure RMM -mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) - -# Set CuPy to use RMM -cp.cuda.set_allocator(rmm_cupy_allocator) - -# All CuPy arrays now use RMM -array = cp.zeros(1000) +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [cupy]" +end-before: "# [/cupy]" +dedent: +--- ``` ### Numba (Python) Configure Numba to use RMM for device memory in CUDA JIT-compiled functions ({py:class}`API <rmm.allocators.numba.RMMNumbaManager>`): -```python -from numba import cuda -from rmm.allocators.numba import RMMNumbaManager -import rmm - -# Configure RMM -mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) - -# Set Numba to use RMM -cuda.set_memory_manager(RMMNumbaManager) +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [numba]" +end-before: "# [/numba]" +dedent: +--- ``` Or use the environment variable: @@ -384,20 +297,13 @@ NUMBA_CUDA_MEMORY_MANAGER=rmm.allocators.numba python script.py Configure PyTorch to use RMM for CUDA tensor allocations ({py:func}`API <rmm.allocators.torch.rmm_torch_allocator>`): -```python -import rmm -import torch -from rmm.allocators.torch import rmm_torch_allocator - -# Configure RMM -mr = rmm.mr.CudaAsyncMemoryResource() -rmm.mr.set_current_device_resource(mr) - -# Set PyTorch to use RMM -torch.cuda.memory.change_current_allocator(rmm_torch_allocator) - -# All PyTorch tensors now use RMM -tensor = torch.zeros(1000, device='cuda') +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [pytorch]" +end-before: "# [/pytorch]" +dedent: +--- ``` ## Multi-Device Usage @@ -405,52 +311,24 @@ tensor = torch.zeros(1000, device='cuda') For multi-GPU systems, each device can have its own memory
resource. Use `set_per_device_resource_ref` (C++) or `set_per_device_resource` (Python) to configure each device before allocating memory on it: `````{tabs} -````{code-tab} c++ -#include -#include -#include -#include - -int num_devices; -cudaGetDeviceCount(&num_devices); - -// Store resources to maintain lifetime (resources are copyable value types) -std::vector resources; - -for (int i = 0; i < num_devices; ++i) { - // Set device BEFORE creating resource - cudaSetDevice(i); - - // Create resource for this device - resources.emplace_back(); - - // Set as per-device resource ref - rmm::mr::set_per_device_resource_ref(rmm::cuda_device_id{i}, resources.back()); -} - -// Use device 0 -cudaSetDevice(0); -rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view()); // Uses device 0's resource +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [multi-device]" +end-before: "// [/multi-device]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm -from cuda import cuda - -num_devices = cuda.cuDeviceGetCount()[1] - -# Store resources to maintain lifetime -resources = [] - -for device_id in range(num_devices): - # Create resource for this device - mr = rmm.mr.CudaAsyncMemoryResource() - resources.append(mr) - - # Set as per-device resource - rmm.mr.set_per_device_resource(device_id, mr) - -# Use device 0 -buffer = rmm.DeviceBuffer(size=1024) # Uses device 0's resource +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [multi-device]" +end-before: "# [/multi-device]" +dedent: +--- +``` ```` ````` diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md index 61c275dd9..46e781630 100644 --- a/docs/user_guide/installation.md +++ b/docs/user_guide/installation.md @@ -117,21 +117,13 @@ target_link_libraries(your_target PRIVATE rmm::rmm) Create a test file `test_rmm.cpp`: -```cpp -#include 
-#include -#include -#include - -int main() { - auto mr = rmm::mr::cuda_memory_resource{}; - rmm::mr::set_current_device_resource_ref(mr); - - rmm::device_buffer buf(100, rmm::cuda_stream_view{}); - std::cout << "Allocated " << buf.size() << " bytes\n"; - - return 0; -} +```{literalinclude} ../../cpp/examples/docs/src/installation.cpp +--- +language: cpp +start-after: "// [test-installation]" +end-before: "// [/test-installation]" +dedent: +--- ``` Compile and run: @@ -143,11 +135,11 @@ nvcc -std=c++17 -I/path/to/rmm/include test_rmm.cpp -o test_rmm ### Python -```python -import rmm -print(rmm.__version__) - -# Quick test -buffer = rmm.DeviceBuffer(size=100) -print(f"Allocated {buffer.size} bytes") +```{literalinclude} ../../python/rmm/rmm/tests/examples/installation.py +--- +language: python +start-after: "# [test-installation]" +end-before: "# [/test-installation]" +dedent: +--- ``` diff --git a/docs/user_guide/introduction.md b/docs/user_guide/introduction.md index 88b0b4978..d12e045c5 100644 --- a/docs/user_guide/introduction.md +++ b/docs/user_guide/introduction.md @@ -30,19 +30,24 @@ All containers accept a stream and a memory resource, and use stream-ordered all ## Basic Example `````{tabs} -````{code-tab} c++ -#include -#include - -rmm::mr::cuda_async_memory_resource mr; -rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view(), mr); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/introduction.cpp +--- +language: cpp +start-after: "// [basic-example]" +end-before: "// [/basic-example]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -mr = rmm.mr.CudaAsyncMemoryResource() -buffer = rmm.DeviceBuffer(size=1024, mr=mr) +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/introduction.py +--- +language: python +start-after: "# [basic-example]" +end-before: "# [/basic-example]" +--- +``` ```` ````` diff --git a/docs/user_guide/logging.md b/docs/user_guide/logging.md index 
ccac80974..d698678fc 100644 --- a/docs/user_guide/logging.md +++ b/docs/user_guide/logging.md @@ -19,27 +19,25 @@ Memory event logging writes details of every allocation and deallocation to a CS Wrap any memory resource with the logging adaptor to record allocations and deallocations to a CSV file: `````{tabs} -````{code-tab} c++ -#include -#include - -rmm::mr::cuda_async_memory_resource cuda_mr; -rmm::mr::logging_resource_adaptor log_mr{cuda_mr, "memory_log.csv"}; - -// Allocations through log_mr are logged to CSV -rmm::cuda_stream stream; -rmm::device_buffer buf1(1024, stream.view(), log_mr); -rmm::device_buffer buf2(2048, stream.view(), log_mr); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/logging.cpp +--- +language: cpp +start-after: "// [logging-adaptor]" +end-before: "// [/logging-adaptor]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -base_mr = rmm.mr.CudaAsyncMemoryResource() -log_mr = rmm.mr.LoggingResourceAdaptor(base_mr, log_file_name="memory_log.csv") - -# Allocations through log_mr are logged to CSV -buf1 = rmm.DeviceBuffer(size=1024, mr=log_mr) -buf2 = rmm.DeviceBuffer(size=2048, mr=log_mr) +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [logging-adaptor]" +end-before: "# [/logging-adaptor]" +dedent: +--- +``` ```` ````` @@ -76,27 +74,13 @@ Thread,Time,Action,Pointer,Size,Stream You can parse and analyze logs with Python: -```python -import pandas as pd - -# Read log file -df = pd.read_csv("memory_log.csv") - -# Total bytes allocated -total_allocated = df[df['Action'] == 'allocate']['Size'].sum() -print(f"Total allocated: {total_allocated:,} bytes") - -# Allocation size distribution -print(df[df['Action'] == 'allocate']['Size'].describe()) - -# Peak memory usage (simple analysis) -df['Delta'] = df.apply( - lambda row: row['Size'] if row['Action'] == 'allocate' else -row['Size'], - axis=1 -) -df['Cumulative'] = 
df['Delta'].cumsum() -peak = df['Cumulative'].max() -print(f"Peak usage: {peak:,} bytes") +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [analyzing-logs]" +end-before: "# [/analyzing-logs]" +dedent: +--- ``` ### Replay Benchmark @@ -120,91 +104,50 @@ RMM provides statistics tracking for allocations using `statistics_resource_adap ### Using the Statistics Adaptor `````{tabs} -````{code-tab} c++ -#include -#include -#include - -rmm::mr::cuda_async_memory_resource cuda_mr; -rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; - -// Allocate using the statistics-wrapped resource -rmm::cuda_stream stream; -rmm::device_buffer buf1(1024, stream.view(), stats_mr); -rmm::device_buffer buf2(2048, stream.view(), stats_mr); - -// Get statistics -auto bytes = stats_mr.get_bytes_counter(); -auto allocs = stats_mr.get_allocations_counter(); -std::cout << "Current bytes: " << bytes.value << "\n"; -std::cout << "Peak bytes: " << bytes.peak << "\n"; -std::cout << "Allocation count: " << allocs.value << "\n"; +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/logging.cpp +--- +language: cpp +start-after: "// [statistics-adaptor]" +end-before: "// [/statistics-adaptor]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -cuda_mr = rmm.mr.CudaAsyncMemoryResource() -stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) - -# Allocate using the statistics-wrapped resource -buf1 = rmm.DeviceBuffer(size=1024, mr=stats_mr) -buf2 = rmm.DeviceBuffer(size=2048, mr=stats_mr) - -# Get statistics -stats = stats_mr.allocation_counts -print(f"Current bytes: {stats.current_bytes}") -print(f"Peak bytes: {stats.peak_bytes}") -print(f"Total allocations: {stats.total_count}") +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [statistics-adaptor]" +end-before: "# [/statistics-adaptor]" +dedent: +--- +``` ```` ````` Python also 
provides a convenience API for enabling statistics globally: -```python -import rmm - -# Enable statistics globally -rmm.statistics.enable_statistics() - -# Or use context manager for specific code blocks -with rmm.statistics.statistics(): - buffer = rmm.DeviceBuffer(size=1024) - - stats = rmm.statistics.get_statistics() - print(f"Current bytes: {stats.current_bytes}") - print(f"Peak bytes: {stats.peak_bytes}") - print(f"Total allocations: {stats.total_count}") +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [statistics-global]" +end-before: "# [/statistics-global]" +dedent: +--- ``` ### Tracking Memory Growth Monitor memory usage over time: -```python -import rmm -import time - -rmm.statistics.enable_statistics() - -def checkpoint(label): - stats = rmm.statistics.get_statistics() - print(f"{label}:") - print(f" Current: {stats.current_bytes:,} bytes ({stats.current_count} allocations)") - print(f" Peak: {stats.peak_bytes:,} bytes") - -checkpoint("Start") - -# Allocate -buffers = [rmm.DeviceBuffer(size=1024*1024) for _ in range(10)] -checkpoint("After 10x1MB allocations") - -# Free some -buffers = buffers[:5] -checkpoint("After freeing 5") - -# Allocate more -buffers.extend([rmm.DeviceBuffer(size=2*1024*1024) for _ in range(5)]) -checkpoint("After 5x2MB allocations") +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [tracking-memory-growth]" +end-before: "# [/tracking-memory-growth]" +dedent: +--- ``` ## Memory Profiling (Python) @@ -213,63 +156,37 @@ The memory profiler tracks allocations by function/code block. ### Profiling Functions -```python -import rmm - -# Enable statistics first -rmm.statistics.enable_statistics() - -# Profile a function -@rmm.statistics.profiler() -def process_data(size): - buffer = rmm.DeviceBuffer(size=size) - # ... processing ... 
- return buffer - -# Run function -process_data(1000000) - -# View report -print(rmm.statistics.default_profiler_records.report()) +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [profiling-functions]" +end-before: "# [/profiling-functions]" +dedent: +--- ``` The report shows the number of calls, peak memory, and total memory for each profiled function. ### Profiling Code Blocks -```python -import rmm - -rmm.statistics.enable_statistics() - -# Profile specific code blocks -with rmm.statistics.profiler(name="data loading"): - data = rmm.DeviceBuffer(size=1000000) - -with rmm.statistics.profiler(name="processing"): - buffer1 = rmm.DeviceBuffer(size=500000) - buffer2 = rmm.DeviceBuffer(size=500000) - -# View report -print(rmm.statistics.default_profiler_records.report()) +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [profiling-code-blocks]" +end-before: "# [/profiling-code-blocks]" +dedent: +--- ``` ### Nested Profiling -```python -import rmm - -rmm.statistics.enable_statistics() - -with rmm.statistics.profiler(name="outer"): - buffer1 = rmm.DeviceBuffer(size=1000) - - with rmm.statistics.profiler(name="inner"): - buffer2 = rmm.DeviceBuffer(size=2000) - - buffer3 = rmm.DeviceBuffer(size=500) - -print(rmm.statistics.default_profiler_records.report()) +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [nested-profiling]" +end-before: "# [/nested-profiling]" +dedent: +--- ``` The report includes entries for both the outer and inner profiling scopes. @@ -278,20 +195,13 @@ The report includes entries for both the outer and inner profiling scopes. 
Use custom profiler records for separate tracking: -```python -import rmm - -rmm.statistics.enable_statistics() - -# Create custom profiler records -custom_records = rmm.statistics.profiler_records() - -# Use with context manager -with rmm.statistics.profiler(name="my operation", records=custom_records): - buffer = rmm.DeviceBuffer(size=1024) - -# View only custom records -print(custom_records.report()) +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [custom-profiler-records]" +end-before: "# [/custom-profiler-records]" +dedent: +--- ``` ## Debug Logging @@ -332,16 +242,25 @@ Available levels (increasing verbosity): Even with verbose logging compiled in, you must enable it at runtime: `````{tabs} -````{code-tab} c++ -#include - -rmm::default_logger().set_level(rapids_logger::level_enum::trace); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/logging.cpp +--- +language: cpp +start-after: "// [debug-log-level]" +end-before: "// [/debug-log-level]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -# Available levels: "trace", "debug", "info", "warn", "error", "critical", "off" -rmm.set_logging_level("trace") +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [debug-log-level]" +end-before: "# [/debug-log-level]" +dedent: +--- +``` ```` ````` @@ -368,55 +287,25 @@ Example debug output: Multiple logging features can be composed together by stacking adaptors: `````{tabs} -````{code-tab} c++ -#include -#include -#include -#include - -// Set debug log level -rmm::default_logger().set_level(rapids_logger::level_enum::debug); - -// Build resource stack: statistics + logging -rmm::mr::cuda_async_memory_resource cuda_mr; -rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; -rmm::mr::logging_resource_adaptor log_mr{stats_mr, "events.csv"}; - -// All allocations through log_mr are tracked and logged 
-rmm::cuda_stream stream; -rmm::device_buffer buffer(1024, stream.view(), log_mr); - -// Get statistics -auto bytes = stats_mr.get_bytes_counter(); -std::cout << "Peak bytes: " << bytes.peak << "\n"; +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/logging.cpp +--- +language: cpp +start-after: "// [combining-features]" +end-before: "// [/combining-features]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm - -# Set debug log level -rmm.set_logging_level("debug") - -# Build resource stack: statistics + logging -cuda_mr = rmm.mr.CudaAsyncMemoryResource() -stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) -log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="events.csv") - -# All allocations through log_mr are tracked and logged -buffer = rmm.DeviceBuffer(size=1024, mr=log_mr) - -# Get statistics -stats = stats_mr.allocation_counts -print(f"Peak bytes: {stats.peak_bytes}") - -# Profiling can also be used alongside event logging -rmm.statistics.enable_statistics() - -@rmm.statistics.profiler() -def my_function(): - return rmm.DeviceBuffer(size=1024, mr=log_mr) - -my_function() -print(rmm.statistics.default_profiler_records.report()) +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [combining-features]" +end-before: "# [/combining-features]" +dedent: +--- +``` ```` ````` @@ -424,75 +313,35 @@ print(rmm.statistics.default_profiler_records.report()) ### Debugging OOM Errors -```python -import rmm - -# Enable detailed logging -base_mr = rmm.mr.CudaAsyncMemoryResource() -stats_mr = rmm.mr.StatisticsResourceAdaptor(base_mr) -log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="oom_debug.csv") -rmm.set_logging_level("debug") - -# Run problematic code -try: - large_buffer = rmm.DeviceBuffer(size=100 * 2**30, mr=log_mr) # 100 GiB -except MemoryError as e: - stats = stats_mr.allocation_counts - print(f"Peak before OOM: {stats.peak_bytes / 
2**30:.2f} GiB") - print(f"Check oom_debug.csv for allocation history") - raise +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [debugging-oom]" +end-before: "# [/debugging-oom]" +dedent: +--- ``` ### Profiling Memory in Data Pipeline -```python -import rmm - -rmm.statistics.enable_statistics() - -@rmm.statistics.profiler() -def load_data(): - return rmm.DeviceBuffer(size=1000000) - -@rmm.statistics.profiler() -def process_data(buffer): - temp = rmm.DeviceBuffer(size=2000000) - result = rmm.DeviceBuffer(size=500000) - return result - -@rmm.statistics.profiler() -def save_data(buffer): - pass - -# Run pipeline -data = load_data() -result = process_data(data) -save_data(result) - -# Identify memory hotspots -print(rmm.statistics.default_profiler_records.report()) +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [profiling-pipeline]" +end-before: "# [/profiling-pipeline]" +dedent: +--- ``` ### Benchmarking Memory Resources -```python -import rmm -import time - -def benchmark_allocations(mr_name, mr): - start = time.time() - buffers = [] - for _ in range(1000): - buffers.append(rmm.DeviceBuffer(size=1024, mr=mr)) - end = time.time() - - print(f"{mr_name}: {(end - start) * 1000:.2f} ms for 1000 allocations") - -# Compare resources -benchmark_allocations("CudaMemoryResource", rmm.mr.CudaMemoryResource()) -benchmark_allocations("CudaAsyncMemoryResource", rmm.mr.CudaAsyncMemoryResource()) -pool_mr = rmm.mr.PoolMemoryResource(rmm.mr.CudaMemoryResource(), initial_pool_size=2**20) -benchmark_allocations("PoolMemoryResource", pool_mr) +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [benchmarking-resources]" +end-before: "# [/benchmarking-resources]" +dedent: +--- ``` ## Best Practices diff --git a/docs/user_guide/managed_memory.md b/docs/user_guide/managed_memory.md index fbaecdfbd..ec5fb4be8 100644 
--- a/docs/user_guide/managed_memory.md +++ b/docs/user_guide/managed_memory.md @@ -16,30 +16,24 @@ Prefetching migrates data to the GPU ahead of time so that kernels find it alrea {cpp:class}`~rmm::mr::prefetch_resource_adaptor` (C++) / {py:class}`~rmm.mr.PrefetchResourceAdaptor` (Python) wraps another resource and prefetches each allocation to the current device as soon as it's made. This works well when data is used on the GPU shortly after allocation, such as when copying or writing to the new allocation: -```python -import rmm - -managed_mr = rmm.mr.ManagedMemoryResource() -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(managed_mr) - -# This allocation is prefetched to the GPU automatically -buffer = rmm.DeviceBuffer(size=1000000, mr=prefetch_mr) +```{literalinclude} ../../python/rmm/rmm/tests/examples/managed_memory.py +--- +language: python +start-after: "# [prefetch-on-allocate]" +end-before: "# [/prefetch-on-allocate]" +dedent: +--- ``` Adding a pool between the managed resource and the prefetch adaptor avoids calling `cudaMallocManaged` on every allocation. The pool grabs large chunks of managed memory upfront, and the prefetch adaptor ensures each suballocation is migrated to the GPU before use. 
Non-allocating adaptors like logging or statistics can safely wrap the prefetch adaptor on the outside: -```python -import rmm - -managed_mr = rmm.mr.ManagedMemoryResource() -pool_mr = rmm.mr.PoolMemoryResource(managed_mr, initial_pool_size=2**30) -prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool_mr) - -# Logging and statistics don't allocate, so they can go on the outside -stats_mr = rmm.mr.StatisticsResourceAdaptor(prefetch_mr) -log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv") - -buffer = rmm.DeviceBuffer(size=1000000, mr=log_mr) +```{literalinclude} ../../python/rmm/rmm/tests/examples/managed_memory.py +--- +language: python +start-after: "# [prefetch-with-pool]" +end-before: "# [/prefetch-with-pool]" +dedent: +--- ``` ### Prefetch on Access (Lazy) @@ -47,34 +41,25 @@ buffer = rmm.DeviceBuffer(size=1000000, mr=log_mr) When you need control over exactly when data moves to the GPU — for instance because the allocation happens long before the kernel that consumes it — you can prefetch manually: `````{tabs} -````{code-tab} c++ -#include -#include -#include - -rmm::mr::managed_memory_resource managed_mr; -rmm::cuda_stream stream; -rmm::device_buffer buffer(1000000, stream.view(), managed_mr); - -// Prefetch to the current device on this stream -rmm::prefetch(buffer.data(), buffer.size(), - rmm::get_current_cuda_device(), stream.view()); - -// Kernel on the same stream finds the data already resident -launch_kernel<<>>(buffer.data()); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/managed_memory.cu +--- +language: cuda +start-after: "// [prefetch-on-access]" +end-before: "// [/prefetch-on-access]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm -from rmm.pylibrmm.stream import Stream - -managed_mr = rmm.mr.ManagedMemoryResource() -buffer = rmm.DeviceBuffer(size=1000000, mr=managed_mr) - -# Prefetch to device 0 on this stream -stream = Stream() -buffer.prefetch(device=0, stream=stream) - -# Kernel on the same 
stream finds the data already resident +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/managed_memory.py +--- +language: python +start-after: "# [prefetch-on-access]" +end-before: "# [/prefetch-on-access]" +dedent: +--- +``` ```` ````` diff --git a/docs/user_guide/stream_ordered_allocation.md b/docs/user_guide/stream_ordered_allocation.md index b025f3317..94b90c169 100644 --- a/docs/user_guide/stream_ordered_allocation.md +++ b/docs/user_guide/stream_ordered_allocation.md @@ -7,26 +7,25 @@ RMM containers ({cpp:class}`~rmm::device_buffer`, {py:class}`~rmm.DeviceBuffer`) When you allocate from a stream-ordered resource, the call returns a pointer immediately. The pointer value is available on the CPU right away — you can store it, pass it to kernel launch arguments, or hand it to another API. The memory backing behind the pointer becomes available for GPU operations enqueued on the same stream after the allocation: `````{tabs} -````{code-tab} c++ -#include -#include - -rmm::mr::cuda_async_memory_resource mr; -rmm::cuda_stream stream; -rmm::device_buffer buffer(1000, stream.view(), mr); - -// buffer.data() is usable immediately in stream-ordered operations -launch_kernel<<>>(buffer.data()); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/stream_ordered_allocation.cu +--- +language: cuda +start-after: "// [how-it-works]" +end-before: "// [/how-it-works]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm -from rmm.pylibrmm.stream import Stream - -mr = rmm.mr.CudaAsyncMemoryResource() -stream = Stream() -buffer = rmm.DeviceBuffer(size=1000, stream=stream, mr=mr) - -# buffer.ptr is usable immediately in stream-ordered operations +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [how-it-works]" +end-before: "# [/how-it-works]" +dedent: +--- +``` ```` ````` @@ -39,35 +38,25 @@ Deallocations are also 
stream-ordered: when a buffer is destroyed, the deallocat The pointer returned by a stream-ordered allocation is a CPU value — you can store it or pass it to other APIs without synchronization. However, the stream must be synchronized before the CPU reads data that was written by GPU operations on that stream. The most common case is a device-to-host copy followed by a sync: `````{tabs} -````{code-tab} c++ -rmm::mr::cuda_async_memory_resource mr; -rmm::cuda_stream stream; -rmm::device_buffer d_buf(1000 * sizeof(float), stream.view(), mr); - -// Launch kernel that writes to d_buf on stream ... - -// Copy results to host on the same stream -std::vector h_buf(1000); -cudaMemcpyAsync(h_buf.data(), d_buf.data(), d_buf.size(), - cudaMemcpyDeviceToHost, stream.value()); - -// Synchronize before reading h_buf on the CPU -stream.synchronize(); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/stream_ordered_allocation.cu +--- +language: cuda +start-after: "// [reading-results]" +end-before: "// [/reading-results]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm -from rmm.pylibrmm.stream import Stream - -mr = rmm.mr.CudaAsyncMemoryResource() -stream = Stream() -d_buf = rmm.DeviceBuffer(size=1000, stream=stream, mr=mr) - -# ... GPU work writes to d_buf on stream ... - -# Async copy to host on the same stream, then sync before reading -h_buf = bytearray(d_buf.size) -d_buf.copy_to_host(h_buf, stream) -stream.synchronize() +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [reading-results]" +end-before: "# [/reading-results]" +dedent: +--- +``` ```` ````` @@ -76,51 +65,25 @@ stream.synchronize() Memory allocated on one stream can only be safely used on a different stream after the allocation is known to have completed. The simplest approach is to synchronize the allocating stream, but that stalls the CPU. 
A lighter-weight alternative is to record a CUDA event on the allocating stream and have the consuming stream wait on it: `````{tabs} -````{code-tab} c++ -#include -#include - -rmm::mr::cuda_async_memory_resource mr; -rmm::cuda_stream stream_a; -rmm::cuda_stream stream_b; - -rmm::device_buffer buffer(1000, stream_a.view(), mr); - -// Record an event after the allocation on stream_a -cudaEvent_t event; -cudaEventCreateWithFlags(&event, cudaEventDisableTiming); -cudaEventRecord(event, stream_a.value()); - -// stream_b waits for the event — no CPU synchronization needed -cudaStreamWaitEvent(stream_b.value(), event); - -// Now safe to use buffer.data() in operations on stream_b -launch_kernel<<>>(buffer.data()); - -cudaEventDestroy(event); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/stream_ordered_allocation.cu +--- +language: cuda +start-after: "// [cross-stream]" +end-before: "// [/cross-stream]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm -from rmm.pylibrmm.stream import Stream -from cuda.core import Device - -dev = Device() -dev.set_current() - -mr = rmm.mr.CudaAsyncMemoryResource() -stream_a = dev.create_stream() -stream_b = dev.create_stream() - -buffer = rmm.DeviceBuffer(size=1000, stream=Stream(obj=stream_a), mr=mr) - -# Record an event after the allocation on stream_a -alloc_event = dev.create_event(options={"enable_timing": False}) -stream_a.record(alloc_event) - -# stream_b waits for the event — no CPU synchronization needed -stream_b.wait(alloc_event) - -# Now safe to use buffer.ptr in operations on stream_b +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [cross-stream]" +end-before: "# [/cross-stream]" +dedent: +--- +``` ```` ````` @@ -129,62 +92,25 @@ stream_b.wait(alloc_event) If a buffer is allocated and used on the same stream, deallocation is safe — stream ordering guarantees prior work completes first. 
The problem arises when a buffer is used on a *different* stream from the one it will be deallocated on. In that case, you need to ensure the consuming stream's work finishes before the buffer is destroyed. The same event pattern works here — record an event on the consuming stream and have the deallocating stream wait on it: `````{tabs} -````{code-tab} c++ -rmm::mr::cuda_async_memory_resource mr; -rmm::cuda_stream stream_a; -rmm::cuda_stream stream_b; - -rmm::device_buffer buffer(1000, stream_a.view(), mr); - -// Make stream_b wait for the allocation on stream_a -cudaEvent_t alloc_event; -cudaEventCreateWithFlags(&alloc_event, cudaEventDisableTiming); -cudaEventRecord(alloc_event, stream_a.value()); -cudaStreamWaitEvent(stream_b.value(), alloc_event); - -// Use buffer on stream_b -launch_kernel<<>>(buffer.data()); - -// Before destroying buffer, make stream_a wait for stream_b's work -cudaEvent_t done_event; -cudaEventCreateWithFlags(&done_event, cudaEventDisableTiming); -cudaEventRecord(done_event, stream_b.value()); -cudaStreamWaitEvent(stream_a.value(), done_event); - -// Now safe to destroy buffer — deallocation on stream_a is ordered after the kernel on stream_b -buffer = rmm::device_buffer{}; - -cudaEventDestroy(alloc_event); -cudaEventDestroy(done_event); +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/stream_ordered_allocation.cu +--- +language: cuda +start-after: "// [buffer-lifetime]" +end-before: "// [/buffer-lifetime]" +dedent: +--- +``` ```` -````{code-tab} python -import rmm -from rmm.pylibrmm.stream import Stream -from cuda.core import Device - -dev = Device() -dev.set_current() - -mr = rmm.mr.CudaAsyncMemoryResource() -stream_a = dev.create_stream() -stream_b = dev.create_stream() - -buffer = rmm.DeviceBuffer(size=1000, stream=Stream(obj=stream_a), mr=mr) - -# Make stream_b wait for the allocation on stream_a -alloc_event = dev.create_event(options={"enable_timing": False}) -stream_a.record(alloc_event) 
-stream_b.wait(alloc_event) - -# Use buffer on stream_b ... - -# Before destroying buffer, make stream_a wait for stream_b's work -done_event = dev.create_event(options={"enable_timing": False}) -stream_b.record(done_event) -stream_a.wait(done_event) - -# Now safe to destroy buffer -del buffer +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [buffer-lifetime]" +end-before: "# [/buffer-lifetime]" +dedent: +--- +``` ```` ````` @@ -200,30 +126,13 @@ del buffer This example allocates an RMM buffer and launches a Numba kernel on the same stream, so the allocation is guaranteed to complete before the kernel accesses the memory: -```python -import rmm -from rmm.pylibrmm.stream import Stream -from cuda.core import Device -from numba import cuda - -dev = Device() -dev.set_current() - -@cuda.jit -def kernel(data, n): - idx = cuda.grid(1) - if idx < n: - data[idx] = idx * 2 - -mr = rmm.mr.CudaAsyncMemoryResource() -stream = dev.create_stream() - -buffer = rmm.DeviceBuffer(size=1000 * 4, stream=Stream(obj=stream), mr=mr) - -numba_stream = cuda.external_stream(int(stream.handle)) -kernel[100, 10, numba_stream](cuda.as_cuda_array(buffer).view('float32'), 1000) - -stream.sync() +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [numba-stream]" +end-before: "# [/numba-stream]" +dedent: +--- ``` ## See Also diff --git a/python/rmm/rmm/tests/examples/choosing_memory_resources.py b/python/rmm/rmm/tests/examples/choosing_memory_resources.py new file mode 100644 index 000000000..1403b6281 --- /dev/null +++ b/python/rmm/rmm/tests/examples/choosing_memory_resources.py @@ -0,0 +1,172 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/choosing_memory_resources.md +# ruff: noqa: RUF059 + + +def recommended_default() -> None: + # [recommended-default] + import rmm + + mr = rmm.mr.CudaAsyncMemoryResource() + buffer = rmm.DeviceBuffer(size=1024, mr=mr) + # [/recommended-default] + + assert buffer.size == 1024 + + +def managed_pool_prefetch() -> None: + # [managed-pool-prefetch] + import rmm + + # Use 80% of GPU memory, rounded down to nearest 256 bytes + free_memory, total_memory = rmm.mr.available_device_memory() + pool_size = int(total_memory * 0.8) // 256 * 256 + + mr = rmm.mr.PrefetchResourceAdaptor( + rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), + initial_pool_size=pool_size, + ) + ) + # [/managed-pool-prefetch] + + buffer = rmm.DeviceBuffer(size=1024, mr=mr) + assert buffer.size == 1024 + + +def managed_memory_example() -> None: + # [managed-memory-example] + import rmm + + # Combine managed memory with a pool and prefetching for performance. + # Without prefetching, page faults cause significant overhead. 
+ base = rmm.mr.ManagedMemoryResource() + pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) + prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) + buffer = rmm.DeviceBuffer(size=1024, mr=prefetch_mr) + # [/managed-memory-example] + + assert buffer.size == 1024 + + +def pool_memory_example() -> None: + # [pool-memory-example] + import rmm + + pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), + initial_pool_size=2**32, # 4 GiB + maximum_pool_size=2**34, # 16 GiB + ) + buffer = rmm.DeviceBuffer(size=1024, mr=pool) + # [/pool-memory-example] + + assert buffer.size == 1024 + + +def composing_adaptor() -> None: + # [composing-adaptor] + # Adaptor wrapping a base resource + import rmm + + adaptor = rmm.mr.StatisticsResourceAdaptor( + rmm.mr.CudaAsyncMemoryResource() + ) + # [/composing-adaptor] + + _ = adaptor + + +def prefetch_composition() -> None: + # [prefetch-composition] + import rmm + + # Prefetch adaptor wrapping managed memory pool + base = rmm.mr.ManagedMemoryResource() + pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) + prefetch = rmm.mr.PrefetchResourceAdaptor(pool) + buffer = rmm.DeviceBuffer(size=1024, mr=prefetch) + # [/prefetch-composition] + + assert buffer.size == 1024 + + +def statistics_composition() -> None: + # [statistics-composition] + import rmm + + # Track allocation statistics (counts, peak, and total bytes) + base = rmm.mr.CudaAsyncMemoryResource() + stats_mr = rmm.mr.StatisticsResourceAdaptor(base) + buffer = rmm.DeviceBuffer(size=1024, mr=stats_mr) + # [/statistics-composition] + + assert buffer.size == 1024 + + +def logging_composition() -> None: + # [logging-composition] + import rmm + + # Log every allocation and deallocation to a file + base = rmm.mr.CudaAsyncMemoryResource() + logging_mr = rmm.mr.LoggingResourceAdaptor( + base, log_file_name="allocations.csv" + ) + buffer = rmm.DeviceBuffer(size=1024, mr=logging_mr) + # [/logging-composition] + + assert buffer.size == 1024 + + import os + + if 
os.path.exists("allocations.csv"): + os.remove("allocations.csv") + + +def multi_library_pytorch() -> None: + try: + import torch + except ImportError: + print("PyTorch not available, skipping multi_library_pytorch") + return + + # isort: off + # [multi-library-pytorch] + import rmm + import torch + from rmm.allocators.torch import rmm_torch_allocator + + # Configure RMM + rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + + # Configure PyTorch to allocate through RMM + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + # [/multi-library-pytorch] + # isort: on + + +def best_practices_set_early() -> None: + # [best-practices-set-early] + import rmm + + # Do this first, before any GPU allocations + rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + # [/best-practices-set-early] + + +if __name__ == "__main__": + recommended_default() + managed_pool_prefetch() + managed_memory_example() + pool_memory_example() + composing_adaptor() + prefetch_composition() + statistics_composition() + logging_composition() + multi_library_pytorch() + best_practices_set_early() + + print("All choosing_memory_resources examples passed.") diff --git a/python/rmm/rmm/tests/examples/guide.py b/python/rmm/rmm/tests/examples/guide.py new file mode 100644 index 000000000..1cc6ef0c4 --- /dev/null +++ b/python/rmm/rmm/tests/examples/guide.py @@ -0,0 +1,257 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/guide.md + + +def explicit_resource() -> None: + # [explicit-resource] + import rmm + + mr = rmm.mr.CudaAsyncMemoryResource() + + # Pass the resource explicitly + buffer = rmm.DeviceBuffer(size=1024, mr=mr) + # [/explicit-resource] + + assert buffer.size == 1024 + + +def current_device_resource() -> None: + # [current-device-resource] + import rmm + + async_mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(async_mr) + + # Allocations that don't specify a resource use the current device resource + mr = rmm.mr.get_current_device_resource() + # [/current-device-resource] + + assert mr is not None + + +def device_buffer_example() -> None: + # [device-buffer] + import rmm + + # Allocate 1024 bytes + buffer = rmm.DeviceBuffer(size=1024) + + # Access pointer and size + ptr = buffer.ptr + size = buffer.size + + # Resize (may reallocate) + buffer.resize(2048) + + # Copy construct (deep copy) + buffer2 = buffer.copy() + # [/device-buffer] + + assert buffer.size == 2048 + assert buffer2.size == 2048 + _ = ptr, size + + +def statistics_tracking() -> None: + # [statistics-tracking] + import rmm + + # Wrap base resource with statistics adaptor + cuda_mr = rmm.mr.CudaAsyncMemoryResource() + stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) + + # Allocate using the statistics-wrapped resource + buffer = rmm.DeviceBuffer(size=1024, mr=stats_mr) + + # Get statistics + stats = stats_mr.allocation_counts + print(f"Current bytes: {stats.current_bytes}") + print(f"Peak bytes: {stats.peak_bytes}") + print(f"Total bytes: {stats.total_bytes}") + # [/statistics-tracking] + + assert stats.current_bytes >= 1024 + _ = buffer + + +def logging_example() -> None: + # [logging] + import rmm + + base_mr = rmm.mr.CudaAsyncMemoryResource() + log_mr = rmm.mr.LoggingResourceAdaptor( + base_mr, log_file_name="allocations.csv" + ) + + # Allocations through log_mr are logged to CSV + buffer = 
rmm.DeviceBuffer(size=1024, mr=log_mr) + # [/logging] + + assert buffer.size == 1024 + + import os + + if os.path.exists("allocations.csv"): + os.remove("allocations.csv") + + +def composing_resources() -> None: + # [composing-resources] + import rmm + + # Base resource + cuda_mr = rmm.mr.CudaMemoryResource() + + # Add pool + pool_mr = rmm.mr.PoolMemoryResource(cuda_mr, initial_pool_size=2**30) + + # Add statistics + stats_mr = rmm.mr.StatisticsResourceAdaptor(pool_mr) + + # Add logging + log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv") + + # Use log_mr for allocations — all allocations are pooled, tracked, and logged + buffer = rmm.DeviceBuffer(size=1024, mr=log_mr) + # [/composing-resources] + + assert buffer.size == 1024 + + import os + + if os.path.exists("log.csv"): + os.remove("log.csv") + + +def cupy_example() -> None: + try: + import cupy as cp + except ImportError: + print("CuPy not available, skipping cupy_example") + return + + # isort: off + # [cupy] + import rmm + import cupy as cp + from rmm.allocators.cupy import rmm_cupy_allocator + + # Configure RMM + mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(mr) + + # Set CuPy to use RMM + cp.cuda.set_allocator(rmm_cupy_allocator) + + # All CuPy arrays now use RMM + array = cp.zeros(1000) + # [/cupy] + # isort: on + + assert array.shape == (1000,) + + +def numba_example() -> None: + try: + from numba import cuda + except ImportError: + print("Numba not available, skipping numba_example") + return + + # isort: off + # [numba] + from numba import cuda + from rmm.allocators.numba import RMMNumbaManager + import rmm + + # Configure RMM + mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(mr) + + # Set Numba to use RMM + cuda.set_memory_manager(RMMNumbaManager) + # [/numba] + # isort: on + + +def pytorch_example() -> None: + try: + import torch + except ImportError: + print("PyTorch not available, skipping pytorch_example") + return + + # 
isort: off + # [pytorch] + import rmm + import torch + from rmm.allocators.torch import rmm_torch_allocator + + # Configure RMM + mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(mr) + + # Set PyTorch to use RMM + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + # All PyTorch tensors now use RMM + tensor = torch.zeros(1000, device="cuda") + # [/pytorch] + # isort: on + + assert tensor.shape == (1000,) + + +def multi_device_example() -> None: + try: + from cuda.bindings import runtime + except ImportError: + print("cuda.bindings not available, skipping multi_device_example") + return + + _, num_devices = runtime.cudaGetDeviceCount() + if num_devices < 1: + print("No CUDA devices, skipping multi_device_example") + return + + # isort: off + # [multi-device] + import rmm + from cuda.bindings import runtime + + _, num_devices = runtime.cudaGetDeviceCount() + + # Store resources to maintain lifetime + resources = [] + + for device_id in range(num_devices): + # Create resource for this device + mr = rmm.mr.CudaAsyncMemoryResource() + resources.append(mr) + + # Set as per-device resource + rmm.mr.set_per_device_resource(device_id, mr) + + # Use device 0 + buffer = rmm.DeviceBuffer(size=1024) # Uses device 0's resource + # [/multi-device] + # isort: on + + assert buffer.size == 1024 + + +if __name__ == "__main__": + explicit_resource() + current_device_resource() + device_buffer_example() + statistics_tracking() + logging_example() + composing_resources() + cupy_example() + numba_example() + pytorch_example() + multi_device_example() + + print("All guide examples passed.") diff --git a/python/rmm/rmm/tests/examples/installation.py b/python/rmm/rmm/tests/examples/installation.py new file mode 100644 index 000000000..37a1854b8 --- /dev/null +++ b/python/rmm/rmm/tests/examples/installation.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/installation.md + + +def test_installation() -> None: + # [test-installation] + import rmm + + print(rmm.__version__) + + # Quick test + buffer = rmm.DeviceBuffer(size=100) + print(f"Allocated {buffer.size} bytes") + # [/test-installation] + + assert buffer.size == 100 + + +if __name__ == "__main__": + test_installation() + + print("All installation examples passed.") diff --git a/python/rmm/rmm/tests/examples/introduction.py b/python/rmm/rmm/tests/examples/introduction.py new file mode 100644 index 000000000..52cd5850f --- /dev/null +++ b/python/rmm/rmm/tests/examples/introduction.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/introduction.md + +# [basic-example] +import rmm + +mr = rmm.mr.CudaAsyncMemoryResource() +buffer = rmm.DeviceBuffer(size=1024, mr=mr) +# [/basic-example] + +assert buffer.size == 1024 diff --git a/python/rmm/rmm/tests/examples/logging.py b/python/rmm/rmm/tests/examples/logging.py new file mode 100644 index 000000000..5fab3d6f1 --- /dev/null +++ b/python/rmm/rmm/tests/examples/logging.py @@ -0,0 +1,387 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/logging.md + +import os + + +def logging_adaptor() -> None: + # [logging-adaptor] + import rmm + + base_mr = rmm.mr.CudaAsyncMemoryResource() + log_mr = rmm.mr.LoggingResourceAdaptor( + base_mr, log_file_name="memory_log.csv" + ) + + # Allocations through log_mr are logged to CSV + buf1 = rmm.DeviceBuffer(size=1024, mr=log_mr) + buf2 = rmm.DeviceBuffer(size=2048, mr=log_mr) + # [/logging-adaptor] + + assert buf1.size == 1024 + assert buf2.size == 2048 + + if os.path.exists("memory_log.csv"): + os.remove("memory_log.csv") + + +def statistics_adaptor() -> None: + # [statistics-adaptor] + import rmm + + cuda_mr = rmm.mr.CudaAsyncMemoryResource() + stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) + + # Allocate using the statistics-wrapped resource + buf1 = rmm.DeviceBuffer(size=1024, mr=stats_mr) + buf2 = rmm.DeviceBuffer(size=2048, mr=stats_mr) + + # Get statistics + stats = stats_mr.allocation_counts + print(f"Current bytes: {stats.current_bytes}") + print(f"Peak bytes: {stats.peak_bytes}") + print(f"Total allocations: {stats.total_count}") + # [/statistics-adaptor] + + assert stats.current_bytes >= 1024 + _ = buf1, buf2 + + +def statistics_global() -> None: + # [statistics-global] + import rmm + + # Enable statistics globally + rmm.statistics.enable_statistics() + + # Or use context manager for specific code blocks + with rmm.statistics.statistics(): + buffer = rmm.DeviceBuffer(size=1024) + + stats = rmm.statistics.get_statistics() + assert stats is not None + print(f"Current bytes: {stats.current_bytes}") + print(f"Peak bytes: {stats.peak_bytes}") + print(f"Total allocations: {stats.total_count}") + # [/statistics-global] + + _ = buffer + + +def tracking_memory_growth() -> None: + # [tracking-memory-growth] + import rmm + + rmm.statistics.enable_statistics() + + def checkpoint(label) -> None: + stats = rmm.statistics.get_statistics() + assert stats is not None + print(f"{label}:") + 
# NOTE(review): the lines below are the tail of a definition whose header lies
# before this chunk. They are kept verbatim (one original patch line per
# comment line) because they cannot be re-indented reliably without the
# missing header.
#
#   print(
#   f" Current: {stats.current_bytes:,} bytes ({stats.current_count} allocations)"
#   )
#   print(f" Peak: {stats.peak_bytes:,} bytes")
#
#   checkpoint("Start")
#
#   # Allocate
#   buffers = [rmm.DeviceBuffer(size=1024 * 1024) for _ in range(10)]
#   checkpoint("After 10x1MB allocations")
#
#   # Free some
#   buffers = buffers[:5]
#   checkpoint("After freeing 5")
#
#   # Allocate more
#   buffers.extend([rmm.DeviceBuffer(size=2 * 1024 * 1024) for _ in range(5)])
#   checkpoint("After 5x2MB allocations")
#   # [/tracking-memory-growth]


def _remove_log_files(*paths: str) -> None:
    """Delete every file in *paths* that exists; absent files are ignored."""
    for path in paths:
        if os.path.exists(path):
            os.remove(path)


def profiling_functions() -> None:
    """Profile a decorated function and print the default profiler report."""
    # [profiling-functions]
    import rmm

    # Enable statistics first
    rmm.statistics.enable_statistics()

    # Profile a function
    @rmm.statistics.profiler()
    def process_data(size):
        buffer = rmm.DeviceBuffer(size=size)
        # ... processing ...
        return buffer

    # Run function
    process_data(1000000)

    # View report
    print(rmm.statistics.default_profiler_records.report())
    # [/profiling-functions]


def profiling_code_blocks() -> None:
    """Profile two named ``with`` blocks and print the default report."""
    # [profiling-code-blocks]
    import rmm

    rmm.statistics.enable_statistics()

    # Profile specific code blocks
    with rmm.statistics.profiler(name="data loading"):
        data = rmm.DeviceBuffer(size=1000000)

    with rmm.statistics.profiler(name="processing"):
        buffer1 = rmm.DeviceBuffer(size=500000)
        buffer2 = rmm.DeviceBuffer(size=500000)

    # View report
    print(rmm.statistics.default_profiler_records.report())
    # [/profiling-code-blocks]

    # Reference the buffers so linters do not flag them as unused
    _ = data, buffer1, buffer2


def nested_profiling() -> None:
    """Profile an ``inner`` block nested in an ``outer`` block."""
    # [nested-profiling]
    import rmm

    rmm.statistics.enable_statistics()

    with rmm.statistics.profiler(name="outer"):
        buffer1 = rmm.DeviceBuffer(size=1000)

        with rmm.statistics.profiler(name="inner"):
            buffer2 = rmm.DeviceBuffer(size=2000)

        # NOTE(review): nesting of this line under "outer" is inferred; the
        # collapsed source does not preserve indentation -- confirm.
        buffer3 = rmm.DeviceBuffer(size=500)

    print(rmm.statistics.default_profiler_records.report())
    # [/nested-profiling]

    _ = buffer1, buffer2, buffer3


def custom_profiler_records() -> None:
    """Collect profiler output in a dedicated ``ProfilerRecords`` object."""
    # [custom-profiler-records]
    import rmm

    rmm.statistics.enable_statistics()

    # Create custom profiler records
    custom_records = rmm.statistics.ProfilerRecords()

    # Use with context manager
    with rmm.statistics.profiler(name="my operation", records=custom_records):
        buffer = rmm.DeviceBuffer(size=1024)

    # View only custom records
    print(custom_records.report())
    # [/custom-profiler-records]

    _ = buffer


def debug_log_level() -> None:
    """Set the logging level to ``trace`` and then restore ``info``."""
    # [debug-log-level]
    import rmm

    # Available levels: trace, debug, info, warn, error, critical, off
    rmm.set_logging_level(rmm.level_enum.trace)
    # [/debug-log-level]

    # Reset to default
    rmm.set_logging_level(rmm.level_enum.info)


def combining_features() -> None:
    """Stack statistics and logging adaptors and use the profiler with them.

    The log file and the logging level are restored/removed afterwards.
    """
    # [combining-features]
    import rmm

    # Set debug log level
    rmm.set_logging_level(rmm.level_enum.debug)

    # Build resource stack: statistics + logging
    cuda_mr = rmm.mr.CudaAsyncMemoryResource()
    stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr)
    log_mr = rmm.mr.LoggingResourceAdaptor(
        stats_mr, log_file_name="events.csv"
    )

    # All allocations through log_mr are tracked and logged
    buffer = rmm.DeviceBuffer(size=1024, mr=log_mr)

    # Get statistics
    stats = stats_mr.allocation_counts
    print(f"Peak bytes: {stats.peak_bytes}")

    # Profiling can also be used alongside event logging
    rmm.statistics.enable_statistics()

    @rmm.statistics.profiler()
    def my_function():
        return rmm.DeviceBuffer(size=1024, mr=log_mr)

    my_function()
    print(rmm.statistics.default_profiler_records.report())
    # [/combining-features]

    # Reset to default
    rmm.set_logging_level(rmm.level_enum.info)
    _ = buffer

    # The Python LoggingResourceAdaptor appends a device-id suffix to the
    # requested name (e.g. "events.dev0.csv"), so ask the adaptor for the
    # path it really wrote instead of only checking the literal name.
    _remove_log_files("events.csv", log_mr.get_file_name())


def debugging_oom() -> None:
    """Show how to log allocations and report the peak after a MemoryError.

    The ``MemoryError`` is re-raised; the ``__main__`` section of this file
    intentionally does not call this function.
    """
    # [debugging-oom]
    import rmm

    # Enable detailed logging
    base_mr = rmm.mr.CudaAsyncMemoryResource()
    stats_mr = rmm.mr.StatisticsResourceAdaptor(base_mr)
    log_mr = rmm.mr.LoggingResourceAdaptor(
        stats_mr, log_file_name="oom_debug.csv"
    )
    rmm.set_logging_level(rmm.level_enum.debug)

    # Run problematic code
    try:
        large_buffer = rmm.DeviceBuffer(size=100 * 2**30, mr=log_mr)  # noqa: F841
    except MemoryError:
        stats = stats_mr.allocation_counts
        print(f"Peak before OOM: {stats.peak_bytes / 2**30:.2f} GiB")
        print("Check oom_debug.csv for allocation history")
        raise
    # [/debugging-oom]


def profiling_pipeline() -> None:
    """Profile a three-stage load/process/save pipeline and print the report."""
    # [profiling-pipeline]
    import rmm

    rmm.statistics.enable_statistics()

    @rmm.statistics.profiler()
    def load_data():
        return rmm.DeviceBuffer(size=1000000)

    @rmm.statistics.profiler()
    def process_data(buffer):
        temp = rmm.DeviceBuffer(size=2000000)  # noqa: F841
        result = rmm.DeviceBuffer(size=500000)
        return result

    @rmm.statistics.profiler()
    def save_data(buffer):
        pass

    # Run pipeline
    data = load_data()
    result = process_data(data)
    save_data(result)

    # Identify memory hotspots
    print(rmm.statistics.default_profiler_records.report())
    # [/profiling-pipeline]


def benchmarking_resources() -> None:
    """Time 1000 small allocations on three different memory resources."""
    # isort: off
    # [benchmarking-resources]
    import rmm
    import time

    def benchmark_allocations(mr_name, mr) -> None:
        # perf_counter is a monotonic, high-resolution clock; time.time() can
        # jump when the system clock is adjusted and is coarser on some hosts.
        start = time.perf_counter()
        buffers = []
        for _ in range(1000):
            buffers.append(rmm.DeviceBuffer(size=1024, mr=mr))
        end = time.perf_counter()

        print(f"{mr_name}: {(end - start) * 1000:.2f} ms for 1000 allocations")

    # Compare resources
    benchmark_allocations("CudaMemoryResource", rmm.mr.CudaMemoryResource())
    benchmark_allocations(
        "CudaAsyncMemoryResource", rmm.mr.CudaAsyncMemoryResource()
    )
    pool_mr = rmm.mr.PoolMemoryResource(
        rmm.mr.CudaMemoryResource(), initial_pool_size=2**20
    )
    benchmark_allocations("PoolMemoryResource", pool_mr)
    # [/benchmarking-resources]
    # isort: on


def analyzing_logs() -> None:
    """Generate a small allocation log and analyze it with pandas.

    Prints a notice and returns early when pandas is not installed. All log
    files created here are removed before returning.
    """
    # Generate a sample log file for the example
    import shutil

    import rmm

    base_mr = rmm.mr.CudaAsyncMemoryResource()
    log_mr = rmm.mr.LoggingResourceAdaptor(
        base_mr, log_file_name="memory_log.csv"
    )
    buf1 = rmm.DeviceBuffer(size=1024, mr=log_mr)
    buf2 = rmm.DeviceBuffer(size=2048, mr=log_mr)
    del buf1

    # Make sure the rows recorded so far are on disk before they are read.
    log_mr.flush()
    # The adaptor appends a device-id suffix to the requested name, so the
    # file it wrote may not be "memory_log.csv" itself.
    actual_log = log_mr.get_file_name()

    try:
        import pandas as pd  # type: ignore[import-untyped]
    except ImportError:
        print("pandas not available, skipping analyzing_logs")
        _remove_log_files("memory_log.csv", actual_log)
        return

    # Stage the real log under the name the documented snippet reads.
    if os.path.abspath(actual_log) != os.path.abspath("memory_log.csv"):
        shutil.copyfile(actual_log, "memory_log.csv")

    # [analyzing-logs]
    import pandas as pd

    # Read log file
    df = pd.read_csv("memory_log.csv")

    # Total bytes allocated
    total_allocated = df[df["Action"] == "allocate"]["Size"].sum()
    print(f"Total allocated: {total_allocated:,} bytes")

    # Allocation size distribution
    print(df[df["Action"] == "allocate"]["Size"].describe())

    # Peak memory usage (simple analysis)
    df["Delta"] = df.apply(
        lambda row: row["Size"]
        if row["Action"] == "allocate"
        else -row["Size"],
        axis=1,
    )
    df["Cumulative"] = df["Delta"].cumsum()
    peak = df["Cumulative"].max()
    print(f"Peak usage: {peak:,} bytes")
    # [/analyzing-logs]

    _ = buf2

    _remove_log_files("memory_log.csv", actual_log)


if __name__ == "__main__":
    logging_adaptor()
    statistics_adaptor()
    statistics_global()
    tracking_memory_growth()
    profiling_functions()
    profiling_code_blocks()
    nested_profiling()
    custom_profiler_records()
    debug_log_level()
    combining_features()
    # debugging_oom() — intentionally skipped (raises MemoryError)
    profiling_pipeline()
    benchmarking_resources()
    analyzing_logs()

    print("All logging examples passed.")

# ---------------------------------------------------------------------------
# Patch metadata for the next file, preserved verbatim from the collapsed
# source (line counts refer to the original patch, not to this rewrite):
# diff --git a/python/rmm/rmm/tests/examples/managed_memory.py b/python/rmm/rmm/tests/examples/managed_memory.py
# new file mode 100644
# index 000000000..642cd390d
# --- /dev/null
# +++ b/python/rmm/rmm/tests/examples/managed_memory.py
# @@ -0,0 +1,65 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

# Code examples for docs/user_guide/managed_memory.md

import os


def prefetch_on_allocate() -> None:
    """Allocate through a prefetch adaptor wrapping managed memory."""
    # [prefetch-on-allocate]
    import rmm

    managed_mr = rmm.mr.ManagedMemoryResource()
    prefetch_mr = rmm.mr.PrefetchResourceAdaptor(managed_mr)

    # This allocation is prefetched to the GPU automatically
    buffer = rmm.DeviceBuffer(size=1000000, mr=prefetch_mr)
    # [/prefetch-on-allocate]

    assert buffer.size == 1000000


def prefetch_with_pool() -> None:
    """Allocate through managed -> pool -> prefetch -> statistics -> logging.

    The log file written by the logging adaptor is removed afterwards.
    """
    # [prefetch-with-pool]
    import rmm

    managed_mr = rmm.mr.ManagedMemoryResource()
    pool_mr = rmm.mr.PoolMemoryResource(managed_mr, initial_pool_size=2**30)
    prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool_mr)

    # Logging and statistics don't allocate, so they can go on the outside
    stats_mr = rmm.mr.StatisticsResourceAdaptor(prefetch_mr)
    log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv")

    buffer = rmm.DeviceBuffer(size=1000000, mr=log_mr)
    # [/prefetch-with-pool]

    assert buffer.size == 1000000

    # The Python LoggingResourceAdaptor appends a device-id suffix to the
    # requested name (e.g. "log.dev0.csv"), so remove the path the adaptor
    # reports in addition to the literal name.
    for log_path in ("log.csv", log_mr.get_file_name()):
        if os.path.exists(log_path):
            os.remove(log_path)


def prefetch_on_access() -> None:
    """Explicitly prefetch a managed-memory buffer to device 0 on a stream."""
    # [prefetch-on-access]
    import rmm
    from rmm.pylibrmm.stream import Stream

    managed_mr = rmm.mr.ManagedMemoryResource()
    buffer = rmm.DeviceBuffer(size=1000000, mr=managed_mr)

    # Prefetch to device 0 on this stream
    stream = Stream()
    buffer.prefetch(device=0, stream=stream)

    # Kernel on the same stream finds the data already resident
    # [/prefetch-on-access]


if __name__ == "__main__":
    prefetch_on_allocate()
    prefetch_with_pool()
    prefetch_on_access()

    print("All managed_memory examples passed.")

# ---------------------------------------------------------------------------
# Patch metadata for the next file, preserved verbatim from the collapsed
# source (line counts refer to the original patch, not to this rewrite):
# diff --git a/python/rmm/rmm/tests/examples/stream_ordered_allocation.py b/python/rmm/rmm/tests/examples/stream_ordered_allocation.py
# new file mode 100644
# index 000000000..a7c45d7f0
# --- /dev/null
# +++ b/python/rmm/rmm/tests/examples/stream_ordered_allocation.py
# @@ -0,0 +1,149 @@
# SPDX-FileCopyrightText:
# Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

# Code examples for docs/user_guide/stream_ordered_allocation.md


def how_it_works() -> None:
    """Allocate a buffer on an explicit stream from the async resource."""
    # [how-it-works]
    import rmm
    from rmm.pylibrmm.stream import Stream

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream = Stream()
    buffer = rmm.DeviceBuffer(size=1000, stream=stream, mr=mr)

    # buffer.ptr is usable immediately in stream-ordered operations
    # [/how-it-works]

    assert buffer.size == 1000


def reading_results() -> None:
    """Copy a device buffer to host on its stream, then synchronize."""
    # [reading-results]
    import rmm
    from rmm.pylibrmm.stream import Stream

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream = Stream()
    d_buf = rmm.DeviceBuffer(size=1000, stream=stream, mr=mr)

    # ... GPU work writes to d_buf on stream ...

    # Async copy to host on the same stream, then sync before reading
    h_buf = bytearray(d_buf.size)
    d_buf.copy_to_host(h_buf, stream)
    stream.synchronize()
    # [/reading-results]


def cross_stream() -> None:
    """Order a second stream after an allocation on the first via an event."""
    # isort: off
    # [cross-stream]
    import rmm
    from rmm.pylibrmm.stream import Stream
    from cuda.core import Device

    dev = Device()
    dev.set_current()

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream_a = dev.create_stream()
    stream_b = dev.create_stream()

    buffer = rmm.DeviceBuffer(size=1000, stream=Stream(obj=stream_a), mr=mr)

    # Record an event after the allocation on stream_a
    alloc_event = dev.create_event(options={"enable_timing": False})
    stream_a.record(alloc_event)

    # stream_b waits for the event — no CPU synchronization needed
    stream_b.wait(alloc_event)

    # Now safe to use buffer.ptr in operations on stream_b
    # [/cross-stream]
    # isort: on

    assert buffer.size == 1000


def buffer_lifetime() -> None:
    """Order both allocation and destruction of a buffer across two streams."""
    # isort: off
    # [buffer-lifetime]
    import rmm
    from rmm.pylibrmm.stream import Stream
    from cuda.core import Device

    dev = Device()
    dev.set_current()

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream_a = dev.create_stream()
    stream_b = dev.create_stream()

    buffer = rmm.DeviceBuffer(size=1000, stream=Stream(obj=stream_a), mr=mr)

    # Make stream_b wait for the allocation on stream_a
    alloc_event = dev.create_event(options={"enable_timing": False})
    stream_a.record(alloc_event)
    stream_b.wait(alloc_event)

    # Use buffer on stream_b ...

    # Before destroying buffer, make stream_a wait for stream_b's work
    done_event = dev.create_event(options={"enable_timing": False})
    stream_b.record(done_event)
    stream_a.wait(done_event)

    # Now safe to destroy buffer
    del buffer
    # [/buffer-lifetime]
    # isort: on


def numba_stream_example() -> None:
    """Launch a Numba kernel on the stream an RMM buffer was allocated on.

    Prints a notice and returns early when Numba cannot be imported.
    """
    try:
        from numba import cuda
    except ImportError:
        print("Numba not available, skipping numba_stream_example")
        return

    # isort: off
    # [numba-stream]
    import rmm
    from rmm.pylibrmm.stream import Stream
    from cuda.core import Device
    from numba import cuda

    dev = Device()
    dev.set_current()

    @cuda.jit
    def kernel(data, n):
        idx = cuda.grid(1)
        if idx < n:
            data[idx] = idx * 2

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream = dev.create_stream()

    buffer = rmm.DeviceBuffer(size=1000 * 4, stream=Stream(obj=stream), mr=mr)

    numba_stream = cuda.external_stream(int(stream.handle))
    kernel[100, 10, numba_stream](
        cuda.as_cuda_array(buffer).view("float32"), 1000
    )

    stream.sync()
    # [/numba-stream]
    # isort: on


if __name__ == "__main__":
    how_it_works()
    reading_results()
    cross_stream()
    buffer_lifetime()
    numba_stream_example()

    print("All stream_ordered_allocation examples passed.")

# ---------------------------------------------------------------------------
# Patch metadata for the next file, preserved verbatim from the collapsed
# source (line counts refer to the original patch, not to this rewrite):
# diff --git a/python/rmm/rmm/tests/test_doc_examples.py b/python/rmm/rmm/tests/test_doc_examples.py
# new file mode 100644
# index 000000000..6130b3876
# --- /dev/null
# +++ b/python/rmm/rmm/tests/test_doc_examples.py
# @@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

import subprocess
import sys
import tempfile
from pathlib import Path

import pytest

EXAMPLES_DIR = Path(__file__).parent / "examples"

EXAMPLE_SCRIPTS = sorted(EXAMPLES_DIR.glob("*.py"))


@pytest.mark.parametrize(
    "script",
    EXAMPLE_SCRIPTS,
    ids=[s.stem for s in EXAMPLE_SCRIPTS],
)
def test_doc_example(script):
    """Run one documentation example script and require a zero exit code.

    The script runs in a fresh interpreter with a throwaway working
    directory, so any files it writes with relative paths do not land in the
    caller's directory. On failure the assertion message carries both
    captured stdout and stderr.

    Parameters
    ----------
    script : path-like
        Path of the example script to execute.

    Raises
    ------
    AssertionError
        If the script exits with a non-zero status.
    subprocess.TimeoutExpired
        If the script runs for more than 120 seconds.
    """
    # Resolve first: a relative path would break once cwd is changed below.
    script_path = Path(script).resolve()

    with tempfile.TemporaryDirectory() as scratch_dir:
        result = subprocess.run(
            [sys.executable, str(script_path)],
            capture_output=True,
            text=True,
            timeout=120,
            cwd=scratch_dir,
        )

    assert result.returncode == 0, (
        f"{script_path.name} failed (exit {result.returncode}):\n"
        f"--- stdout ---\n{result.stdout}\n"
        f"--- stderr ---\n{result.stderr}"
    )