diff --git a/cpp/examples/README.md b/cpp/examples/README.md index 138f3ac0e..4f0624478 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -5,3 +5,4 @@ This folder contains examples to demonstrate librmm use cases. Running `build.sh Current examples: - Basic: demonstrates memory resource construction and allocating a `device_uvector` on a stream. +- Docs: code examples from the [user guide](../../docs/user_guide/), included via `literalinclude` directives. diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 0545b1e35..cfbb9f8cf 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -1,5 +1,5 @@ #!/bin/bash -# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2025-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # librmm examples build script @@ -58,3 +58,4 @@ build_example() { } build_example basic +build_example docs diff --git a/cpp/examples/docs/CMakeLists.txt b/cpp/examples/docs/CMakeLists.txt new file mode 100644 index 000000000..ee3ffdf12 --- /dev/null +++ b/cpp/examples/docs/CMakeLists.txt @@ -0,0 +1,39 @@ +# cmake-format: off +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 +# cmake-format: on + +cmake_minimum_required(VERSION 3.30.4) + +include(../set_cuda_architecture.cmake) + +# initialize CUDA architectures +rapids_cuda_init_architectures(docs_examples) + +project( + docs_examples + VERSION 0.0.1 + LANGUAGES CXX CUDA) + +include(../fetch_dependencies.cmake) + +include(rapids-cmake) +rapids_cmake_build_type("Release") + +# One executable per user guide page +set(DOCS_CUDA_EXAMPLES guide stream_ordered_allocation managed_memory) +set(DOCS_CXX_EXAMPLES introduction choosing_memory_resources logging installation) + +foreach(example ${DOCS_CUDA_EXAMPLES}) + add_executable(docs_${example} src/${example}.cu) + target_link_libraries(docs_${example} PRIVATE rmm::rmm) + target_compile_features(docs_${example} PRIVATE cxx_std_17) + install(TARGETS docs_${example} DESTINATION bin/examples/librmm) +endforeach() + +foreach(example ${DOCS_CXX_EXAMPLES}) + add_executable(docs_${example} src/${example}.cpp) + target_link_libraries(docs_${example} PRIVATE rmm::rmm) + target_compile_features(docs_${example} PRIVATE cxx_std_17) + install(TARGETS docs_${example} DESTINATION bin/examples/librmm) +endforeach() diff --git a/cpp/examples/docs/src/choosing_memory_resources.cpp b/cpp/examples/docs/src/choosing_memory_resources.cpp new file mode 100644 index 000000000..d8bb33f1d --- /dev/null +++ b/cpp/examples/docs/src/choosing_memory_resources.cpp @@ -0,0 +1,72 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/choosing_memory_resources.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +void recommended_default() +{ + // clang-format off + // [recommended-default] + #include + #include + + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), mr); + // [/recommended-default] + // clang-format on + + assert(buffer.size() == 1024); +} + +void managed_pool_prefetch() +{ + // clang-format off + // [managed-pool-prefetch] + #include + #include + #include + #include + #include + + // Use 80% of GPU memory, rounded down to nearest 256 bytes + auto [free_memory, total_memory] = rmm::available_device_memory(); + auto pool_size = rmm::align_down(static_cast(total_memory * 0.8), 256); + + rmm::mr::managed_memory_resource managed_mr; + rmm::mr::pool_memory_resource pool_mr{managed_mr, pool_size}; + rmm::mr::prefetch_resource_adaptor prefetch_mr{pool_mr}; + // [/managed-pool-prefetch] + // clang-format on + + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), prefetch_mr); + assert(buffer.size() == 1024); +} + +int main() +{ + recommended_default(); + managed_pool_prefetch(); + + std::cout << "All choosing_memory_resources examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/guide.cu b/cpp/examples/docs/src/guide.cu new file mode 100644 index 000000000..677914a74 --- /dev/null +++ b/cpp/examples/docs/src/guide.cu @@ -0,0 +1,312 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/guide.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include + +__global__ void trivial_kernel(int* data) { data[0] = 42; } + +void explicit_resource() +{ + // clang-format off + // [explicit-resource] + rmm::mr::cuda_async_memory_resource async_mr; + rmm::cuda_stream stream; + + // Pass the resource explicitly + rmm::device_buffer buffer(1024, stream.view(), async_mr); + // [/explicit-resource] + // clang-format on + + assert(buffer.size() == 1024); +} + +void current_device_resource() +{ + // clang-format off + // [current-device-resource] + #include + #include + + rmm::mr::cuda_async_memory_resource async_mr; + rmm::mr::set_current_device_resource_ref(async_mr); + + // Allocations that don't specify a resource use the current device resource + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource_ref(); + // [/current-device-resource] + // clang-format on + + (void)mr; +} + +void device_buffer_example() +{ + // clang-format off + // [device-buffer] + #include + + rmm::cuda_stream stream; + + // Allocate 1024 bytes + rmm::device_buffer buffer(1024, stream.view()); + + // Access pointer and size + void* ptr = buffer.data(); + std::size_t size = buffer.size(); + + // Resize (may reallocate) + buffer.resize(2048, stream.view()); + + // Copy construct (deep copy) + rmm::device_buffer buffer2(buffer, stream.view()); + // [/device-buffer] + // clang-format on + + assert(buffer.size() == 2048); + assert(buffer2.size() == 2048); + (void)ptr; + (void)size; +} + +void device_uvector_example() +{ + // clang-format off + // [device-uvector] + #include + #include + #include + + rmm::cuda_stream stream; + + // Allocate 100 elements + rmm::device_uvector vec(100, stream.view()); + + // Access as pointer + int* ptr = vec.data(); + + // Access as iterators + auto begin = vec.begin(); + auto end = vec.end(); + + // 
Initialize with Thrust + thrust::fill(rmm::exec_policy(stream.view()), vec.begin(), vec.end(), 42); + + // Resize + vec.resize(200, stream.view()); + // [/device-uvector] + // clang-format on + + assert(vec.size() == 200); + (void)ptr; + (void)begin; + (void)end; +} + +void device_scalar_example() +{ + // clang-format off + // [device-scalar] + #include + + rmm::cuda_stream stream; + + // Allocate single int + rmm::device_scalar scalar(stream.view()); + + // Set value from host (async on stream) + scalar.set_value(42, stream.view()); + + // Get value to host (copies on stream, then synchronizes the stream before returning) + int value = scalar.value(stream.view()); + + // Access device pointer + int* d_ptr = scalar.data(); + + // Pass to kernel + trivial_kernel<<<1, 1, 0, stream.value()>>>(scalar.data()); + // [/device-scalar] + // clang-format on + + stream.synchronize(); + assert(value == 42); + (void)d_ptr; +} + +void statistics_tracking() +{ + // clang-format off + // [statistics-tracking] + #include + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; + + // Allocate using the statistics-wrapped resource + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), stats_mr); + + // Get statistics + auto bytes = stats_mr.get_bytes_counter(); + std::cout << "Current bytes: " << bytes.value << "\n"; + std::cout << "Peak bytes: " << bytes.peak << "\n"; + std::cout << "Total bytes: " << bytes.total << "\n"; + // [/statistics-tracking] + // clang-format on +} + +void logging_example() +{ + // clang-format off + // [logging] + #include + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::logging_resource_adaptor log_mr{cuda_mr, "allocations.csv"}; + + // Allocations through log_mr are logged to CSV + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), log_mr); + // [/logging] + // clang-format on + + assert(buffer.size() == 1024); + std::remove("allocations.csv"); +} + +void composing_resources() +{ + // clang-format 
off + // [composing-resources] + #include + #include + #include + #include + + // Base resource + rmm::mr::cuda_memory_resource cuda_mr; + + // Add pool + rmm::mr::pool_memory_resource pool_mr{cuda_mr, 1ULL << 30}; + + // Add statistics + rmm::mr::statistics_resource_adaptor stats_mr{pool_mr}; + + // Add logging + rmm::mr::logging_resource_adaptor log_mr{stats_mr, "log.csv"}; + + // Use log_mr for allocations — all allocations are pooled, tracked, and logged + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), log_mr); + // [/composing-resources] + // clang-format on + + assert(buffer.size() == 1024); + std::remove("log.csv"); +} + +void thrust_example() +{ + // clang-format off + // [thrust] + #include + #include + #include + #include + #include + + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_uvector vec(1000, stream.view(), mr); + + // Fill with descending values + thrust::sequence(rmm::exec_policy_nosync(stream.view(), mr), + vec.begin(), vec.end(), vec.size() - 1, -1); + + // Sort — temporaries allocated from mr + thrust::sort(rmm::exec_policy_nosync(stream.view(), mr), vec.begin(), vec.end()); + + stream.synchronize(); + // [/thrust] + // clang-format on +} + +void multi_device_example() +{ + // clang-format off + // [multi-device] + #include + #include + #include + #include + + int num_devices; + cudaGetDeviceCount(&num_devices); + + // Store resources to maintain lifetime (resources are copyable value types) + std::vector resources; + + for (int i = 0; i < num_devices; ++i) { + // Set device BEFORE creating resource + cudaSetDevice(i); + + // Create resource for this device + resources.emplace_back(); + + // Set as per-device resource ref + rmm::mr::set_per_device_resource_ref(rmm::cuda_device_id{i}, resources.back()); + } + + // Use device 0 + cudaSetDevice(0); + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view()); // Uses device 0's resource + // [/multi-device] + // 
clang-format on + + assert(buffer.size() == 1024); +} + +int main() +{ + explicit_resource(); + current_device_resource(); + device_buffer_example(); + device_uvector_example(); + device_scalar_example(); + statistics_tracking(); + logging_example(); + composing_resources(); + thrust_example(); + multi_device_example(); + + std::cout << "All guide examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/installation.cpp b/cpp/examples/docs/src/installation.cpp new file mode 100644 index 000000000..a9dc5cfce --- /dev/null +++ b/cpp/examples/docs/src/installation.cpp @@ -0,0 +1,37 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/installation.md + +#include +#include +#include +#include + +#include + +void test_installation() +{ + // clang-format off + // [test-installation] + #include + #include + #include + #include + + auto mr = rmm::mr::cuda_memory_resource{}; + rmm::mr::set_current_device_resource_ref(mr); + + rmm::device_buffer buf(100, rmm::cuda_stream_view{}); + std::cout << "Allocated " << buf.size() << " bytes\n"; + // [/test-installation] + // clang-format on +} + +int main() +{ + test_installation(); + return 0; +} diff --git a/cpp/examples/docs/src/introduction.cpp b/cpp/examples/docs/src/introduction.cpp new file mode 100644 index 000000000..dbf399e26 --- /dev/null +++ b/cpp/examples/docs/src/introduction.cpp @@ -0,0 +1,34 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+ * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/introduction.md + +#include +#include +#include + +#include +#include + +void basic_example() +{ + // clang-format off + // [basic-example] + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), mr); + // [/basic-example] + // clang-format on + + assert(buffer.size() == 1024); +} + +int main() +{ + basic_example(); + + std::cout << "All introduction examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/logging.cpp b/cpp/examples/docs/src/logging.cpp new file mode 100644 index 000000000..ac736ac0a --- /dev/null +++ b/cpp/examples/docs/src/logging.cpp @@ -0,0 +1,124 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/logging.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. 
+ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +void logging_adaptor() +{ + // clang-format off + // [logging-adaptor] + #include + #include + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::logging_resource_adaptor log_mr{cuda_mr, "memory_log.csv"}; + + // Allocations through log_mr are logged to CSV + rmm::cuda_stream stream; + rmm::device_buffer buf1(1024, stream.view(), log_mr); + rmm::device_buffer buf2(2048, stream.view(), log_mr); + // [/logging-adaptor] + // clang-format on + + std::remove("memory_log.csv"); +} + +void statistics_adaptor() +{ + // clang-format off + // [statistics-adaptor] + #include + #include + #include + + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::statistics_resource_adaptor stats_mr{cuda_mr}; + + // Allocate using the statistics-wrapped resource + rmm::cuda_stream stream; + rmm::device_buffer buf1(1024, stream.view(), stats_mr); + rmm::device_buffer buf2(2048, stream.view(), stats_mr); + + // Get statistics + auto bytes = stats_mr.get_bytes_counter(); + auto allocs = stats_mr.get_allocations_counter(); + std::cout << "Current bytes: " << bytes.value << "\n"; + std::cout << "Peak bytes: " << bytes.peak << "\n"; + std::cout << "Allocation count: " << allocs.value << "\n"; + // [/statistics-adaptor] + // clang-format on +} + +void debug_log_level() +{ + // clang-format off + // [debug-log-level] + #include + + rmm::default_logger().set_level(rapids_logger::level_enum::trace); + // [/debug-log-level] + // clang-format on + + // Reset to default + rmm::default_logger().set_level(rapids_logger::level_enum::info); +} + +void combining_features() +{ + // clang-format off + // [combining-features] + #include + #include + #include + #include + + // Set debug log level + rmm::default_logger().set_level(rapids_logger::level_enum::debug); + + // Build resource stack: statistics + logging + rmm::mr::cuda_async_memory_resource cuda_mr; + rmm::mr::statistics_resource_adaptor 
stats_mr{cuda_mr}; + rmm::mr::logging_resource_adaptor log_mr{stats_mr, "events.csv"}; + + // All allocations through log_mr are tracked and logged + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), log_mr); + + // Get statistics + auto bytes = stats_mr.get_bytes_counter(); + std::cout << "Peak bytes: " << bytes.peak << "\n"; + // [/combining-features] + // clang-format on + + // Reset to default + rmm::default_logger().set_level(rapids_logger::level_enum::info); + std::remove("events.csv"); +} + +int main() +{ + logging_adaptor(); + statistics_adaptor(); + debug_log_level(); + combining_features(); + + std::cout << "All logging examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/managed_memory.cu b/cpp/examples/docs/src/managed_memory.cu new file mode 100644 index 000000000..9adcd2568 --- /dev/null +++ b/cpp/examples/docs/src/managed_memory.cu @@ -0,0 +1,55 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/managed_memory.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. 
+ +#include +#include +#include +#include +#include + +#include +#include + +__global__ void trivial_kernel(void* data) {} + +void prefetch_on_access() +{ + dim3 grid(1), block(1); + + // clang-format off + // [prefetch-on-access] + #include + #include + #include + + rmm::mr::managed_memory_resource managed_mr; + rmm::cuda_stream stream; + rmm::device_buffer buffer(1000000, stream.view(), managed_mr); + + // Prefetch to the current device on this stream + rmm::prefetch(buffer.data(), buffer.size(), + rmm::get_current_cuda_device(), stream.view()); + + // Kernel on the same stream finds the data already resident + trivial_kernel<<>>(buffer.data()); + // [/prefetch-on-access] + // clang-format on + + stream.synchronize(); +} + +int main() +{ + prefetch_on_access(); + + std::cout << "All managed_memory examples passed.\n"; + return 0; +} diff --git a/cpp/examples/docs/src/stream_ordered_allocation.cu b/cpp/examples/docs/src/stream_ordered_allocation.cu new file mode 100644 index 000000000..c85e1dabc --- /dev/null +++ b/cpp/examples/docs/src/stream_ordered_allocation.cu @@ -0,0 +1,143 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. + * SPDX-License-Identifier: Apache-2.0 + */ + +// Code examples for docs/user_guide/stream_ordered_allocation.md +// +// Include directives that appear inside function bodies are intentional: +// they are no-ops (headers use #pragma once) and exist so that +// literalinclude snippets display the includes alongside the code. 
+ +#include +#include +#include + +#include +#include +#include +#include + +__global__ void trivial_kernel(void* data) {} + +void how_it_works() +{ + dim3 grid(1), block(1); + + // clang-format off + // [how-it-works] + #include + #include + + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_buffer buffer(1000, stream.view(), mr); + + // buffer.data() is usable immediately in stream-ordered operations + trivial_kernel<<>>(buffer.data()); + // [/how-it-works] + // clang-format on + + stream.synchronize(); +} + +void reading_results() +{ + // clang-format off + // [reading-results] + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream; + rmm::device_buffer d_buf(1000 * sizeof(float), stream.view(), mr); + + // Launch kernel that writes to d_buf on stream ... + + // Copy results to host on the same stream + std::vector h_buf(1000); + cudaMemcpyAsync(h_buf.data(), d_buf.data(), d_buf.size(), + cudaMemcpyDeviceToHost, stream.value()); + + // Synchronize before reading h_buf on the CPU + stream.synchronize(); + // [/reading-results] + // clang-format on +} + +void cross_stream() +{ + dim3 grid(1), block(1); + + // clang-format off + // [cross-stream] + #include + #include + + rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream_a; + rmm::cuda_stream stream_b; + + rmm::device_buffer buffer(1000, stream_a.view(), mr); + + // Record an event after the allocation on stream_a + cudaEvent_t event; + cudaEventCreateWithFlags(&event, cudaEventDisableTiming); + cudaEventRecord(event, stream_a.value()); + + // stream_b waits for the event — no CPU synchronization needed + cudaStreamWaitEvent(stream_b.value(), event); + + // Now safe to use buffer.data() in operations on stream_b + trivial_kernel<<>>(buffer.data()); + + cudaEventDestroy(event); + // [/cross-stream] + // clang-format on + + stream_b.synchronize(); +} + +void buffer_lifetime() +{ + dim3 grid(1), block(1); + + // clang-format off + // [buffer-lifetime] + 
rmm::mr::cuda_async_memory_resource mr; + rmm::cuda_stream stream_a; + rmm::cuda_stream stream_b; + + rmm::device_buffer buffer(1000, stream_a.view(), mr); + + // Make stream_b wait for the allocation on stream_a + cudaEvent_t alloc_event; + cudaEventCreateWithFlags(&alloc_event, cudaEventDisableTiming); + cudaEventRecord(alloc_event, stream_a.value()); + cudaStreamWaitEvent(stream_b.value(), alloc_event); + + // Use buffer on stream_b + trivial_kernel<<>>(buffer.data()); + + // Before destroying buffer, make stream_a wait for stream_b's work + cudaEvent_t done_event; + cudaEventCreateWithFlags(&done_event, cudaEventDisableTiming); + cudaEventRecord(done_event, stream_b.value()); + cudaStreamWaitEvent(stream_a.value(), done_event); + + // Now safe to destroy buffer — deallocation on stream_a is ordered after the kernel on stream_b + buffer = rmm::device_buffer{}; + + cudaEventDestroy(alloc_event); + cudaEventDestroy(done_event); + // [/buffer-lifetime] + // clang-format on +} + +int main() +{ + how_it_works(); + reading_results(); + cross_stream(); + buffer_lifetime(); + + std::cout << "All stream_ordered_allocation examples passed.\n"; + return 0; +} diff --git a/docs/conf.py b/docs/conf.py index db45dff09..c090697b8 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2020-2025, NVIDIA CORPORATION. +# SPDX-FileCopyrightText: Copyright (c) 2020-2026, NVIDIA CORPORATION. # SPDX-License-Identifier: Apache-2.0 # Configuration file for the Sphinx documentation builder. 
@@ -57,6 +57,7 @@ "sphinx.ext.intersphinx", "sphinx_copybutton", "sphinx_markdown_tables", + "sphinx_tabs.tabs", "sphinxcontrib.jquery", ] diff --git a/docs/index.md b/docs/index.md index d95428e72..21c8db3e7 100644 --- a/docs/index.md +++ b/docs/index.md @@ -6,7 +6,7 @@ RMM (RAPIDS Memory Manager) is a library for allocating and managing GPU memory :maxdepth: 2 :caption: Contents -user_guide/guide +user_guide/index cpp/index python/index ``` diff --git a/docs/user_guide/choosing_memory_resources.md b/docs/user_guide/choosing_memory_resources.md new file mode 100644 index 000000000..04a8acd73 --- /dev/null +++ b/docs/user_guide/choosing_memory_resources.md @@ -0,0 +1,228 @@ +# Choosing a Memory Resource + +One of the most common questions when using RMM is: "Which memory resource should I use?" + +This guide recommends memory resources based on optimal allocation performance for common workloads. See the API references for the full list of available resources. + +## Recommended Defaults + +For most applications, the CUDA async memory pool provides the best allocation performance with no tuning required. + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/choosing_memory_resources.cpp +--- +language: cpp +start-after: "// [recommended-default]" +end-before: "// [/recommended-default]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [recommended-default]" +end-before: "# [/recommended-default]" +dedent: +--- +``` +```` +````` + +For applications that require GPU memory oversubscription (allocating more memory than physically available on the GPU), use a pooled managed memory resource with prefetching. 
This uses [CUDA Unified Memory](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) (`cudaMallocManaged`) to enable automatic page migration between CPU and GPU at the cost of slower allocation performance. Coupling the managed memory "base" allocator with adaptors for pool allocation and prefetching to device on allocation recovers some of the performance lost to the overhead of managed allocations. Note: Managed memory has [limited support on WSL2](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html#unified-memory-on-windows-wsl-and-tegra). + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/choosing_memory_resources.cpp +--- +language: cpp +start-after: "// [managed-pool-prefetch]" +end-before: "// [/managed-pool-prefetch]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [managed-pool-prefetch]" +end-before: "# [/managed-pool-prefetch]" +dedent: +--- +``` +```` +````` + +## Memory Resource Considerations + +Resources that use the CUDA driver's pool suballocation (`cudaMallocFromPoolAsync`) provide fast allocation performance because the driver can manage virtual address space efficiently and reduce fragmentation. + +### CUDA Async Memory Resource + +{cpp:class}`~rmm::mr::cuda_async_memory_resource` (C++) / {py:class}`~rmm.mr.CudaAsyncMemoryResource` (Python) allocates from a custom CUDA memory pool using `cudaMallocFromPoolAsync`. This is the **recommended default** for most applications. + +Note: This creates a *custom* mempool, not the default device mempool. A custom pool is used to enable features like Blackwell decompression engine support and custom release thresholds. 
+ +**Features:** +- **Fast allocation**: Driver-managed pool reuses previously allocated memory +- **Reduced fragmentation**: Virtual addressing allows non-contiguous physical memory to back contiguous allocations, unlike `PoolMemoryResource` which requires contiguous free regions +- **Stream-ordered semantics**: Allocations and deallocations are stream-ordered by default, avoiding pipeline stalls in multi-stream workloads +- **Low configuration**: The driver manages pool growth automatically, though release threshold and maximum size may need tuning in some environments (e.g., when co-existing with libraries that allocate outside the pool) + +**When to use:** +- Default choice for GPU-accelerated applications +- Multi-stream or multi-threaded applications +- Most production workloads + +### CUDA Memory Resource + +{cpp:class}`~rmm::mr::cuda_memory_resource` (C++) / {py:class}`~rmm.mr.CudaMemoryResource` (Python) uses the legacy `cudaMalloc`/`cudaFree` APIs directly with no pooling or stream-ordering support. It is generally not recommended. + +**When to use:** +- Debugging memory issues (to isolate allocator-related problems) +- Benchmarking baseline allocation overhead + +### Managed Memory Resource + +{cpp:class}`~rmm::mr::managed_memory_resource` (C++) / {py:class}`~rmm.mr.ManagedMemoryResource` (Python) allocates [CUDA Unified Memory](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) via `cudaMallocManaged`. Unified Memory creates a single address space accessible from both CPU and GPU, with the CUDA driver migrating pages between processors on demand. This enables [GPU memory oversubscription](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html) — allocating more memory than physically available on the GPU — but generally comes with a performance cost. 
+ +**Features:** +- Enables GPU memory oversubscription for datasets larger than GPU memory +- Automatic page migration between CPU and GPU + +**Caution:** +By default, managed memory adds overhead for page faults and migration (see [Performance Tuning](https://docs.nvidia.com/cuda/cuda-programming-guide/04-special-topics/unified-memory.html#performance-tuning) in the CUDA Programming Guide). See the [Managed Memory guide](managed_memory.md) for a recommended solution with a pool and prefetching adaptor. + +**When to use:** +- Datasets larger than available GPU memory +- Typically combined with a pool and prefetching (see [Managed Memory guide](managed_memory.md)) + +**Example:** + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [managed-memory-example]" +end-before: "# [/managed-memory-example]" +dedent: +--- +``` + +### Pool Memory Resource + +{cpp:class}`~rmm::mr::pool_memory_resource` (C++) / {py:class}`~rmm.mr.PoolMemoryResource` (Python) maintains a pool of memory allocated from an upstream resource, providing fast suballocation. + +**Features:** +- Fast suballocation from pre-allocated pool +- Configurable initial and maximum pool sizes for explicit memory budgeting + +**When to use:** +- When the upstream resource is slow to allocate from. Managed memory is a good example (see the [Managed Memory guide](managed_memory.md)): initial allocations of managed memory can be slow, and the pool resource amortizes that initial cost over the lifetime of the pool. + +**Caution:** +There are pool implementations in both RMM (this memory resource) and the CUDA driver (leveraging `cudaMallocFromPoolAsync` and `cudaMemPool_t`). +The RMM pool implementation does not handle fragmentation as well as the CUDA driver's pool. +Also, RMM's pool can be slower than the CUDA driver's pool implementation in heavy multi-stream workloads depending on application details. 
+ +**Note**: `PoolMemoryResource` does not return memory to the upstream resource on deallocation. Once the pool grows, that memory stays allocated until the resource is destroyed. Set `maximum_pool_size` to limit growth. + +**Example:** + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [pool-memory-example]" +end-before: "# [/pool-memory-example]" +dedent: +--- +``` + +## Composing Memory Resources + +Memory resources can be composed (wrapped) to combine their properties. The general pattern is: + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [composing-adaptor]" +end-before: "# [/composing-adaptor]" +dedent: +--- +``` + +### Common Compositions + +**Prefetching with managed memory:** + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [prefetch-composition]" +end-before: "# [/prefetch-composition]" +dedent: +--- +``` + +**Statistics tracking** (see [Logging and Profiling](logging.md)): + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [statistics-composition]" +end-before: "# [/statistics-composition]" +dedent: +--- +``` + +**Allocation logging** (see [Logging and Profiling](logging.md)): + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [logging-composition]" +end-before: "# [/logging-composition]" +dedent: +--- +``` + +## Multi-Library Applications + +When using RMM with multiple GPU libraries (e.g., cuDF, PyTorch, CuPy), configuring each library to allocate through RMM ensures all allocations flow through the same resource. This avoids memory partitioning where each library holds its own pool, leaving less memory available for the others. 
+ +Each library must be explicitly configured to use RMM. RMM provides allocator integrations for common libraries: + +**Example: RMM + PyTorch** + +```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py +--- +language: python +start-after: "# [multi-library-pytorch]" +end-before: "# [/multi-library-pytorch]" +dedent: +--- +``` + +With this setup, both PyTorch and any other RMM-configured library (like cuDF) allocate from the same resource. + +## Best Practices + +1. **Set the memory resource before any allocations**: Changing the resource after allocations have been made can lead to crashes. + + ```{literalinclude} ../../python/rmm/rmm/tests/examples/choosing_memory_resources.py + --- + language: python + start-after: "# [best-practices-set-early]" + end-before: "# [/best-practices-set-early]" + dedent: + --- + ``` + +2. **Use adaptors for diagnostics**: Wrap with {cpp:class}`~rmm::mr::statistics_resource_adaptor` (C++) / {py:class}`~rmm.mr.StatisticsResourceAdaptor` (Python) to track allocation counts and peak usage, or {cpp:class}`~rmm::mr::logging_resource_adaptor` (C++) / {py:class}`~rmm.mr.LoggingResourceAdaptor` (Python) to log every allocation and deallocation (see [Logging and Profiling](logging.md)). + +## See Also + +- [Managed Memory](managed_memory.md) - Guide to using managed memory and prefetching +- [Stream-Ordered Allocation](stream_ordered_allocation.md) - Understanding stream-ordered semantics diff --git a/docs/user_guide/guide.md b/docs/user_guide/guide.md index b6923257b..839025e40 100644 --- a/docs/user_guide/guide.md +++ b/docs/user_guide/guide.md @@ -1,338 +1,334 @@ -# User Guide +# Programming Guide -Achieving optimal performance in GPU-centric workflows frequently requires -customizing how GPU ("device") memory is allocated. +This guide covers using RMM in C++ and Python applications, including memory resources, containers, and library integrations. 
-RMM is a package that enables you to allocate device memory -in a highly configurable way. For example, it enables you to -allocate and use pools of GPU memory, or to use -[managed memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) -for allocations. +## Basic Example -You can also easily configure other libraries like Numba and CuPy -to use RMM for allocating device memory. +`````{tabs} +````{code-tab} c++ +#include <rmm/cuda_stream.hpp> +#include <rmm/device_buffer.hpp> +#include <rmm/mr/cuda_async_memory_resource.hpp> -## Installation +int main() { + // Create a memory resource + rmm::mr::cuda_async_memory_resource async_mr; -See the project [README](https://github.com/rapidsai/rmm) for how to install RMM. + // Allocate device memory using the resource + rmm::cuda_stream stream; + rmm::device_buffer buffer(1024, stream.view(), async_mr); -## Using RMM + std::cout << "Allocated " << buffer.size() << " bytes\n"; -There are two ways to use RMM in Python code: + return 0; +} +```` +````{code-tab} python +import rmm -1. Using the `rmm.DeviceBuffer` API to explicitly create and manage - device memory allocations -2. Transparently via external libraries such as CuPy and Numba +# Create a memory resource +mr = rmm.mr.CudaAsyncMemoryResource() -RMM provides a `MemoryResource` abstraction to control _how_ device -memory is allocated in both the above uses. +# Allocate device memory using the resource +buffer = rmm.DeviceBuffer(size=1024, mr=mr) -### `DeviceBuffer` Objects +print(f"Allocated {buffer.size} bytes at {hex(buffer.ptr)}") +```` +````` -A `DeviceBuffer` represents an **untyped, uninitialized device memory -allocation**. `DeviceBuffer`s can be created by providing the -size of the allocation in bytes: +## Memory Resources -```python ->>> import rmm ->>> buf = rmm.DeviceBuffer(size=100) -``` - -The size of the allocation and the memory address associated with it -can be accessed via the `.size` and `.ptr` attributes respectively: +Memory resources control how device memory is allocated.
RMM provides several resource types optimized for different use cases. -```python ->>> buf.size -100 ->>> buf.ptr -140202544726016 -``` +### Explicit Resource Passing -`DeviceBuffer`s can also be created by copying data from host memory: +The preferred way to use a memory resource is to pass it explicitly when allocating memory. This makes it clear which resource handles each allocation: -```python ->>> import rmm ->>> import numpy as np ->>> a = np.array([1, 2, 3], dtype='float64') ->>> buf = rmm.DeviceBuffer.to_device(a.view("uint8")) # to_device expects an unsigned 8-bit dtype ->>> buf.size -24 +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [explicit-resource]" +end-before: "// [/explicit-resource]" +dedent: +--- ``` - -Conversely, the data underlying a `DeviceBuffer` can be copied to the host: - -```python ->>> np.frombuffer(buf.tobytes()) -array([1., 2., 3.]) +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [explicit-resource]" +end-before: "# [/explicit-resource]" +dedent: +--- ``` - -#### Prefetching a `DeviceBuffer` - -[CUDA Unified Memory]( - https://developer.nvidia.com/blog/unified-memory-cuda-beginners/ -), also known as managed memory, can be allocated using an -`rmm.mr.ManagedMemoryResource` explicitly, or by calling `rmm.reinitialize` -with `managed_memory=True`. - -A `DeviceBuffer` backed by managed memory or other -migratable memory (such as -[HMM/ATS](https://developer.nvidia.com/blog/simplifying-gpu-application-development-with-heterogeneous-memory-management/) -memory) may be prefetched to a specified device, for example to reduce or eliminate page faults. 
- -```python ->>> import rmm ->>> rmm.reinitialize(managed_memory=True) ->>> buf = rmm.DeviceBuffer(size=100) ->>> buf.prefetch() +```` +````` + +### Setting the Current Device Resource + +RMM also provides a global "current device resource" that is used when no resource is passed explicitly: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [current-device-resource]" +end-before: "// [/current-device-resource]" +dedent: +--- ``` - -The above example prefetches the `DeviceBuffer` memory to the current CUDA device -on the stream that the `DeviceBuffer` last used (e.g. at construction). The -destination device ID and stream are optional parameters. - -```python ->>> import rmm ->>> rmm.reinitialize(managed_memory=True) ->>> from rmm.pylibrmm.stream import Stream ->>> stream = Stream() ->>> buf = rmm.DeviceBuffer(size=100, stream=stream) ->>> buf.prefetch(device=3, stream=stream) # prefetch to device on stream. +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [current-device-resource]" +end-before: "# [/current-device-resource]" +dedent: +--- ``` +```` +````` -`DeviceBuffer.prefetch()` is a no-op if the `DeviceBuffer` is not backed -by migratable memory. +> **Warning**: The current device resource must be set for a device **before** allocating any device memory on that device. Setting or changing the resource after device allocations have been made can lead to unexpected behavior or crashes. -`rmm.pylibrmm.stream.Stream` implements the [CUDA Stream Protocol](https://nvidia.github.io/cuda-python/cuda-core/latest/interoperability.html#cuda-stream-protocol), so it can be used with -`cuda.core.`.
+### Available Resources -```python ->>> from cuda.core import Device ->>> import rmm.pylibrmm.stream ->>> device = Device() ->>> device.set_current() ->>> rmm_stream = rmm.pylibrmm.stream.Stream() +RMM provides base memory resources (e.g., {py:class}`~rmm.mr.CudaAsyncMemoryResource`, {py:class}`~rmm.mr.ManagedMemoryResource`) and resource adaptors (e.g., {py:class}`~rmm.mr.PoolMemoryResource`, {py:class}`~rmm.mr.StatisticsResourceAdaptor`) that wrap an upstream resource to add functionality. See [Choosing a Memory Resource](choosing_memory_resources.md) for recommendations and the API references ([C++ memory resources](../cpp/memory_resources/memory_resources.md), [C++ adaptors](../cpp/memory_resources/memory_resource_adaptors.md), [Python](../python/mr.md)) for the full list. ->>> cuda_stream = device.create_stream(rmm_stream) -``` +## Containers -### `MemoryResource` objects +RMM provides RAII containers that automatically manage device memory lifetime. -`MemoryResource` objects are used to configure how device memory allocations are made by -RMM. +### DeviceBuffer -By default if a `MemoryResource` is not set explicitly, RMM uses the `CudaMemoryResource`, which -uses `cudaMalloc` for allocating device memory. +Untyped, uninitialized device memory ({cpp:class}`C++ <rmm::device_buffer>`, {py:class}`Python <rmm.DeviceBuffer>`): -`rmm.reinitialize()` provides an easy way to initialize RMM with specific memory resource options -across multiple devices. See `help(rmm.reinitialize)` for full details. - -For lower-level control, the `rmm.mr.set_current_device_resource()` function can be -used to set a different MemoryResource for the current CUDA device.
For -example, enabling the `ManagedMemoryResource` tells RMM to use -`cudaMallocManaged` instead of `cudaMalloc` for allocating memory: - -```python ->>> import rmm ->>> rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource()) +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [device-buffer]" +end-before: "// [/device-buffer]" +dedent: +--- ``` - -> :warning: The default resource must be set for any device **before** -> allocating any device memory on that device. Setting or changing the -> resource after device allocations have been made can lead to unexpected -> behaviour or crashes. - -As another example, `PoolMemoryResource` allows you to allocate a -large "pool" of device memory up-front. Subsequent allocations will -draw from this pool of already allocated memory. The example -below shows how to construct a PoolMemoryResource with an initial size -of 1 GiB and a maximum size of 4 GiB. The pool uses -`CudaMemoryResource` as its underlying ("upstream") memory resource: - -```python ->>> import rmm ->>> pool = rmm.mr.PoolMemoryResource( -... rmm.mr.CudaMemoryResource(), -... initial_pool_size="1GiB", # equivalent to initial_pool_size=2**30 -... maximum_pool_size="4GiB" -... ) ->>> rmm.mr.set_current_device_resource(pool) +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [device-buffer]" +end-before: "# [/device-buffer]" +dedent: +--- ``` +```` +````` -Similarly, to use a pool of managed memory: +### device_uvector (C++) -```python ->>> import rmm ->>> pool = rmm.mr.PoolMemoryResource( -... rmm.mr.ManagedMemoryResource(), -... initial_pool_size="1GiB", -... maximum_pool_size="4GiB" -... 
) ->>> rmm.mr.set_current_device_resource(pool) -``` +Typed, uninitialized device vector for trivially copyable types ({cpp:class}`API <rmm::device_uvector>`): -Other `MemoryResource`s include: +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [device-uvector]" +end-before: "// [/device-uvector]" +dedent: +--- +``` -* `FixedSizeMemoryResource` for allocating fixed blocks of memory -* `BinningMemoryResource` for allocating blocks within specified "bin" sizes from different memory -resources +### device_scalar (C++) -`MemoryResource`s are highly configurable and can be composed together in different ways. -See `help(rmm.mr)` for more information. +Single typed element with host-device transfer convenience ({cpp:class}`API <rmm::device_scalar>`): -## Using RMM with third-party libraries +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [device-scalar]" +end-before: "// [/device-scalar]" +dedent: +--- +``` -A number of libraries provide hooks to control their device -allocations. RMM provides implementations of these for -[CuPy](https://cupy.dev), -[numba](https://numba.readthedocs.io/en/stable/), and [PyTorch](https://pytorch.org) in the -`rmm.allocators` submodule. All these approaches configure the library -to use the _current_ RMM memory resource for device -allocations. +## Resource Adaptors -### Using RMM with CuPy +Adaptors wrap resources to add functionality like statistics tracking and logging.
-You can configure [CuPy](https://cupy.dev/) to use RMM for memory -allocations by setting the CuPy CUDA allocator to -`rmm.allocators.cupy.rmm_cupy_allocator`: +### Statistics Tracking -```python ->>> from rmm.allocators.cupy import rmm_cupy_allocator ->>> import cupy ->>> cupy.cuda.set_allocator(rmm_cupy_allocator) +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [statistics-tracking]" +end-before: "// [/statistics-tracking]" +dedent: +--- ``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [statistics-tracking]" +end-before: "# [/statistics-tracking]" +dedent: +--- +``` +```` +````` + +### Logging + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [logging]" +end-before: "// [/logging]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [logging]" +end-before: "# [/logging]" +dedent: +--- +``` +```` +````` -### Using RMM with Numba - -You can configure [Numba](https://numba.readthedocs.io/en/stable/) to use RMM for memory allocations using the -Numba [EMM Plugin](https://numba.readthedocs.io/en/stable/cuda/external-memory.html#setting-emm-plugin). +CSV format: `Thread,Time,Action,Pointer,Size,Stream` -This can be done in two ways: +See [Logging and Profiling](logging.md) for more details. -1. Setting the environment variable `NUMBA_CUDA_MEMORY_MANAGER`: +### Composing Resources - ```bash - $ NUMBA_CUDA_MEMORY_MANAGER=rmm.allocators.numba python (args) - ``` +Adaptors can be stacked to combine functionality: -2. 
Using the `set_memory_manager()` function provided by Numba: +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [composing-resources]" +end-before: "// [/composing-resources]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [composing-resources]" +end-before: "# [/composing-resources]" +dedent: +--- +``` +```` +````` - ```python - >>> from numba import cuda - >>> from rmm.allocators.numba import RMMNumbaManager - >>> cuda.set_memory_manager(RMMNumbaManager) - ``` +Order matters: an adaptor observes only the requests that reach it. The outermost adaptor sees every allocation the application makes, while an adaptor wrapped beneath another resource (such as a pool) sees only the requests that resource forwards upstream. -### Using RMM with PyTorch +## Library Integrations -You can configure -[PyTorch](https://pytorch.org/docs/stable/notes/cuda.html) to use RMM -for memory allocations using their by configuring the current -allocator. +### Thrust (C++) -```python ->>> from rmm.allocators.torch import rmm_torch_allocator ->>> import torch +Use {cpp:class}`rmm::exec_policy_nosync` to make Thrust algorithms use RMM for temporary storage. Passing the resource explicitly makes it clear which resource handles temporaries: ->>> torch.cuda.memory.change_current_allocator(rmm_torch_allocator) +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [thrust]" +end-before: "// [/thrust]" +dedent: +--- ``` -## Memory statistics and profiling +`exec_policy_nosync` allows the Thrust backend to skip stream synchronizations that are not required for correctness, improving performance. Stream-ordered applications using RMM should always prefer `exec_policy_nosync`. If stream synchronizations are required, the application should insert them explicitly before reading device data from the host.
-RMM can profile memory usage and track memory statistics by using either of the following: - - Use the context manager `rmm.statistics.statistics()` to enable statistics tracking for a specific code block. - - Call `rmm.statistics.enable_statistics()` to enable statistics tracking globally. +### CuPy (Python) -Common to both usages is that they modify the currently active RMM memory resource. The current device resource is wrapped with a `StatisticsResourceAdaptor` which must remain the topmost resource throughout the statistics tracking: -```python ->>> import rmm ->>> import rmm.statistics +Configure CuPy to use RMM for all device memory allocations ({py:func}`API <rmm.allocators.cupy.rmm_cupy_allocator>`): ->>> # We start with the default CUDA memory resource ->>> rmm.mr.get_current_device_resource() - +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [cupy]" +end-before: "# [/cupy]" +dedent: +--- +``` ->>> # When using statistics, we get a StatisticsResourceAdaptor with the context ->>> with rmm.statistics.statistics(): -...
rmm.mr.get_current_device_resource() - +### Numba (Python) ->>> # We can also enable statistics globally ->>> rmm.statistics.enable_statistics() ->>> print(rmm.mr.get_current_device_resource()) - -``` +Configure Numba to use RMM for device memory in CUDA JIT-compiled functions ({py:class}`API <rmm.allocators.numba.RMMNumbaManager>`): -With statistics enabled, you can query statistics of the current and peak bytes and number of allocations performed by the current RMM memory resource: -```python ->>> buf = rmm.DeviceBuffer(size=10) ->>> rmm.statistics.get_statistics() -Statistics(current_bytes=16, current_count=1, peak_bytes=16, peak_count=1, total_bytes=16, total_count=1) +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [numba]" +end-before: "# [/numba]" +dedent: +--- ``` -### Memory Profiler -To profile a specific block of code, first enable memory statistics by calling `rmm.statistics.enable_statistics()`. To profile a function, use `profiler` as a function decorator: -```python ->>> @rmm.statistics.profiler() -... def f(size): -...
rmm.DeviceBuffer(size=size) ->>> f(1000) +Or use the environment variable: ->>> # By default, the profiler write to rmm.statistics.default_profiler_records ->>> print(rmm.statistics.default_profiler_records.report()) -Memory Profiling -================ +```bash +NUMBA_CUDA_MEMORY_MANAGER=rmm.allocators.numba python script.py +``` -Legends: - ncalls - number of times the function or code block was called - memory_peak - peak memory allocated in function or code block (in bytes) - memory_total - total memory allocated in function or code block (in bytes) +### PyTorch (Python) -Ordered by: memory_peak +Configure PyTorch to use RMM for CUDA tensor allocations ({py:func}`API <rmm.allocators.torch.rmm_torch_allocator>`): -ncalls memory_peak memory_total filename:lineno(function) - 1 1,008 1,008 :1(f) +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [pytorch]" +end-before: "# [/pytorch]" +dedent: +--- ``` -To profile a code block, use `profiler` as a context manager: -```python ->>> with rmm.statistics.profiler(name="my code block"): -... rmm.DeviceBuffer(size=20) ->>> print(rmm.statistics.default_profiler_records.report()) -Memory Profiling -================ - -Legends: - ncalls - number of times the function or code block was called - memory_peak - peak memory allocated in function or code block (in bytes) - memory_total - total memory allocated in function or code block (in bytes) +## Multi-Device Usage -Ordered by: memory_peak +For multi-GPU systems, each device can have its own memory resource.
Use `set_per_device_resource_ref` (C++) or `set_per_device_resource` (Python) to configure each device before allocating memory on it: -ncalls memory_peak memory_total filename:lineno(function) - 1 1,008 1,008 :1(f) - 1 32 32 my code block +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/guide.cu +--- +language: cuda +start-after: "// [multi-device]" +end-before: "// [/multi-device]" +dedent: +--- ``` - -The `profiler` supports nesting: -```python ->>> with rmm.statistics.profiler(name="outer"): -... buf1 = rmm.DeviceBuffer(size=10) -... with rmm.statistics.profiler(name="inner"): -... buf2 = rmm.DeviceBuffer(size=10) ->>> print(rmm.statistics.default_profiler_records.report()) -Memory Profiling -================ - -Legends: - ncalls - number of times the function or code block was called - memory_peak - peak memory allocated in function or code block (in bytes) - memory_total - total memory allocated in function or code block (in bytes) - -Ordered by: memory_peak - -ncalls memory_peak memory_total filename:lineno(function) - 1 1,008 1,008 :1(f) - 1 32 32 my code block - 1 32 32 outer - 1 16 16 inner +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/guide.py +--- +language: python +start-after: "# [multi-device]" +end-before: "# [/multi-device]" +dedent: +--- ``` +```` +````` diff --git a/docs/user_guide/index.md b/docs/user_guide/index.md new file mode 100644 index 000000000..801a19d21 --- /dev/null +++ b/docs/user_guide/index.md @@ -0,0 +1,13 @@ +# User Guide + +```{toctree} +:maxdepth: 2 + +introduction +installation +guide +choosing_memory_resources +stream_ordered_allocation +managed_memory +logging +``` diff --git a/docs/user_guide/installation.md b/docs/user_guide/installation.md new file mode 100644 index 000000000..46e781630 --- /dev/null +++ b/docs/user_guide/installation.md @@ -0,0 +1,145 @@ +# Installation + +This guide covers installing RMM. 
For general RAPIDS installation instructions, which cover RMM, see the [RAPIDS Installation Guide](https://docs.rapids.ai/install/). + +## System Requirements + +See the [RAPIDS Platform Support](https://docs.rapids.ai/platform-support/) for supported operating systems, CUDA versions, GPU architectures, and Python versions for each release. + +## Installing with conda + +The easiest way to install RMM and all of its dependencies is using conda. You can get a minimal conda installation with [miniforge](https://conda-forge.org/download/). + +### Stable Release + +Install the latest stable release: + +```bash +conda install -c rapidsai -c conda-forge rmm cuda-version=13 +``` + +The `cuda-version` metapackage selects the CUDA Toolkit major version and requires an installed CUDA driver from that major version or newer. + +### Nightly Builds + +For the latest development version, install from the nightly channel: + +```bash +conda install -c rapidsai-nightly -c conda-forge rmm cuda-version=13 +``` + +Nightly builds are created from the `main` branch and may contain unreleased features or bug fixes. They provide no stability guarantees. + +## Installing with pip + +RMM can also be installed using pip. The CUDA driver must already be installed on your system. + +```bash +pip install rmm-cu13 # For CUDA 13 +# or +pip install rmm-cu12 # For CUDA 12 +``` + +## Building from Source + +Building from source gives you the latest features and allows you to customize the build. + +### Clone and Create Development Environment + +The conda environment files in `conda/environments/` pin all build prerequisites (compiler, CUDA toolkit, CMake, etc.)
to known-good versions: + +```bash +git clone https://github.com/rapidsai/rmm.git +cd rmm + +# Create environment for CUDA 13 +conda env create --name rmm_dev --file conda/environments/all_cuda-131_arch-$(uname -m).yaml +conda activate rmm_dev +``` + +### Build Using build.sh + +RMM provides a convenience script `build.sh` that handles the build process. +The `build.sh` script is meant to be used with the developer conda environment above, which installs all prerequisites. + +```bash +# Show help +./build.sh -h + +# Build librmm without installing +./build.sh -n librmm + +# Build rmm Python package without installing +./build.sh -n rmm + +# Build and install both +./build.sh librmm rmm +``` + +## Using RMM in a Downstream CMake Project + +To use RMM in your own CMake project, add the following to your `CMakeLists.txt`: + +```cmake +find_package(rmm REQUIRED) + +# Link your target with RMM +target_link_libraries(your_target PRIVATE rmm::rmm) +``` + +If RMM is not installed in a default location, specify its path: + +```bash +cmake .. 
-Drmm_ROOT=/path/to/rmm/install +``` + +### Using CPM to Fetch RMM + +You can use CPM to fetch RMM as a dependency: + +```cmake +include(CPM) + +CPMAddPackage( + NAME rmm + VERSION 26.06 + GITHUB_REPOSITORY rapidsai/rmm + GIT_TAG main + SOURCE_SUBDIR cpp +) + +target_link_libraries(your_target PRIVATE rmm::rmm) +``` + +## Testing Installation + +### C++ + +Create a test file `test_rmm.cpp`: + +```{literalinclude} ../../cpp/examples/docs/src/installation.cpp +--- +language: cpp +start-after: "// [test-installation]" +end-before: "// [/test-installation]" +dedent: +--- +``` + +Compile and run: + +```bash +nvcc -std=c++17 -I/path/to/rmm/include test_rmm.cpp -o test_rmm +./test_rmm +``` + +### Python + +```{literalinclude} ../../python/rmm/rmm/tests/examples/installation.py +--- +language: python +start-after: "# [test-installation]" +end-before: "# [/test-installation]" +dedent: +--- +``` diff --git a/docs/user_guide/introduction.md b/docs/user_guide/introduction.md new file mode 100644 index 000000000..d12e045c5 --- /dev/null +++ b/docs/user_guide/introduction.md @@ -0,0 +1,61 @@ +# Introduction to RMM + +RMM (RAPIDS Memory Manager) is a C++ and Python library for GPU memory allocation. It provides a common interface — the **memory resource** — that lets you swap allocation strategies at runtime without recompiling, and a set of containers that manage device memory lifetime automatically. + +GPU applications often benefit from customizing how memory is allocated. For example, pooling reduces the overhead of frequent small allocations, managed memory enables working with datasets larger than GPU memory, and pinned host memory speeds up CPU-GPU transfers compared to pageable host memory. RMM provides these and other features as interchangeable memory resources, so you can experiment with different strategies and measure their impact on your workload. 
+ +RMM provides integrations with GPU libraries including cuDF, cuML, cuGraph, PyTorch, and CuPy, enabling uniform memory handling across your application. + +## Key Concepts + +### Memory Resources + +A memory resource is an object that knows how to allocate and deallocate memory. The choice of resource determines the kind of memory (device, host, managed, pinned) and the allocation strategy (pooled, stream-ordered, etc.). RMM's resources implement the `cuda::mr::resource` concept defined by [CCCL](https://github.com/NVIDIA/cccl) (CUDA Core Compute Libraries), so they interoperate directly with any library that accepts CCCL resources. See the API references for the full list ([C++ memory resources](../cpp/memory_resources/memory_resources.md), [C++ adaptors](../cpp/memory_resources/memory_resource_adaptors.md), [Python](../python/mr.md)). + +For most applications, the CUDA async memory resource ({cpp:class}`~rmm::mr::cuda_async_memory_resource` in C++, {py:class}`~rmm.mr.CudaAsyncMemoryResource` in Python) is a good starting point — it uses a CUDA driver-managed pool and supports stream-ordered (asynchronous) allocations. See [Choosing a Memory Resource](choosing_memory_resources.md) for guidance on when to use other resources. + +### Resource Adaptors + +Resource adaptors wrap an existing resource to add functionality. For example, {py:class}`~rmm.mr.StatisticsResourceAdaptor` tracks allocation statistics, and {py:class}`~rmm.mr.LoggingResourceAdaptor` logs allocations to a CSV file. Adaptors are composable — you can stack several to get combined functionality. See [Logging and Profiling](logging.md) for details and the API references for the full list ([C++](../cpp/memory_resources/memory_resource_adaptors.md), [Python](../python/mr.md)). 
+ +### Containers + +RMM provides [RAII](https://en.cppreference.com/w/cpp/language/raii.html) containers that manage device memory lifetime, avoiding common problems like memory leaks or improper stream ordering: + +- C++: {cpp:class}`~rmm::device_buffer` (untyped), {cpp:class}`~rmm::device_uvector` (typed, uninitialized), {cpp:class}`~rmm::device_scalar` (single element) +- Python: {py:class}`~rmm.DeviceBuffer` (untyped) + +All containers accept a stream and a memory resource, and use stream-ordered allocation. + +## Basic Example + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/introduction.cpp +--- +language: cpp +start-after: "// [basic-example]" +end-before: "// [/basic-example]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/introduction.py +--- +language: python +start-after: "# [basic-example]" +end-before: "# [/basic-example]" +--- +``` +```` +````` + +## Resources and Support + +- [RMM GitHub Repository](https://github.com/rapidsai/rmm): Source code and development +- [RMM Issue Tracker](https://github.com/rapidsai/rmm/issues): Report bugs or request features +- [RAPIDS Documentation](https://docs.rapids.ai): RAPIDS ecosystem docs +- [RAPIDS Installation Guide](https://docs.rapids.ai/install): Installation instructions +- [Developer Blog: Fast, Flexible Allocation](https://developer.nvidia.com/blog/fast-flexible-allocation-for-cuda-with-rapids-memory-manager/): RMM design walkthrough +- [Developer Blog: Stream-Ordered Allocation](https://developer.nvidia.com/blog/using-cuda-stream-ordered-memory-allocator-part-1/): Deep dive into stream-ordered semantics diff --git a/docs/user_guide/logging.md b/docs/user_guide/logging.md new file mode 100644 index 000000000..d698678fc --- /dev/null +++ b/docs/user_guide/logging.md @@ -0,0 +1,355 @@ +# Logging and Profiling + +RMM provides adaptors for tracking memory allocations and deallocations. 
+ +The {cpp:class}`~rmm::mr::logging_resource_adaptor` / {py:class}`~rmm.mr.LoggingResourceAdaptor` will produce a CSV file of all allocations/deallocations with timestamps and stream IDs. + +The {cpp:class}`~rmm::mr::statistics_resource_adaptor` / {py:class}`~rmm.mr.StatisticsResourceAdaptor`, and {py:mod}`rmm.statistics`, can be used to track allocation statistics such as peak memory and total memory. + +## Memory Event Logging + +Memory event logging writes details of every allocation and deallocation to a CSV file. This is useful for: +- Debugging memory issues +- Understanding allocation patterns +- Profiling memory usage +- Replaying workloads for benchmarking + +### Using the Logging Adaptor + +Wrap any memory resource with the logging adaptor to record allocations and deallocations to a CSV file: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/logging.cpp +--- +language: cpp +start-after: "// [logging-adaptor]" +end-before: "// [/logging-adaptor]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [logging-adaptor]" +end-before: "# [/logging-adaptor]" +dedent: +--- +``` +```` +````` + +If no filename is provided, the `RMM_LOG_FILE` environment variable is used: + +```bash +export RMM_LOG_FILE="allocations.csv" +``` + +### CSV Log Format + +Each row represents an allocation or deallocation with the following columns: + +``` +Thread,Time,Action,Pointer,Size,Stream +``` + +Example: +``` +Thread,Time,Action,Pointer,Size,Stream +140573312345856,1634567890.123456,allocate,0x7f8a40000000,1024,0x7f8a38001020 +140573312345856,1634567890.234567,allocate,0x7f8a40000400,2048,0x7f8a38001020 +140573312345856,1634567890.345678,deallocate,0x7f8a40000000,1024,0x7f8a38001020 +``` + +- **Thread**: Thread ID performing the operation +- **Time**: Timestamp (seconds since epoch) +- **Action**: `allocate` or `deallocate` +- **Pointer**: 
Memory address +- **Size**: Allocation size in bytes +- **Stream**: CUDA stream pointer + +### Analyzing Logs + +You can parse and analyze logs with Python: + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [analyzing-logs]" +end-before: "# [/analyzing-logs]" +dedent: +--- +``` + +### Replay Benchmark + +When building RMM from source, logs can be used with `REPLAY_BENCHMARK`: + +```bash +cd build/gbenchmarks +./REPLAY_BENCHMARK --log_file=memory_log.csv +``` + +This replays the allocation pattern from the log, useful for: +- Benchmarking different memory resources +- Testing allocator implementations +- Profiling allocation overhead + +## Memory Statistics + +RMM provides statistics tracking for allocations using `statistics_resource_adaptor`. The adaptor tracks current, peak, and total allocation bytes and counts. + +### Using the Statistics Adaptor + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/logging.cpp +--- +language: cpp +start-after: "// [statistics-adaptor]" +end-before: "// [/statistics-adaptor]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [statistics-adaptor]" +end-before: "# [/statistics-adaptor]" +dedent: +--- +``` +```` +````` + +Python also provides a convenience API for enabling statistics globally: + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [statistics-global]" +end-before: "# [/statistics-global]" +dedent: +--- +``` + +### Tracking Memory Growth + +Monitor memory usage over time: + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [tracking-memory-growth]" +end-before: "# [/tracking-memory-growth]" +dedent: +--- +``` + +## Memory Profiling (Python) + +The memory profiler tracks allocations by function/code block. 
+ +### Profiling Functions + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [profiling-functions]" +end-before: "# [/profiling-functions]" +dedent: +--- +``` + +The report shows the number of calls, peak memory, and total memory for each profiled function. + +### Profiling Code Blocks + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [profiling-code-blocks]" +end-before: "# [/profiling-code-blocks]" +dedent: +--- +``` + +### Nested Profiling + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [nested-profiling]" +end-before: "# [/nested-profiling]" +dedent: +--- +``` + +The report includes entries for both the outer and inner profiling scopes. + +### Custom Profiler Records + +Use custom profiler records for separate tracking: + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [custom-profiler-records]" +end-before: "# [/custom-profiler-records]" +dedent: +--- +``` + +## Debug Logging + +RMM uses [rapids-logger](https://github.com/rapidsai/rapids-logger) for debug output. + +### Enabling Debug Logging + +Debug logs show internal RMM behavior, errors, and warnings. + +#### Output Location + +By default, logs go to stderr. Set `RMM_DEBUG_LOG_FILE` to write to a file: + +```bash +export RMM_DEBUG_LOG_FILE=/path/to/rmm_debug.log +``` + +#### Log Levels + +Set at **compile time** with CMake: + +```bash +cmake .. 
-DRMM_LOGGING_LEVEL=DEBUG +``` + +Available levels (increasing verbosity): +- `OFF` - No logging +- `CRITICAL` - Only critical errors +- `ERROR` - Errors +- `WARN` - Warnings and errors +- `INFO` - Informational messages (default) +- `DEBUG` - Detailed debug info +- `TRACE` - Very verbose tracing + +#### Runtime Log Level + +Even with verbose logging compiled in, you must enable it at runtime: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/logging.cpp +--- +language: cpp +start-after: "// [debug-log-level]" +end-before: "// [/debug-log-level]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [debug-log-level]" +end-before: "# [/debug-log-level]" +dedent: +--- +``` +```` +````` + +### What Gets Logged + +Debug logging shows: +- Memory resource initialization +- Allocation failures and errors +- Pool growth and shrinkage +- Stream synchronization events +- Multi-device operations +- Internal state changes + +Example debug output: +``` +[2024-01-15 10:30:45.123] [info] Initializing cuda_async_memory_resource +[2024-01-15 10:30:45.234] [debug] pool_memory_resource: allocated 1 GiB from upstream +[2024-01-15 10:30:45.345] [warn] Allocation of 10 GiB failed, pool exhausted +[2024-01-15 10:30:45.456] [debug] Growing pool by 2 GiB +``` + +## Combining Logging Features + +Multiple logging features can be composed together by stacking adaptors: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/logging.cpp +--- +language: cpp +start-after: "// [combining-features]" +end-before: "// [/combining-features]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [combining-features]" +end-before: "# [/combining-features]" +dedent: +--- +``` +```` +````` + +## Use Cases + +### Debugging OOM Errors + 
+```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [debugging-oom]" +end-before: "# [/debugging-oom]" +dedent: +--- +``` + +### Profiling Memory in Data Pipeline + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [profiling-pipeline]" +end-before: "# [/profiling-pipeline]" +dedent: +--- +``` + +### Benchmarking Memory Resources + +```{literalinclude} ../../python/rmm/rmm/tests/examples/logging.py +--- +language: python +start-after: "# [benchmarking-resources]" +end-before: "# [/benchmarking-resources]" +dedent: +--- +``` + +## Best Practices + +1. **Use event logging for debugging** - CSV logs help understand allocation patterns +2. **Enable statistics for profiling** - Track memory usage over time +3. **Use profiler for hotspot analysis** - Identify which functions allocate most memory +4. **Set appropriate debug level** - Use `INFO` normally, `DEBUG`/`TRACE` when troubleshooting +5. **Disable logging in production** - Logging has overhead; only enable when needed +6. **Analyze logs with tools** - Use pandas, REPLAY_BENCHMARK, or custom scripts +7. **Combine with NVIDIA tools** - Use [NVIDIA Nsight™ Systems](https://developer.nvidia.com/nsight-systems) alongside RMM logging for a complete picture diff --git a/docs/user_guide/managed_memory.md b/docs/user_guide/managed_memory.md new file mode 100644 index 000000000..ec5fb4be8 --- /dev/null +++ b/docs/user_guide/managed_memory.md @@ -0,0 +1,100 @@ +# Managed Memory and Prefetching + +CUDA Managed Memory (also called Unified Memory) provides a single address space accessible from both CPU and GPU. The CUDA driver migrates pages between host and device memory on demand, which means you can work with datasets larger than GPU memory or share data between host and device code without explicit copies. 
+ +RMM's {cpp:class}`~rmm::mr::managed_memory_resource` (C++) / {py:class}`~rmm.mr.ManagedMemoryResource` (Python) allocates managed memory via `cudaMallocManaged`. For background on how Unified Memory works at the driver level, see the [CUDA Programming Guide: Unified Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#unified-memory-programming). + +The main trade-off is performance: on-demand page migration introduces latency from page faults. For production workloads, combining managed memory with prefetching (described below) is essential to avoid this overhead. + +## Prefetching + +Without prefetching, the first GPU access to a managed allocation triggers a page fault that stalls execution while the driver migrates data from host memory. If the working set exceeds GPU memory, pages get evicted and re-faulted repeatedly, which can degrade performance severely. The [CUDA Programming Guide](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#performance-tuning) covers page fault behavior and tuning in detail. + +Prefetching migrates data to the GPU ahead of time so that kernels find it already resident. RMM supports two approaches. + +### Prefetch on Allocate (Eager) + +{cpp:class}`~rmm::mr::prefetch_resource_adaptor` (C++) / {py:class}`~rmm.mr.PrefetchResourceAdaptor` (Python) wraps another resource and prefetches each allocation to the current device as soon as it's made. This works well when data is used on the GPU shortly after allocation, such as when copying or writing to the new allocation: + +```{literalinclude} ../../python/rmm/rmm/tests/examples/managed_memory.py +--- +language: python +start-after: "# [prefetch-on-allocate]" +end-before: "# [/prefetch-on-allocate]" +dedent: +--- +``` + +Adding a pool between the managed resource and the prefetch adaptor avoids calling `cudaMallocManaged` on every allocation. 
The pool grabs large chunks of managed memory upfront, and the prefetch adaptor ensures each suballocation is migrated to the GPU before use. Non-allocating adaptors like logging or statistics can safely wrap the prefetch adaptor on the outside: + +```{literalinclude} ../../python/rmm/rmm/tests/examples/managed_memory.py +--- +language: python +start-after: "# [prefetch-with-pool]" +end-before: "# [/prefetch-with-pool]" +dedent: +--- +``` + +### Prefetch on Access (Lazy) + +When you need control over exactly when data moves to the GPU — for instance because the allocation happens long before the kernel that consumes it — you can prefetch manually: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/managed_memory.cu +--- +language: cuda +start-after: "// [prefetch-on-access]" +end-before: "// [/prefetch-on-access]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/managed_memory.py +--- +language: python +start-after: "# [prefetch-on-access]" +end-before: "# [/prefetch-on-access]" +dedent: +--- +``` +```` +````` + +## Prefetching Best Practices + +### Stream ordering + +When prefetching manually, issue the prefetch on the same stream as the kernel that will consume the data. Because work on a single stream executes in order, the migration is then guaranteed to complete before the kernel starts executing. + +### Profiling + +[NVIDIA Nsight Systems](https://developer.nvidia.com/nsight-systems) can visualize page faults and data migration to help you decide where prefetching is needed: + +```bash +nsys profile -o output python your_script.py +``` + +When using `compute-sanitizer` with managed memory, enable page fault tracking: + +```bash +compute-sanitizer --tool memcheck \ + --cuda-um-cpu-page-faults=true \ + --cuda-um-gpu-page-faults=true \ + python your_script.py +``` + +## Limitations + +- **Not stream-ordered**: `ManagedMemoryResource` uses `cudaMallocManaged`, which is synchronous — the call blocks until the allocation is complete. 
For multi-stream applications where allocation latency matters, prefer `CudaAsyncMemoryResource`. +- **Migration overhead**: Even with prefetching, managed memory carries overhead from driver-managed page migration. If your data fits comfortably in GPU memory, `CudaAsyncMemoryResource` avoids this cost entirely. +- **Interconnect bandwidth**: Workloads that constantly migrate data between CPU and GPU are bounded by the throughput of the CPU-GPU interconnect (PCIe, NVLink-C2C, etc.). + +## See Also + +- [Choosing a Memory Resource](choosing_memory_resources.md) - When to use managed memory vs. other resources +- [Stream-Ordered Allocation](stream_ordered_allocation.md) - Understanding asynchronous allocation semantics +- [CUDA Programming Guide: Unified Memory](https://docs.nvidia.com/cuda/cuda-c-programming-guide/#unified-memory-programming) +- [NVIDIA Developer Blog: Unified Memory](https://developer.nvidia.com/blog/unified-memory-cuda-beginners/) diff --git a/docs/user_guide/stream_ordered_allocation.md b/docs/user_guide/stream_ordered_allocation.md new file mode 100644 index 000000000..94b90c169 --- /dev/null +++ b/docs/user_guide/stream_ordered_allocation.md @@ -0,0 +1,141 @@ +# Stream-Ordered Memory Allocation + +RMM containers ({cpp:class}`~rmm::device_buffer`, {py:class}`~rmm.DeviceBuffer`) and [memory resources](../python/mr.md) are stream-ordered: allocations and deallocations are enqueued on a CUDA stream rather than blocking the CPU. This lets memory operations overlap with kernel execution and avoids the synchronization cost of `cudaMalloc`/`cudaFree`. For background on CUDA streams and asynchronous execution, see the [CUDA Programming Guide: Asynchronous Concurrent Execution](https://docs.nvidia.com/cuda/cuda-programming-guide/02-basics/asynchronous-execution.html#what-is-asynchronous-concurrent-execution). + +## How It Works + +When you allocate from a stream-ordered resource, the call returns a pointer immediately. 
The pointer value is available on the CPU right away — you can store it, pass it as a kernel launch argument, or hand it to another API. The memory behind the pointer, however, is only guaranteed to be usable by GPU operations enqueued on the same stream after the allocation: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/stream_ordered_allocation.cu +--- +language: cuda +start-after: "// [how-it-works]" +end-before: "// [/how-it-works]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [how-it-works]" +end-before: "# [/how-it-works]" +dedent: +--- +``` +```` +````` + +Deallocations are also stream-ordered: when a buffer is destroyed, the deallocation is enqueued on the stream, so the memory is not actually freed until all prior work on that stream completes. + +## When to Synchronize + +### Reading results on the host + +The pointer returned by a stream-ordered allocation is a CPU value — you can store it or pass it to other APIs without synchronization. However, the stream must be synchronized before the CPU reads data that was written by GPU operations on that stream. The most common case is a device-to-host copy followed by a sync: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/stream_ordered_allocation.cu +--- +language: cuda +start-after: "// [reading-results]" +end-before: "// [/reading-results]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [reading-results]" +end-before: "# [/reading-results]" +dedent: +--- +``` +```` +````` + +### Cross-stream usage + +Memory allocated on one stream can only be safely used on a different stream after the allocation is known to have completed. 
The simplest approach is to synchronize the allocating stream, but that stalls the CPU. A lighter-weight alternative is to record a CUDA event on the allocating stream and have the consuming stream wait on it: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/stream_ordered_allocation.cu +--- +language: cuda +start-after: "// [cross-stream]" +end-before: "// [/cross-stream]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [cross-stream]" +end-before: "# [/cross-stream]" +dedent: +--- +``` +```` +````` + +### Buffer lifetime across streams + +If a buffer is allocated and used on the same stream, deallocation is safe — stream ordering guarantees prior work completes first. The problem arises when a buffer is used on a *different* stream from the one it will be deallocated on. In that case, you need to ensure the consuming stream's work finishes before the buffer is destroyed. The same event pattern works here — record an event on the consuming stream and have the deallocating stream wait on it: + +`````{tabs} +````{group-tab} C++ +```{literalinclude} ../../cpp/examples/docs/src/stream_ordered_allocation.cu +--- +language: cuda +start-after: "// [buffer-lifetime]" +end-before: "// [/buffer-lifetime]" +dedent: +--- +``` +```` +````{group-tab} Python +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [buffer-lifetime]" +end-before: "# [/buffer-lifetime]" +dedent: +--- +``` +```` +````` + +## Which Resources Support Stream Ordering? 
+ +- **{py:class}`~rmm.mr.CudaAsyncMemoryResource`**: Fully stream-ordered (recommended) +- **{py:class}`~rmm.mr.PoolMemoryResource`**: Internally stream-safe — suballocations are mutex-protected, independent of upstream +- **{py:class}`~rmm.mr.ArenaMemoryResource`**: Internally stream-safe — uses per-stream arenas, independent of upstream +- **{py:class}`~rmm.mr.CudaMemoryResource`**: NOT stream-ordered (`cudaMalloc` is synchronous) +- **{py:class}`~rmm.mr.ManagedMemoryResource`**: NOT stream-ordered (`cudaMallocManaged` is synchronous) + +## Example: Numba Kernel with RMM Stream + +This example allocates an RMM buffer and launches a Numba kernel on the same stream, so the allocation is guaranteed to complete before the kernel accesses the memory: + +```{literalinclude} ../../python/rmm/rmm/tests/examples/stream_ordered_allocation.py +--- +language: python +start-after: "# [numba-stream]" +end-before: "# [/numba-stream]" +dedent: +--- +``` + +## See Also + +- [Choosing a Memory Resource](choosing_memory_resources.md) - Which resources support stream ordering +- [CUDA Programming Guide: Asynchronous Concurrent Execution](https://docs.nvidia.com/cuda/cuda-programming-guide/02-basics/asynchronous-execution.html#what-is-asynchronous-concurrent-execution) diff --git a/python/rmm/rmm/tests/examples/choosing_memory_resources.py b/python/rmm/rmm/tests/examples/choosing_memory_resources.py new file mode 100644 index 000000000..1403b6281 --- /dev/null +++ b/python/rmm/rmm/tests/examples/choosing_memory_resources.py @@ -0,0 +1,172 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/choosing_memory_resources.md +# ruff: noqa: RUF059 + + +def recommended_default() -> None: + # [recommended-default] + import rmm + + mr = rmm.mr.CudaAsyncMemoryResource() + buffer = rmm.DeviceBuffer(size=1024, mr=mr) + # [/recommended-default] + + assert buffer.size == 1024 + + +def managed_pool_prefetch() -> None: + # [managed-pool-prefetch] + import rmm + + # Use 80% of GPU memory, rounded down to nearest 256 bytes + free_memory, total_memory = rmm.mr.available_device_memory() + pool_size = int(total_memory * 0.8) // 256 * 256 + + mr = rmm.mr.PrefetchResourceAdaptor( + rmm.mr.PoolMemoryResource( + rmm.mr.ManagedMemoryResource(), + initial_pool_size=pool_size, + ) + ) + # [/managed-pool-prefetch] + + buffer = rmm.DeviceBuffer(size=1024, mr=mr) + assert buffer.size == 1024 + + +def managed_memory_example() -> None: + # [managed-memory-example] + import rmm + + # Combine managed memory with a pool and prefetching for performance. + # Without prefetching, page faults cause significant overhead. 
+ base = rmm.mr.ManagedMemoryResource() + pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) + prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool) + buffer = rmm.DeviceBuffer(size=1024, mr=prefetch_mr) + # [/managed-memory-example] + + assert buffer.size == 1024 + + +def pool_memory_example() -> None: + # [pool-memory-example] + import rmm + + pool = rmm.mr.PoolMemoryResource( + rmm.mr.CudaMemoryResource(), + initial_pool_size=2**32, # 4 GiB + maximum_pool_size=2**34, # 16 GiB + ) + buffer = rmm.DeviceBuffer(size=1024, mr=pool) + # [/pool-memory-example] + + assert buffer.size == 1024 + + +def composing_adaptor() -> None: + # [composing-adaptor] + # Adaptor wrapping a base resource + import rmm + + adaptor = rmm.mr.StatisticsResourceAdaptor( + rmm.mr.CudaAsyncMemoryResource() + ) + # [/composing-adaptor] + + _ = adaptor + + +def prefetch_composition() -> None: + # [prefetch-composition] + import rmm + + # Prefetch adaptor wrapping managed memory pool + base = rmm.mr.ManagedMemoryResource() + pool = rmm.mr.PoolMemoryResource(base, initial_pool_size=2**30) + prefetch = rmm.mr.PrefetchResourceAdaptor(pool) + buffer = rmm.DeviceBuffer(size=1024, mr=prefetch) + # [/prefetch-composition] + + assert buffer.size == 1024 + + +def statistics_composition() -> None: + # [statistics-composition] + import rmm + + # Track allocation statistics (counts, peak, and total bytes) + base = rmm.mr.CudaAsyncMemoryResource() + stats_mr = rmm.mr.StatisticsResourceAdaptor(base) + buffer = rmm.DeviceBuffer(size=1024, mr=stats_mr) + # [/statistics-composition] + + assert buffer.size == 1024 + + +def logging_composition() -> None: + # [logging-composition] + import rmm + + # Log every allocation and deallocation to a file + base = rmm.mr.CudaAsyncMemoryResource() + logging_mr = rmm.mr.LoggingResourceAdaptor( + base, log_file_name="allocations.csv" + ) + buffer = rmm.DeviceBuffer(size=1024, mr=logging_mr) + # [/logging-composition] + + assert buffer.size == 1024 + + import os + + if 
os.path.exists("allocations.csv"): + os.remove("allocations.csv") + + +def multi_library_pytorch() -> None: + try: + import torch + except ImportError: + print("PyTorch not available, skipping multi_library_pytorch") + return + + # isort: off + # [multi-library-pytorch] + import rmm + import torch + from rmm.allocators.torch import rmm_torch_allocator + + # Configure RMM + rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + + # Configure PyTorch to allocate through RMM + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + # [/multi-library-pytorch] + # isort: on + + +def best_practices_set_early() -> None: + # [best-practices-set-early] + import rmm + + # Do this first, before any GPU allocations + rmm.mr.set_current_device_resource(rmm.mr.CudaAsyncMemoryResource()) + # [/best-practices-set-early] + + +if __name__ == "__main__": + recommended_default() + managed_pool_prefetch() + managed_memory_example() + pool_memory_example() + composing_adaptor() + prefetch_composition() + statistics_composition() + logging_composition() + multi_library_pytorch() + best_practices_set_early() + + print("All choosing_memory_resources examples passed.") diff --git a/python/rmm/rmm/tests/examples/guide.py b/python/rmm/rmm/tests/examples/guide.py new file mode 100644 index 000000000..1cc6ef0c4 --- /dev/null +++ b/python/rmm/rmm/tests/examples/guide.py @@ -0,0 +1,257 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/guide.md + + +def explicit_resource() -> None: + # [explicit-resource] + import rmm + + mr = rmm.mr.CudaAsyncMemoryResource() + + # Pass the resource explicitly + buffer = rmm.DeviceBuffer(size=1024, mr=mr) + # [/explicit-resource] + + assert buffer.size == 1024 + + +def current_device_resource() -> None: + # [current-device-resource] + import rmm + + async_mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(async_mr) + + # Allocations that don't specify a resource use the current device resource + mr = rmm.mr.get_current_device_resource() + # [/current-device-resource] + + assert mr is not None + + +def device_buffer_example() -> None: + # [device-buffer] + import rmm + + # Allocate 1024 bytes + buffer = rmm.DeviceBuffer(size=1024) + + # Access pointer and size + ptr = buffer.ptr + size = buffer.size + + # Resize (may reallocate) + buffer.resize(2048) + + # Copy construct (deep copy) + buffer2 = buffer.copy() + # [/device-buffer] + + assert buffer.size == 2048 + assert buffer2.size == 2048 + _ = ptr, size + + +def statistics_tracking() -> None: + # [statistics-tracking] + import rmm + + # Wrap base resource with statistics adaptor + cuda_mr = rmm.mr.CudaAsyncMemoryResource() + stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr) + + # Allocate using the statistics-wrapped resource + buffer = rmm.DeviceBuffer(size=1024, mr=stats_mr) + + # Get statistics + stats = stats_mr.allocation_counts + print(f"Current bytes: {stats.current_bytes}") + print(f"Peak bytes: {stats.peak_bytes}") + print(f"Total bytes: {stats.total_bytes}") + # [/statistics-tracking] + + assert stats.current_bytes >= 1024 + _ = buffer + + +def logging_example() -> None: + # [logging] + import rmm + + base_mr = rmm.mr.CudaAsyncMemoryResource() + log_mr = rmm.mr.LoggingResourceAdaptor( + base_mr, log_file_name="allocations.csv" + ) + + # Allocations through log_mr are logged to CSV + buffer = 
rmm.DeviceBuffer(size=1024, mr=log_mr) + # [/logging] + + assert buffer.size == 1024 + + import os + + if os.path.exists("allocations.csv"): + os.remove("allocations.csv") + + +def composing_resources() -> None: + # [composing-resources] + import rmm + + # Base resource + cuda_mr = rmm.mr.CudaMemoryResource() + + # Add pool + pool_mr = rmm.mr.PoolMemoryResource(cuda_mr, initial_pool_size=2**30) + + # Add statistics + stats_mr = rmm.mr.StatisticsResourceAdaptor(pool_mr) + + # Add logging + log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv") + + # Use log_mr for allocations — all allocations are pooled, tracked, and logged + buffer = rmm.DeviceBuffer(size=1024, mr=log_mr) + # [/composing-resources] + + assert buffer.size == 1024 + + import os + + if os.path.exists("log.csv"): + os.remove("log.csv") + + +def cupy_example() -> None: + try: + import cupy as cp + except ImportError: + print("CuPy not available, skipping cupy_example") + return + + # isort: off + # [cupy] + import rmm + import cupy as cp + from rmm.allocators.cupy import rmm_cupy_allocator + + # Configure RMM + mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(mr) + + # Set CuPy to use RMM + cp.cuda.set_allocator(rmm_cupy_allocator) + + # All CuPy arrays now use RMM + array = cp.zeros(1000) + # [/cupy] + # isort: on + + assert array.shape == (1000,) + + +def numba_example() -> None: + try: + from numba import cuda + except ImportError: + print("Numba not available, skipping numba_example") + return + + # isort: off + # [numba] + from numba import cuda + from rmm.allocators.numba import RMMNumbaManager + import rmm + + # Configure RMM + mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(mr) + + # Set Numba to use RMM + cuda.set_memory_manager(RMMNumbaManager) + # [/numba] + # isort: on + + +def pytorch_example() -> None: + try: + import torch + except ImportError: + print("PyTorch not available, skipping pytorch_example") + return + + # 
isort: off + # [pytorch] + import rmm + import torch + from rmm.allocators.torch import rmm_torch_allocator + + # Configure RMM + mr = rmm.mr.CudaAsyncMemoryResource() + rmm.mr.set_current_device_resource(mr) + + # Set PyTorch to use RMM + torch.cuda.memory.change_current_allocator(rmm_torch_allocator) + + # All PyTorch tensors now use RMM + tensor = torch.zeros(1000, device="cuda") + # [/pytorch] + # isort: on + + assert tensor.shape == (1000,) + + +def multi_device_example() -> None: + try: + from cuda.bindings import runtime + except ImportError: + print("cuda.bindings not available, skipping multi_device_example") + return + + _, num_devices = runtime.cudaGetDeviceCount() + if num_devices < 1: + print("No CUDA devices, skipping multi_device_example") + return + + # isort: off + # [multi-device] + import rmm + from cuda.bindings import runtime + + _, num_devices = runtime.cudaGetDeviceCount() + + # Store resources to maintain lifetime + resources = [] + + for device_id in range(num_devices): + # Create resource for this device + mr = rmm.mr.CudaAsyncMemoryResource() + resources.append(mr) + + # Set as per-device resource + rmm.mr.set_per_device_resource(device_id, mr) + + # Use device 0 + buffer = rmm.DeviceBuffer(size=1024) # Uses device 0's resource + # [/multi-device] + # isort: on + + assert buffer.size == 1024 + + +if __name__ == "__main__": + explicit_resource() + current_device_resource() + device_buffer_example() + statistics_tracking() + logging_example() + composing_resources() + cupy_example() + numba_example() + pytorch_example() + multi_device_example() + + print("All guide examples passed.") diff --git a/python/rmm/rmm/tests/examples/installation.py b/python/rmm/rmm/tests/examples/installation.py new file mode 100644 index 000000000..37a1854b8 --- /dev/null +++ b/python/rmm/rmm/tests/examples/installation.py @@ -0,0 +1,24 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
+# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/installation.md + + +def test_installation() -> None: + # [test-installation] + import rmm + + print(rmm.__version__) + + # Quick test + buffer = rmm.DeviceBuffer(size=100) + print(f"Allocated {buffer.size} bytes") + # [/test-installation] + + assert buffer.size == 100 + + +if __name__ == "__main__": + test_installation() + + print("All installation examples passed.") diff --git a/python/rmm/rmm/tests/examples/introduction.py b/python/rmm/rmm/tests/examples/introduction.py new file mode 100644 index 000000000..52cd5850f --- /dev/null +++ b/python/rmm/rmm/tests/examples/introduction.py @@ -0,0 +1,13 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. +# SPDX-License-Identifier: Apache-2.0 + +# Code examples for docs/user_guide/introduction.md + +# [basic-example] +import rmm + +mr = rmm.mr.CudaAsyncMemoryResource() +buffer = rmm.DeviceBuffer(size=1024, mr=mr) +# [/basic-example] + +assert buffer.size == 1024 diff --git a/python/rmm/rmm/tests/examples/logging.py b/python/rmm/rmm/tests/examples/logging.py new file mode 100644 index 000000000..5fab3d6f1 --- /dev/null +++ b/python/rmm/rmm/tests/examples/logging.py @@ -0,0 +1,387 @@ +# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION. 
# SPDX-License-Identifier: Apache-2.0

# Code examples for docs/user_guide/logging.md
#
# Each function wraps one snippet. The text between a ``# [name]`` and
# ``# [/name]`` marker pair is presumably what the user guide renders, so
# those regions are kept verbatim; assertions and cleanup live outside them.

import os


def _remove_log_files(requested_name, log_mr=None) -> None:
    """Delete the CSV log produced by an example, whatever it was named.

    Parameters
    ----------
    requested_name : str
        The ``log_file_name`` the example passed to ``LoggingResourceAdaptor``.
    log_mr : optional
        The logging adaptor that was created. When given, the path reported by
        its ``get_file_name()`` is removed as well.

    Notes
    -----
    NOTE(review): RMM's Python ``LoggingResourceAdaptor`` is documented to
    insert the device ID before the extension (``memory_log.dev0.csv``), so
    the file on disk may not be ``requested_name``. Both candidates are
    removed so the cleanup works either way.
    """
    candidates = {os.path.abspath(requested_name)}
    if log_mr is not None:
        candidates.add(os.path.abspath(log_mr.get_file_name()))
    for path in candidates:
        if os.path.exists(path):
            os.remove(path)


def logging_adaptor() -> None:
    """Log two allocations to CSV through ``LoggingResourceAdaptor``."""
    # [logging-adaptor]
    import rmm

    base_mr = rmm.mr.CudaAsyncMemoryResource()
    log_mr = rmm.mr.LoggingResourceAdaptor(
        base_mr, log_file_name="memory_log.csv"
    )

    # Allocations through log_mr are logged to CSV
    buf1 = rmm.DeviceBuffer(size=1024, mr=log_mr)
    buf2 = rmm.DeviceBuffer(size=2048, mr=log_mr)
    # [/logging-adaptor]

    assert buf1.size == 1024
    assert buf2.size == 2048

    # Remove the log under its real on-disk name, not only the requested one.
    _remove_log_files("memory_log.csv", log_mr)


def statistics_adaptor() -> None:
    """Read allocation counters from a ``StatisticsResourceAdaptor``."""
    # [statistics-adaptor]
    import rmm

    cuda_mr = rmm.mr.CudaAsyncMemoryResource()
    stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr)

    # Allocate using the statistics-wrapped resource
    buf1 = rmm.DeviceBuffer(size=1024, mr=stats_mr)
    buf2 = rmm.DeviceBuffer(size=2048, mr=stats_mr)

    # Get statistics
    stats = stats_mr.allocation_counts
    print(f"Current bytes: {stats.current_bytes}")
    print(f"Peak bytes: {stats.peak_bytes}")
    print(f"Total allocations: {stats.total_count}")
    # [/statistics-adaptor]

    assert stats.current_bytes >= 1024
    # Reference the buffers so they are not reported as unused.
    _ = buf1, buf2


def statistics_global() -> None:
    """Enable statistics globally and via the context manager."""
    # [statistics-global]
    import rmm

    # Enable statistics globally
    rmm.statistics.enable_statistics()

    # Or use context manager for specific code blocks
    with rmm.statistics.statistics():
        buffer = rmm.DeviceBuffer(size=1024)

        stats = rmm.statistics.get_statistics()
        assert stats is not None
        print(f"Current bytes: {stats.current_bytes}")
        print(f"Peak bytes: {stats.peak_bytes}")
        print(f"Total allocations: {stats.total_count}")
    # [/statistics-global]

    _ = buffer


def tracking_memory_growth() -> None:
    """Print statistics checkpoints while allocating and releasing buffers."""
    # [tracking-memory-growth]
    import rmm

    rmm.statistics.enable_statistics()

    def checkpoint(label) -> None:
        stats = rmm.statistics.get_statistics()
        assert stats is not None
        print(f"{label}:")
        print(
            f" Current: {stats.current_bytes:,} bytes ({stats.current_count} allocations)"
        )
        print(f" Peak: {stats.peak_bytes:,} bytes")

    checkpoint("Start")

    # Allocate
    buffers = [rmm.DeviceBuffer(size=1024 * 1024) for _ in range(10)]
    checkpoint("After 10x1MB allocations")

    # Free some
    buffers = buffers[:5]
    checkpoint("After freeing 5")

    # Allocate more
    buffers.extend([rmm.DeviceBuffer(size=2 * 1024 * 1024) for _ in range(5)])
    checkpoint("After 5x2MB allocations")
    # [/tracking-memory-growth]


def profiling_functions() -> None:
    """Profile a function with the ``rmm.statistics.profiler`` decorator."""
    # [profiling-functions]
    import rmm

    # Enable statistics first
    rmm.statistics.enable_statistics()

    # Profile a function
    @rmm.statistics.profiler()
    def process_data(size):
        buffer = rmm.DeviceBuffer(size=size)
        # ... processing ...
        return buffer

    # Run function
    process_data(1000000)

    # View report
    print(rmm.statistics.default_profiler_records.report())
    # [/profiling-functions]


def profiling_code_blocks() -> None:
    """Profile named code blocks with the profiler as a context manager."""
    # [profiling-code-blocks]
    import rmm

    rmm.statistics.enable_statistics()

    # Profile specific code blocks
    with rmm.statistics.profiler(name="data loading"):
        data = rmm.DeviceBuffer(size=1000000)

    with rmm.statistics.profiler(name="processing"):
        buffer1 = rmm.DeviceBuffer(size=500000)
        buffer2 = rmm.DeviceBuffer(size=500000)

    # View report
    print(rmm.statistics.default_profiler_records.report())
    # [/profiling-code-blocks]

    _ = data, buffer1, buffer2


def nested_profiling() -> None:
    """Nest one named profiler block inside another and print the report."""
    # [nested-profiling]
    import rmm

    rmm.statistics.enable_statistics()

    with rmm.statistics.profiler(name="outer"):
        buffer1 = rmm.DeviceBuffer(size=1000)

        with rmm.statistics.profiler(name="inner"):
            buffer2 = rmm.DeviceBuffer(size=2000)

        buffer3 = rmm.DeviceBuffer(size=500)

    print(rmm.statistics.default_profiler_records.report())
    # [/nested-profiling]

    _ = buffer1, buffer2, buffer3


def custom_profiler_records() -> None:
    """Collect profiler output in a caller-owned ``ProfilerRecords``."""
    # [custom-profiler-records]
    import rmm

    rmm.statistics.enable_statistics()

    # Create custom profiler records
    custom_records = rmm.statistics.ProfilerRecords()

    # Use with context manager
    with rmm.statistics.profiler(name="my operation", records=custom_records):
        buffer = rmm.DeviceBuffer(size=1024)

    # View only custom records
    print(custom_records.report())
    # [/custom-profiler-records]

    _ = buffer


def debug_log_level() -> None:
    """Switch the RMM logging level to ``trace`` and then back to ``info``."""
    # [debug-log-level]
    import rmm

    # Available levels: trace, debug, info, warn, error, critical, off
    rmm.set_logging_level(rmm.level_enum.trace)
    # [/debug-log-level]

    # Reset to default
    rmm.set_logging_level(rmm.level_enum.info)


def combining_features() -> None:
    """Stack statistics and logging adaptors and use the profiler on top."""
    # [combining-features]
    import rmm

    # Set debug log level
    rmm.set_logging_level(rmm.level_enum.debug)

    # Build resource stack: statistics + logging
    cuda_mr = rmm.mr.CudaAsyncMemoryResource()
    stats_mr = rmm.mr.StatisticsResourceAdaptor(cuda_mr)
    log_mr = rmm.mr.LoggingResourceAdaptor(
        stats_mr, log_file_name="events.csv"
    )

    # All allocations through log_mr are tracked and logged
    buffer = rmm.DeviceBuffer(size=1024, mr=log_mr)

    # Get statistics
    stats = stats_mr.allocation_counts
    print(f"Peak bytes: {stats.peak_bytes}")

    # Profiling can also be used alongside event logging
    rmm.statistics.enable_statistics()

    @rmm.statistics.profiler()
    def my_function():
        return rmm.DeviceBuffer(size=1024, mr=log_mr)

    my_function()
    print(rmm.statistics.default_profiler_records.report())
    # [/combining-features]

    # Reset to default
    rmm.set_logging_level(rmm.level_enum.info)
    _ = buffer

    # Remove the log under its real on-disk name, not only the requested one.
    _remove_log_files("events.csv", log_mr)


def debugging_oom() -> None:
    """Request 100 GiB and report peak usage if the allocation fails.

    Raises
    ------
    MemoryError
        Re-raised after the diagnostics are printed. The ``__main__`` block
        below deliberately does not call this function for that reason.
    """
    # [debugging-oom]
    import rmm

    # Enable detailed logging
    base_mr = rmm.mr.CudaAsyncMemoryResource()
    stats_mr = rmm.mr.StatisticsResourceAdaptor(base_mr)
    log_mr = rmm.mr.LoggingResourceAdaptor(
        stats_mr, log_file_name="oom_debug.csv"
    )
    rmm.set_logging_level(rmm.level_enum.debug)

    # Run problematic code
    try:
        large_buffer = rmm.DeviceBuffer(size=100 * 2**30, mr=log_mr)  # noqa: F841
    except MemoryError:
        stats = stats_mr.allocation_counts
        print(f"Peak before OOM: {stats.peak_bytes / 2**30:.2f} GiB")
        print("Check oom_debug.csv for allocation history")
        raise
    # [/debugging-oom]


def profiling_pipeline() -> None:
    """Profile a three-stage load/process/save pipeline."""
    # [profiling-pipeline]
    import rmm

    rmm.statistics.enable_statistics()

    @rmm.statistics.profiler()
    def load_data():
        return rmm.DeviceBuffer(size=1000000)

    @rmm.statistics.profiler()
    def process_data(buffer):
        temp = rmm.DeviceBuffer(size=2000000)  # noqa: F841
        result = rmm.DeviceBuffer(size=500000)
        return result

    @rmm.statistics.profiler()
    def save_data(buffer):
        pass

    # Run pipeline
    data = load_data()
    result = process_data(data)
    save_data(result)

    # Identify memory hotspots
    print(rmm.statistics.default_profiler_records.report())
    # [/profiling-pipeline]


def benchmarking_resources() -> None:
    """Time 1000 small allocations on three different memory resources."""
    # isort: off
    # [benchmarking-resources]
    import rmm
    import time

    def benchmark_allocations(mr_name, mr) -> None:
        start = time.time()
        buffers = []
        for _ in range(1000):
            buffers.append(rmm.DeviceBuffer(size=1024, mr=mr))
        end = time.time()

        print(f"{mr_name}: {(end - start) * 1000:.2f} ms for 1000 allocations")

    # Compare resources
    benchmark_allocations("CudaMemoryResource", rmm.mr.CudaMemoryResource())
    benchmark_allocations(
        "CudaAsyncMemoryResource", rmm.mr.CudaAsyncMemoryResource()
    )
    pool_mr = rmm.mr.PoolMemoryResource(
        rmm.mr.CudaMemoryResource(), initial_pool_size=2**20
    )
    benchmark_allocations("PoolMemoryResource", pool_mr)
    # [/benchmarking-resources]
    # isort: on


def analyzing_logs() -> None:
    """Generate a small allocation log and analyze it with pandas.

    The example is skipped, after cleaning up, when pandas is not installed.
    """
    # Generate a sample log file for the example
    import shutil

    import rmm

    base_mr = rmm.mr.CudaAsyncMemoryResource()
    log_mr = rmm.mr.LoggingResourceAdaptor(
        base_mr, log_file_name="memory_log.csv"
    )
    buf1 = rmm.DeviceBuffer(size=1024, mr=log_mr)
    buf2 = rmm.DeviceBuffer(size=2048, mr=log_mr)
    # Free one buffer so the log also contains a deallocation row.
    del buf1

    try:
        import pandas as pd  # type: ignore[import-untyped]
    except ImportError:
        print("pandas not available, skipping analyzing_logs")
        _remove_log_files("memory_log.csv", log_mr)
        return

    # The snippet below reads "memory_log.csv". Make sure that file exists and
    # is complete: flush buffered rows, then copy the log from the path the
    # adaptor actually wrote to if that differs from the requested name.
    log_mr.flush()
    actual_log = os.path.abspath(log_mr.get_file_name())
    if actual_log != os.path.abspath("memory_log.csv"):
        shutil.copyfile(actual_log, "memory_log.csv")

    # [analyzing-logs]
    import pandas as pd

    # Read log file
    df = pd.read_csv("memory_log.csv")

    # Total bytes allocated
    total_allocated = df[df["Action"] == "allocate"]["Size"].sum()
    print(f"Total allocated: {total_allocated:,} bytes")

    # Allocation size distribution
    print(df[df["Action"] == "allocate"]["Size"].describe())

    # Peak memory usage (simple analysis)
    df["Delta"] = df.apply(
        lambda row: row["Size"]
        if row["Action"] == "allocate"
        else -row["Size"],
        axis=1,
    )
    df["Cumulative"] = df["Delta"].cumsum()
    peak = df["Cumulative"].max()
    print(f"Peak usage: {peak:,} bytes")
    # [/analyzing-logs]

    _ = buf2

    _remove_log_files("memory_log.csv", log_mr)


if __name__ == "__main__":
    logging_adaptor()
    statistics_adaptor()
    statistics_global()
    tracking_memory_growth()
    profiling_functions()
    profiling_code_blocks()
    nested_profiling()
    custom_profiler_records()
    debug_log_level()
    combining_features()
    # debugging_oom() — intentionally skipped (raises MemoryError)
    profiling_pipeline()
    benchmarking_resources()
    analyzing_logs()

    print("All logging examples passed.")

# ---------------------------------------------------------------------------
# Patch metadata that shared the last collapsed source line with this file
# (header of the next file in the diff), preserved as comments:
#
# diff --git a/python/rmm/rmm/tests/examples/managed_memory.py b/python/rmm/rmm/tests/examples/managed_memory.py
# new file mode 100644
# index 000000000..642cd390d
# --- /dev/null
# +++ b/python/rmm/rmm/tests/examples/managed_memory.py
# @@ -0,0 +1,65 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

# Code examples for docs/user_guide/managed_memory.md
#
# The text between a ``# [name]`` and ``# [/name]`` marker pair is presumably
# what the user guide renders, so those regions are kept verbatim; assertions
# and cleanup live outside them.

import os


def prefetch_on_allocate() -> None:
    """Allocate managed memory through a ``PrefetchResourceAdaptor``."""
    # [prefetch-on-allocate]
    import rmm

    managed_mr = rmm.mr.ManagedMemoryResource()
    prefetch_mr = rmm.mr.PrefetchResourceAdaptor(managed_mr)

    # This allocation is prefetched to the GPU automatically
    buffer = rmm.DeviceBuffer(size=1000000, mr=prefetch_mr)
    # [/prefetch-on-allocate]

    assert buffer.size == 1000000


def prefetch_with_pool() -> None:
    """Stack pool, prefetch, statistics and logging over managed memory."""
    # [prefetch-with-pool]
    import rmm

    managed_mr = rmm.mr.ManagedMemoryResource()
    pool_mr = rmm.mr.PoolMemoryResource(managed_mr, initial_pool_size=2**30)
    prefetch_mr = rmm.mr.PrefetchResourceAdaptor(pool_mr)

    # Logging and statistics don't allocate, so they can go on the outside
    stats_mr = rmm.mr.StatisticsResourceAdaptor(prefetch_mr)
    log_mr = rmm.mr.LoggingResourceAdaptor(stats_mr, log_file_name="log.csv")

    buffer = rmm.DeviceBuffer(size=1000000, mr=log_mr)
    # [/prefetch-with-pool]

    assert buffer.size == 1000000

    # NOTE(review): RMM's Python LoggingResourceAdaptor is documented to insert
    # the device ID before the extension (e.g. "log.dev0.csv"), so the file on
    # disk may not be "log.csv". Remove both the requested name and the path
    # the adaptor reports so no log is left behind either way.
    leftovers = {
        os.path.abspath("log.csv"),
        os.path.abspath(log_mr.get_file_name()),
    }
    for path in leftovers:
        if os.path.exists(path):
            os.remove(path)


def prefetch_on_access() -> None:
    """Prefetch an existing managed buffer to device 0 on a stream."""
    # [prefetch-on-access]
    import rmm
    from rmm.pylibrmm.stream import Stream

    managed_mr = rmm.mr.ManagedMemoryResource()
    buffer = rmm.DeviceBuffer(size=1000000, mr=managed_mr)

    # Prefetch to device 0 on this stream
    stream = Stream()
    buffer.prefetch(device=0, stream=stream)

    # Kernel on the same stream finds the data already resident
    # [/prefetch-on-access]


if __name__ == "__main__":
    prefetch_on_allocate()
    prefetch_with_pool()
    prefetch_on_access()

    print("All managed_memory examples passed.")

# ---------------------------------------------------------------------------
# Patch metadata that shared this collapsed source line with the file above
# (header of the next file in the diff), preserved as comments:
#
# diff --git a/python/rmm/rmm/tests/examples/stream_ordered_allocation.py b/python/rmm/rmm/tests/examples/stream_ordered_allocation.py
# new file mode 100644
# index 000000000..a7c45d7f0
# --- /dev/null
# +++ b/python/rmm/rmm/tests/examples/stream_ordered_allocation.py
# @@ -0,0 +1,149 @@
# SPDX-FileCopyrightText:
# Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

# Code examples for docs/user_guide/stream_ordered_allocation.md
#
# The text between a ``# [name]`` and ``# [/name]`` marker pair is presumably
# what the user guide renders, so those regions are left exactly as written;
# only docstrings outside the markers have been added.


def how_it_works() -> None:
    """Allocate a ``DeviceBuffer`` on an RMM ``Stream`` from an async resource."""
    # [how-it-works]
    import rmm
    from rmm.pylibrmm.stream import Stream

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream = Stream()
    buffer = rmm.DeviceBuffer(size=1000, stream=stream, mr=mr)

    # buffer.ptr is usable immediately in stream-ordered operations
    # [/how-it-works]

    assert buffer.size == 1000


def reading_results() -> None:
    """Copy a device buffer into a host ``bytearray`` and synchronize the stream."""
    # [reading-results]
    import rmm
    from rmm.pylibrmm.stream import Stream

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream = Stream()
    d_buf = rmm.DeviceBuffer(size=1000, stream=stream, mr=mr)

    # ... GPU work writes to d_buf on stream ...

    # Async copy to host on the same stream, then sync before reading
    h_buf = bytearray(d_buf.size)
    d_buf.copy_to_host(h_buf, stream)
    stream.synchronize()
    # [/reading-results]


def cross_stream() -> None:
    """Allocate on one stream and make a second stream wait on an event.

    Uses ``cuda.core`` streams and events; the allocation stream is wrapped in
    an RMM ``Stream`` via ``Stream(obj=...)``.
    """
    # isort: off
    # [cross-stream]
    import rmm
    from rmm.pylibrmm.stream import Stream
    from cuda.core import Device

    dev = Device()
    dev.set_current()

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream_a = dev.create_stream()
    stream_b = dev.create_stream()

    buffer = rmm.DeviceBuffer(size=1000, stream=Stream(obj=stream_a), mr=mr)

    # Record an event after the allocation on stream_a
    alloc_event = dev.create_event(options={"enable_timing": False})
    stream_a.record(alloc_event)

    # stream_b waits for the event — no CPU synchronization needed
    stream_b.wait(alloc_event)

    # Now safe to use buffer.ptr in operations on stream_b
    # [/cross-stream]
    # isort: on

    assert buffer.size == 1000


def buffer_lifetime() -> None:
    """Order buffer destruction after work on a second stream.

    An event makes ``stream_b`` wait for the allocation on ``stream_a``; a
    second event makes ``stream_a`` wait for ``stream_b`` before the buffer is
    deleted.
    """
    # isort: off
    # [buffer-lifetime]
    import rmm
    from rmm.pylibrmm.stream import Stream
    from cuda.core import Device

    dev = Device()
    dev.set_current()

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream_a = dev.create_stream()
    stream_b = dev.create_stream()

    buffer = rmm.DeviceBuffer(size=1000, stream=Stream(obj=stream_a), mr=mr)

    # Make stream_b wait for the allocation on stream_a
    alloc_event = dev.create_event(options={"enable_timing": False})
    stream_a.record(alloc_event)
    stream_b.wait(alloc_event)

    # Use buffer on stream_b ...

    # Before destroying buffer, make stream_a wait for stream_b's work
    done_event = dev.create_event(options={"enable_timing": False})
    stream_b.record(done_event)
    stream_a.wait(done_event)

    # Now safe to destroy buffer
    del buffer
    # [/buffer-lifetime]
    # isort: on


def numba_stream_example() -> None:
    """Launch a Numba kernel on the stream an RMM buffer was allocated on.

    Prints a message and returns early when Numba cannot be imported.
    """
    # Probe for Numba first so the example is skipped instead of failing; the
    # import is repeated inside the markers so the snippet shows it.
    try:
        from numba import cuda
    except ImportError:
        print("Numba not available, skipping numba_stream_example")
        return

    # isort: off
    # [numba-stream]
    import rmm
    from rmm.pylibrmm.stream import Stream
    from cuda.core import Device
    from numba import cuda

    dev = Device()
    dev.set_current()

    @cuda.jit
    def kernel(data, n):
        idx = cuda.grid(1)
        if idx < n:
            data[idx] = idx * 2

    mr = rmm.mr.CudaAsyncMemoryResource()
    stream = dev.create_stream()

    buffer = rmm.DeviceBuffer(size=1000 * 4, stream=Stream(obj=stream), mr=mr)

    numba_stream = cuda.external_stream(int(stream.handle))
    kernel[100, 10, numba_stream](
        cuda.as_cuda_array(buffer).view("float32"), 1000
    )

    stream.sync()
    # [/numba-stream]
    # isort: on


if __name__ == "__main__":
    how_it_works()
    reading_results()
    cross_stream()
    buffer_lifetime()
    numba_stream_example()

    print("All stream_ordered_allocation examples passed.")

# ---------------------------------------------------------------------------
# Patch metadata that shared the last collapsed source line with this file
# (header of the next file in the diff), preserved as comments:
#
# diff --git a/python/rmm/rmm/tests/test_doc_examples.py b/python/rmm/rmm/tests/test_doc_examples.py
# new file mode 100644
# index 000000000..6130b3876
# --- /dev/null
# +++ b/python/rmm/rmm/tests/test_doc_examples.py
# @@ -0,0 +1,29 @@
# SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION.
# SPDX-License-Identifier: Apache-2.0

"""Run every user-guide example script as a standalone program."""

import subprocess
import sys
from pathlib import Path

import pytest

# Directory holding one runnable script per user-guide page.
EXAMPLES_DIR = Path(__file__).parent / "examples"

# Sorted so the parametrized test IDs have a stable order.
EXAMPLE_SCRIPTS = sorted(EXAMPLES_DIR.glob("*.py"))


@pytest.mark.parametrize(
    "script",
    EXAMPLE_SCRIPTS,
    ids=[s.stem for s in EXAMPLE_SCRIPTS],
)
def test_doc_example(script, tmp_path):
    """Execute one example script and require a zero exit status.

    Parameters
    ----------
    script : pathlib.Path
        Path of the example to run with the current interpreter.
    tmp_path : pathlib.Path
        pytest-provided scratch directory used as the working directory.
        Examples write CSV logs relative to the current directory, so running
        them here keeps those files out of the source tree and isolates
        parametrized runs from each other.
    """
    result = subprocess.run(
        # Resolve first: the child runs with a different working directory.
        [sys.executable, str(script.resolve())],
        capture_output=True,
        text=True,
        timeout=120,
        cwd=tmp_path,
    )
    # Include stdout too: the examples print progress there, which shows how
    # far a failing script got.
    assert result.returncode == 0, (
        f"{script.name} failed (exit {result.returncode}):\n{result.stderr}"
        f"\n--- stdout ---\n{result.stdout}"
    )