Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-129_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ dependencies:
- c-compiler
- cloudpickle
- cmake>=3.26.4,!=3.30.0
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
- cuda-core>=0.3.2
- cuda-cudart-dev
- cuda-nvcc
Expand All @@ -28,7 +30,6 @@ dependencies:
- ninja
- numba-cuda>=0.22.1,<0.29.0
- numpy>=1.23,<3.0
- nvidia-ml-py>=12
- pip
- pkg-config
- pre-commit
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-129_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ dependencies:
- c-compiler
- cloudpickle
- cmake>=3.26.4,!=3.30.0
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
- cuda-core>=0.3.2
- cuda-cudart-dev
- cuda-nvcc
Expand All @@ -28,7 +30,6 @@ dependencies:
- ninja
- numba-cuda>=0.22.1,<0.29.0
- numpy>=1.23,<3.0
- nvidia-ml-py>=12
- pip
- pkg-config
- pre-commit
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-131_arch-aarch64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ dependencies:
- c-compiler
- cloudpickle
- cmake>=3.26.4,!=3.30.0
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
- cuda-core>=0.3.2
- cuda-cudart-dev
- cuda-nvcc
Expand All @@ -28,7 +30,6 @@ dependencies:
- ninja
- numba-cuda>=0.22.1,<0.29.0
- numpy>=1.23,<3.0
- nvidia-ml-py>=12
- pip
- pkg-config
- pre-commit
Expand Down
3 changes: 2 additions & 1 deletion conda/environments/all_cuda-131_arch-x86_64.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,8 @@ dependencies:
- c-compiler
- cloudpickle
- cmake>=3.26.4,!=3.30.0
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
- cuda-core>=0.3.2
- cuda-cudart-dev
- cuda-nvcc
Expand All @@ -28,7 +30,6 @@ dependencies:
- ninja
- numba-cuda>=0.22.1,<0.29.0
- numpy>=1.23,<3.0
- nvidia-ml-py>=12
- pip
- pkg-config
- pre-commit
Expand Down
11 changes: 6 additions & 5 deletions conda/recipes/ucxx/recipe.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,6 @@ outputs:
host:
- cuda-version =${{ cuda_version }}
- cython >=3.2.2
# 'nvidia-ml-py' provides the 'pynvml' module
- nvidia-ml-py>=12
- pip
- python =${{ py_abi_min }}
- python-abi3 ${{ py_abi_min }}.*
Expand All @@ -98,16 +96,19 @@ outputs:
- ucx
- libucxx =${{ version }}
- cuda-cudart-dev
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
# TODO: Change to cuda-core >= 1.0.0 once that's released
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
run:
- cuda-core >=0.3.2
- numpy >=1.23,<3.0
# 'nvidia-ml-py' provides the 'pynvml' module
- nvidia-ml-py>=12
- python
- ucx >=1.18.0,<1.21.0
- ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }}
- ${{ pin_compatible("rmm", upper_bound="x.x") }}
- libucxx =${{ version }}
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
# TODO: Change to cuda-core >= 1.0.0 once that's released
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
run_constraints:
- cupy >=13.6.0
ignore_run_exports:
Expand Down
6 changes: 3 additions & 3 deletions dependencies.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -346,9 +346,9 @@ dependencies:
- output_types: [conda, requirements, pyproject]
packages:
- &numpy numpy>=1.23,<3.0
# 'nvidia-ml-py' provides the 'pynvml' module
- nvidia-ml-py>=12
- cuda-core>=0.3.2
- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*
# TODO: Change to cuda-core >= 1.0.0 once that's released
- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core
run_python_distributed_ucxx:
common:
- output_types: [conda, requirements, pyproject]
Expand Down
4 changes: 2 additions & 2 deletions python/ucxx/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,10 @@ authors = [
license = "BSD-3-Clause"
requires-python = ">=3.11"
dependencies = [
"cuda-core>=0.3.2",
"cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*",
"cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core",
"libucxx==0.50.*,>=0.0.0a0",
"numpy>=1.23,<3.0",
"nvidia-ml-py>=12",
"rmm==26.6.*,>=0.0.0a0",
] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
classifiers = [
Expand Down
36 changes: 11 additions & 25 deletions python/ucxx/ucxx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,10 +33,7 @@
from .core import * # noqa: E402, F403
from .utils import get_address, get_ucxpy_logger # noqa: E402

try:
import pynvml
except ImportError:
pynvml = None
from cuda.core import system

_ucx_version = get_ucx_version() # noqa: F405
__ucx_min_version__ = "1.18.0"
Expand All @@ -62,41 +59,30 @@
os.environ["UCX_RNDV_FRAG_MEM_TYPE"] = "cuda"

if (
pynvml is not None
and "UCX_CUDA_COPY_MAX_REG_RATIO" not in os.environ
and _ucx_version >= (1, 12, 0)
):
try:
pynvml.nvmlInit()
device_count = pynvml.nvmlDeviceGetCount()
device_count = system.Device.get_device_count()
large_bar1 = [False] * device_count

def _is_mig_device(handle):
try:
pynvml.nvmlDeviceGetMigMode(handle)[0]
except pynvml.NVMLError:
return False
return True

for dev_idx in range(device_count):
handle = pynvml.nvmlDeviceGetHandleByIndex(dev_idx)

for dev_idx, device in enumerate(system.Device.get_all_devices()):
try:
total_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total
except pynvml.NVMLError_NotSupported:
total_memory = device.memory_info.total
except system.NotSupportedError:
total_memory = None

# Ignore MIG devices and devices with no memory resource (i.e., only
# integrated CPU+GPU memory resource) and rely on UCX's default for
# now. Increasing `UCX_CUDA_COPY_MAX_REG_RATIO` should be thoroughly
# tested, as it's not yet clear whether it would be safe to set `1.0`
# for those instances too.
if _is_mig_device(handle) or total_memory is None:
if device.mig.is_mig_device or total_memory is None:
continue

try:
bar1_total = pynvml.nvmlDeviceGetBAR1MemoryInfo(handle).bar1Total
except pynvml.NVMLError_NotSupported:
bar1_total = device.bar1_memory_info.total
except system.NotSupportedError:
# Bar1 access not supported on this device, set it to
# zero (always lower than device memory).
bar1_total = 0
Expand All @@ -108,9 +94,9 @@ def _is_mig_device(handle):
logger.info("Setting UCX_CUDA_COPY_MAX_REG_RATIO=1.0")
os.environ["UCX_CUDA_COPY_MAX_REG_RATIO"] = "1.0"
except (
pynvml.NVMLError_LibraryNotFound,
pynvml.NVMLError_DriverNotLoaded,
pynvml.NVMLError_Unknown,
system.NotFoundError,
system.DriverNotLoadedError,
system.UnknownError,
):
pass

Expand Down
10 changes: 4 additions & 6 deletions python/ucxx/ucxx/_lib_async/utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,9 @@

from ucxx._lib_async.pytest_stash_keys import ASYNCIO_PLUGIN_TIMEOUT_STASH_KEY

from cuda.core import system


normal_env = {
"UCX_RNDV_SCHEME": "put_zcopy",
"UCX_MEMTYPE_CACHE": "n",
Expand All @@ -27,12 +30,7 @@ def set_env():


def get_num_gpus():
import pynvml

pynvml.nvmlInit()
ngpus = pynvml.nvmlDeviceGetCount()
pynvml.nvmlShutdown()
return ngpus
return system.Device.get_device_count()


def get_cuda_devices():
Expand Down
Loading