diff --git a/conda/environments/all_cuda-129_arch-aarch64.yaml b/conda/environments/all_cuda-129_arch-aarch64.yaml index 3047261a..d72a9d66 100644 --- a/conda/environments/all_cuda-129_arch-aarch64.yaml +++ b/conda/environments/all_cuda-129_arch-aarch64.yaml @@ -10,6 +10,8 @@ dependencies: - c-compiler - cloudpickle - cmake>=3.26.4,!=3.30.0 +- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* +- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core - cuda-core>=0.3.2 - cuda-cudart-dev - cuda-nvcc @@ -28,7 +30,6 @@ dependencies: - ninja - numba-cuda>=0.22.1,<0.29.0 - numpy>=1.23,<3.0 -- nvidia-ml-py>=12 - pip - pkg-config - pre-commit diff --git a/conda/environments/all_cuda-129_arch-x86_64.yaml b/conda/environments/all_cuda-129_arch-x86_64.yaml index ab416cff..d2d08b7b 100644 --- a/conda/environments/all_cuda-129_arch-x86_64.yaml +++ b/conda/environments/all_cuda-129_arch-x86_64.yaml @@ -10,6 +10,8 @@ dependencies: - c-compiler - cloudpickle - cmake>=3.26.4,!=3.30.0 +- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* +- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core - cuda-core>=0.3.2 - cuda-cudart-dev - cuda-nvcc @@ -28,7 +30,6 @@ dependencies: - ninja - numba-cuda>=0.22.1,<0.29.0 - numpy>=1.23,<3.0 -- nvidia-ml-py>=12 - pip - pkg-config - pre-commit diff --git a/conda/environments/all_cuda-131_arch-aarch64.yaml b/conda/environments/all_cuda-131_arch-aarch64.yaml index 83114c6b..f601417d 100644 --- a/conda/environments/all_cuda-131_arch-aarch64.yaml +++ b/conda/environments/all_cuda-131_arch-aarch64.yaml @@ -10,6 +10,8 @@ dependencies: - c-compiler - cloudpickle - cmake>=3.26.4,!=3.30.0 +- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* +- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core - cuda-core>=0.3.2 - cuda-cudart-dev - cuda-nvcc @@ -28,7 +30,6 @@ dependencies: - ninja - numba-cuda>=0.22.1,<0.29.0 - numpy>=1.23,<3.0 -- nvidia-ml-py>=12 - pip - pkg-config - pre-commit diff --git a/conda/environments/all_cuda-131_arch-x86_64.yaml b/conda/environments/all_cuda-131_arch-x86_64.yaml index b4484732..ab6b3808 100644 --- a/conda/environments/all_cuda-131_arch-x86_64.yaml +++ b/conda/environments/all_cuda-131_arch-x86_64.yaml @@ -10,6 +10,8 @@ dependencies: - c-compiler - cloudpickle - cmake>=3.26.4,!=3.30.0 +- cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* +- cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core - cuda-core>=0.3.2 - cuda-cudart-dev - cuda-nvcc @@ -28,7 +30,6 @@ dependencies: - ninja - numba-cuda>=0.22.1,<0.29.0 - numpy>=1.23,<3.0 -- nvidia-ml-py>=12 - pip - pkg-config - pre-commit diff --git a/conda/recipes/ucxx/recipe.yaml b/conda/recipes/ucxx/recipe.yaml index 02ddcce1..9e473762 100644 --- a/conda/recipes/ucxx/recipe.yaml +++ b/conda/recipes/ucxx/recipe.yaml @@ -87,8 +87,6 @@ outputs: host: - cuda-version =${{ cuda_version }} - cython >=3.2.2 - # 'nvidia-ml-py' provides the 'pynvml' module - - nvidia-ml-py>=12 - pip - python =${{ py_abi_min }} - python-abi3 ${{ py_abi_min }}.* @@ -98,16 +96,19 @@ outputs: - ucx - libucxx =${{ version }} - cuda-cudart-dev + - cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* + # TODO: Change to cuda-core >= 1.0.0 once that's released + - cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core run: - - cuda-core >=0.3.2 - numpy >=1.23,<3.0 - # 'nvidia-ml-py' provides the 'pynvml' module - - nvidia-ml-py>=12 - python - ucx >=1.18.0,<1.21.0 - ${{ pin_compatible("cuda-version", upper_bound="x", lower_bound="x") }} - ${{ pin_compatible("rmm", upper_bound="x.x") }} - libucxx =${{ version }} + - cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* + # TODO: Change to cuda-core >= 1.0.0 once that's released + - cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core run_constraints: - cupy >=13.6.0 ignore_run_exports: diff --git a/dependencies.yaml b/dependencies.yaml index 5f1b91d7..fe0ee312 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -346,9 +346,9 @@ dependencies: - output_types: [conda, requirements, pyproject] packages: - &numpy numpy>=1.23,<3.0 - # 'nvidia-ml-py' provides the 'pynvml' module - - nvidia-ml-py>=12 - - cuda-core>=0.3.2 + - cuda-bindings>=12.9.6,!=13.0.*,!=13.1.* + # TODO: Change to cuda-core >= 1.0.0 once that's released + - cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core run_python_distributed_ucxx: common: - output_types: [conda, requirements, pyproject] diff --git a/python/ucxx/pyproject.toml b/python/ucxx/pyproject.toml index 15de3d59..6fe6993d 100644 --- a/python/ucxx/pyproject.toml +++ b/python/ucxx/pyproject.toml @@ -19,10 +19,10 @@ authors = [ license = "BSD-3-Clause" requires-python = ">=3.11" dependencies = [ - "cuda-core>=0.3.2", + "cuda-bindings>=12.9.6,!=13.0.*,!=13.1.*", + "cuda-core @ git+https://github.com/nvidia/cuda-python@main#subdirectory=cuda_core", "libucxx==0.50.*,>=0.0.0a0", "numpy>=1.23,<3.0", - "nvidia-ml-py>=12", "rmm==26.6.*,>=0.0.0a0", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ diff --git a/python/ucxx/ucxx/__init__.py b/python/ucxx/ucxx/__init__.py index a727cbe4..5e07990e 100644 --- a/python/ucxx/ucxx/__init__.py +++ b/python/ucxx/ucxx/__init__.py @@ -33,10 +33,7 @@ from .core import * # noqa: E402, F403 from .utils import get_address, get_ucxpy_logger # noqa: E402 -try: - import pynvml -except ImportError: - pynvml = None +from cuda.core import system _ucx_version = get_ucx_version() # noqa: F405 __ucx_min_version__ = "1.18.0" @@ -62,28 +59,17 @@ os.environ["UCX_RNDV_FRAG_MEM_TYPE"] = "cuda" if ( - pynvml is not None and "UCX_CUDA_COPY_MAX_REG_RATIO" not in os.environ and _ucx_version >= (1, 12, 0) ): try: - pynvml.nvmlInit() - device_count = pynvml.nvmlDeviceGetCount() + device_count = system.Device.get_device_count() large_bar1 = [False] * device_count - def _is_mig_device(handle): - try: - pynvml.nvmlDeviceGetMigMode(handle)[0] - except pynvml.NVMLError: - return False - return True - - for dev_idx in range(device_count): - handle = pynvml.nvmlDeviceGetHandleByIndex(dev_idx) - + for dev_idx, device in enumerate(system.Device.get_all_devices()): try: - total_memory = pynvml.nvmlDeviceGetMemoryInfo(handle).total - except pynvml.NVMLError_NotSupported: + total_memory = device.memory_info.total + except system.NotSupportedError: total_memory = None # Ignore MIG devices and devices with no memory resource (i.e., only @@ -91,12 +77,12 @@ def _is_mig_device(handle): # now. Increasing `UCX_CUDA_COPY_MAX_REG_RATIO` should be thoroughly # tested, as it's not yet clear whether it would be safe to set `1.0` # for those instances too. - if _is_mig_device(handle) or total_memory is None: + if device.mig.is_mig_device or total_memory is None: continue try: - bar1_total = pynvml.nvmlDeviceGetBAR1MemoryInfo(handle).bar1Total - except pynvml.NVMLError_NotSupported: + bar1_total = device.bar1_memory_info.total + except system.NotSupportedError: # Bar1 access not supported on this device, set it to # zero (always lower than device memory). bar1_total = 0 @@ -108,9 +94,9 @@ def _is_mig_device(handle): logger.info("Setting UCX_CUDA_COPY_MAX_REG_RATIO=1.0") os.environ["UCX_CUDA_COPY_MAX_REG_RATIO"] = "1.0" except ( - pynvml.NVMLError_LibraryNotFound, - pynvml.NVMLError_DriverNotLoaded, - pynvml.NVMLError_Unknown, + system.NotFoundError, + system.DriverNotLoadedError, + system.UnknownError, ): pass diff --git a/python/ucxx/ucxx/_lib_async/utils_test.py b/python/ucxx/ucxx/_lib_async/utils_test.py index e6b6359e..da959961 100644 --- a/python/ucxx/ucxx/_lib_async/utils_test.py +++ b/python/ucxx/ucxx/_lib_async/utils_test.py @@ -14,6 +14,9 @@ from ucxx._lib_async.pytest_stash_keys import ASYNCIO_PLUGIN_TIMEOUT_STASH_KEY +from cuda.core import system + + normal_env = { "UCX_RNDV_SCHEME": "put_zcopy", "UCX_MEMTYPE_CACHE": "n", @@ -27,12 +30,7 @@ def set_env(): def get_num_gpus(): - import pynvml - - pynvml.nvmlInit() - ngpus = pynvml.nvmlDeviceGetCount() - pynvml.nvmlShutdown() - return ngpus + return system.Device.get_device_count() def get_cuda_devices():