_________________________ test_dask_ucxx_cluster_sync __________________________
Traceback (most recent call last):
File "/usr/local/lib/python3.12/dist-packages/_pytest/runner.py", line 344, in from_call
result: TResult | None = func()
^^^^^^
File "/usr/local/lib/python3.12/dist-packages/_pytest/runner.py", line 246, in <lambda>
lambda: runtest_hook(item=item, **kwds), when=when, reraise=reraise
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_hooks.py", line 512, in __call__
return self._hookexec(self.name, self._hookimpls.copy(), kwargs, firstresult)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_manager.py", line 120, in _hookexec
return self._inner_hookexec(hook_name, methods, kwargs, firstresult)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 167, in _multicall
raise exception
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 139, in _multicall
teardown.throw(exception)
File "/usr/local/lib/python3.12/dist-packages/_pytest/logging.py", line 850, in pytest_runtest_call
yield
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 139, in _multicall
teardown.throw(exception)
File "/usr/local/lib/python3.12/dist-packages/_pytest/capture.py", line 900, in pytest_runtest_call
return (yield)
^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 139, in _multicall
teardown.throw(exception)
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 53, in run_old_style_hookwrapper
return result.get_result()
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_result.py", line 103, in get_result
raise exc.with_traceback(tb)
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 38, in run_old_style_hookwrapper
res = yield
^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 139, in _multicall
teardown.throw(exception)
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 53, in run_old_style_hookwrapper
return result.get_result()
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_result.py", line 103, in get_result
raise exc.with_traceback(tb)
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 38, in run_old_style_hookwrapper
res = yield
^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 139, in _multicall
teardown.throw(exception)
File "/usr/local/lib/python3.12/dist-packages/_pytest/skipping.py", line 263, in pytest_runtest_call
return (yield)
^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 121, in _multicall
res = hook_impl.function(*args)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/_pytest/runner.py", line 178, in pytest_runtest_call
item.runtest()
File "/usr/local/lib/python3.12/dist-packages/_pytest/python.py", line 1671, in runtest
self.ihook.pytest_pyfunc_call(pyfuncitem=self)
File "/usr/local/lib/python3.12/dist-packages/pluggy/_hooks.py", line 512, in __call__
return self._hookexec(self.name, self._hookimpls.copy(), kwargs, firstresult)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_manager.py", line 120, in _hookexec
return self._inner_hookexec(hook_name, methods, kwargs, firstresult)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 167, in _multicall
raise exception
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 139, in _multicall
teardown.throw(exception)
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 53, in run_old_style_hookwrapper
return result.get_result()
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_result.py", line 103, in get_result
raise exc.with_traceback(tb)
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 38, in run_old_style_hookwrapper
res = yield
^^^^^
File "/usr/local/lib/python3.12/dist-packages/pluggy/_callers.py", line 121, in _multicall
res = hook_impl.function(*args)
^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/_pytest/python.py", line 157, in pytest_pyfunc_call
result = testfunction(**testargs)
^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 81, in inner
return func(*args, **kwds)
^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/distributed/utils_test.py", line 748, in test_func
return _run_and_close_tornado(async_fn_outer, func, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/distributed/utils_test.py", line 380, in _run_and_close_tornado
return asyncio_run(inner_fn(), loop_factory=get_loop_factory())
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 194, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/base_events.py", line 687, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/distributed/utils_test.py", line 377, in inner_fn
return await async_fn(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/distributed/utils_test.py", line 738, in async_fn_outer
return await utils_wait_for(async_fn(*args, **kwargs), timeout)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/distributed/utils.py", line 1928, in wait_for
return await fut
^^^^^^^^^
File "/root/rapidsmpf/python/rapidsmpf/rapidsmpf/tests/test_dask.py", line 66, in test_dask_ucxx_cluster_sync
LocalCUDACluster(scheduler_port=0, device_memory_limit=1) as cluster,
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/dask_cuda/local_cuda_cluster.py", line 419, in __init__
self.scale(n_workers)
File "/usr/local/lib/python3.12/dist-packages/distributed/deploy/spec.py", line 534, in scale
self.worker_spec.update(self.new_worker_spec())
^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/dask_cuda/local_cuda_cluster.py", line 439, in new_worker_spec
**({"data": self.data(device_index)} if hasattr(self, "data") else {}),
^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/local/lib/python3.12/dist-packages/dask_cuda/worker_common.py", line 69, in data
raise ValueError(
ValueError: device_memory_limit is set but device has no dedicated memory.
------------------------------ Captured log call -------------------------------
Description
Testing rapidsmpf 26.02 on a DGX Spark (GB10), I saw 1 failure in the Python unit tests:
full stack trace (click me)
Reproducible Example
Ran the unit tests like this:
~/rapidsmpf/ci/run_pytests.sh
Notes
That error message was introduced here: rapidsai/dask-cuda#1505
device_memory_limit is still being set at test time on main: rapidsmpf/python/rapidsmpf/rapidsmpf/tests/test_dask.py
Lines 68 to 72 in 24e2e52
So even though I observed this on 26.02, I expect it should also be reproducible on
main, but I haven't been able to get DGX Spark access again to confirm.