From 98ce4a217a4ac3a50b62839832d499d405497562 Mon Sep 17 00:00:00 2001 From: Devasena Inupakutika Date: Sat, 9 May 2026 04:38:10 +0000 Subject: [PATCH] Fix #363: pass results_dir to collect_cluster_info Signed-off-by: Devasena Inupakutika --- mlpstorage_py/benchmarks/base.py | 11 ++- mlpstorage_py/tests/test_benchmarks.py | 128 ++++++++++++++++++++++++- 2 files changed, 135 insertions(+), 4 deletions(-) diff --git a/mlpstorage_py/benchmarks/base.py b/mlpstorage_py/benchmarks/base.py index cd820e90..ea9ce231 100755 --- a/mlpstorage_py/benchmarks/base.py +++ b/mlpstorage_py/benchmarks/base.py @@ -442,15 +442,22 @@ def _collect_cluster_information(self) -> 'ClusterInformation': mpi_bin = getattr(self.args, 'mpi_bin', 'mpirun') allow_run_as_root = getattr(self.args, 'allow_run_as_root', False) timeout = getattr(self.args, 'cluster_collection_timeout', 60) + ssh_username = getattr(self.args, 'ssh_username', None) + shared_staging_dir = getattr(self.args, 'shared_staging_dir', None) - # Collect cluster info + # Collect cluster info. ``results_dir`` is required by + # ``collect_cluster_info`` for staging the helper script under + # ``/collector-staging/`` (see issue #363). collected_data = collect_cluster_info( hosts=self.args.hosts, mpi_bin=mpi_bin, logger=self.logger, + results_dir=self.run_result_output, allow_run_as_root=allow_run_as_root, timeout_seconds=timeout, - fallback_to_local=True + fallback_to_local=True, + shared_staging_dir=shared_staging_dir, + ssh_username=ssh_username, ) # Create ClusterInformation from collected data diff --git a/mlpstorage_py/tests/test_benchmarks.py b/mlpstorage_py/tests/test_benchmarks.py index e11e92ce..92a7beb3 100755 --- a/mlpstorage_py/tests/test_benchmarks.py +++ b/mlpstorage_py/tests/test_benchmarks.py @@ -218,20 +218,30 @@ def _run(self): benchmark = TestBenchmark.__new__(TestBenchmark) benchmark.args = base_args benchmark.logger = mock_logger + # ``run_result_output`` is normally set in ``Benchmark.__init__`` + # via ``generate_output_location()``. We patched ``__init__`` + # away, so set it explicitly so the call site has a results dir + # to forward to ``collect_cluster_info`` (issue #363). + benchmark.run_result_output = '/tmp/results/run-001' with patch('mlpstorage_py.benchmarks.base.collect_cluster_info') as mock_collect: mock_collect.return_value = mock_collected_data result = benchmark._collect_cluster_information() - # Verify collect_cluster_info was called with correct args + # Verify collect_cluster_info was called with correct args. + # ``results_dir`` is REQUIRED by collect_cluster_info; missing + # it was the root cause of issue #363. mock_collect.assert_called_once_with( hosts=['host1', 'host2'], mpi_bin='mpirun', logger=mock_logger, + results_dir='/tmp/results/run-001', allow_run_as_root=False, timeout_seconds=60, - fallback_to_local=True + fallback_to_local=True, + shared_staging_dir=None, + ssh_username=None, ) # Verify result is a ClusterInformation instance @@ -260,6 +270,120 @@ def _run(self): assert result is None +# ============================================================================= +# Regression tests for issue #363 +# ============================================================================= +# The original bug was that ``Benchmark._collect_cluster_information`` called +# ``collect_cluster_info`` without the required ``results_dir`` argument. Every +# pre-existing test patched ``collect_cluster_info`` away, so the missing-arg +# ``TypeError`` never surfaced. The tests below validate the call against the +# *real* function signature so future signature drift is caught at unit-test +# time. + +class TestCollectClusterInfoSignatureBinding: + """Issue #363: guard ``_collect_cluster_information`` against signature drift.""" + + def test_call_binds_to_real_collect_cluster_info_signature( + self, base_args, mock_logger + ): + """The kwargs passed by ``_collect_cluster_information`` must bind to + the real ``collect_cluster_info`` signature without raising + ``TypeError`` for missing required arguments. + + This is what would have caught issue #363 before merge. + """ + import inspect + from mlpstorage_py.benchmarks.base import Benchmark + from mlpstorage_py.cluster_collector import collect_cluster_info + + class TestBenchmark(Benchmark): + BENCHMARK_TYPE = BENCHMARK_TYPES.training + def _run(self): + pass + + sig = inspect.signature(collect_cluster_info) + captured_kwargs = {} + + def capture(*args, **kwargs): + # Reject positional shadowing — the call site is keyword-only. + assert not args, "call site should use keyword arguments only" + captured_kwargs.update(kwargs) + # Validate against the REAL signature; this raises TypeError if + # any required parameter (e.g., ``results_dir``) is missing. + sig.bind(**kwargs) + return { + 'host1': {'hostname': 'host1', 'meminfo': {'MemTotal': 16384000}}, + '_metadata': { + 'collection_method': 'mpi', + 'collection_timestamp': '2024-01-01T00:00:00Z', + }, + } + + with patch.object(TestBenchmark, '__init__', lambda x, *a, **kw: None): + benchmark = TestBenchmark.__new__(TestBenchmark) + benchmark.args = base_args + benchmark.logger = mock_logger + benchmark.run_result_output = '/tmp/results/run-001' + + with patch( + 'mlpstorage_py.benchmarks.base.collect_cluster_info', + side_effect=capture, + ): + benchmark._collect_cluster_information() + + # ``results_dir`` is the parameter that was missing in issue #363. + assert 'results_dir' in captured_kwargs + assert captured_kwargs['results_dir'] == '/tmp/results/run-001' + + def test_warning_message_from_issue_363_is_not_emitted( + self, base_args, mock_logger + ): + """The exact warning ``MPI cluster info collection failed: + collect_cluster_info() missing 1 required positional argument: + 'results_dir'`` must NOT appear after the fix. + """ + from mlpstorage_py.benchmarks.base import Benchmark + + class TestBenchmark(Benchmark): + BENCHMARK_TYPE = BENCHMARK_TYPES.training + def _run(self): + pass + + warnings_seen = [] + + class CapturingLogger(MockLogger): + def warning(self, msg): + warnings_seen.append(msg) + + with patch.object(TestBenchmark, '__init__', lambda x, *a, **kw: None): + benchmark = TestBenchmark.__new__(TestBenchmark) + benchmark.args = base_args + benchmark.logger = CapturingLogger() + benchmark.run_result_output = '/tmp/results/run-001' + + # Use the REAL ``collect_cluster_info`` but stub out the heavy + # ``MPIClusterCollector`` so we don't need an actual cluster. + with patch( + 'mlpstorage_py.cluster_collector.MPIClusterCollector' + ) as mock_collector_cls: + mock_instance = MagicMock() + mock_instance.collect.return_value = { + 'host1': {'hostname': 'host1', 'meminfo': {'MemTotal': 16384000}}, + } + mock_collector_cls.return_value = mock_instance + + benchmark._collect_cluster_information() + + offending = [ + w for w in warnings_seen + if 'missing 1 required positional argument' in w + and 'results_dir' in w + ] + assert offending == [], ( + f"Issue #363 warning regressed: {offending}" + ) + + # ============================================================================= # Tests for DLIOBenchmark.accumulate_host_info # =============================================================================