-
Notifications
You must be signed in to change notification settings - Fork 27
fix: fail fast on NIXL region layout mismatch in recv transfer #246
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
the-david-oy
wants to merge
1
commit into
ai-dynamo:main
Choose a base branch
from
the-david-oy:fix/region-transfer-size-mismatch
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,121 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
|
|
||
| """Unit tests for nixl_transfer region-layout validation. | ||
|
|
||
| These tests exercise only pure-Python logic and do NOT require NIXL, CUDA, | ||
| or a GPU, so they can run in CI on any worker. | ||
| """ | ||
|
|
||
| from modelexpress.nixl_transfer import ( | ||
| NixlTransferManager, | ||
| RegionLayoutMismatchError, | ||
| ) | ||
| from modelexpress.types import TensorDescriptor | ||
|
|
||
|
|
||
| def _region_desc(i: int, addr: int, size: int) -> TensorDescriptor: | ||
| """Build a region-style TensorDescriptor as emitted by the source side.""" | ||
| return TensorDescriptor( | ||
| name=f"__region_{i}__", | ||
| addr=addr, | ||
| size=size, | ||
| device_id=0, | ||
| dtype="contiguous_region", | ||
| ) | ||
|
|
||
|
|
||
| class TestValidateRegionLayoutMatch: | ||
| """Tests for NixlTransferManager._validate_region_layout_match. | ||
|
|
||
| This guards against the bug where mismatched region layouts between | ||
| source and recv produced `(src, local)` pairs of unequal length that | ||
| NIXL rejected with NIXL_ERR_INVALID_PARAM after having silently logged | ||
| per-region WARNINGs. | ||
| """ | ||
|
|
||
| def test_identical_layouts_match(self): | ||
| """Equal count and equal sizes (addresses may differ) -> match.""" | ||
| source = [ | ||
| _region_desc(0, 0x10000, 1024), | ||
| _region_desc(1, 0x20000, 2048), | ||
| _region_desc(2, 0x30000, 512), | ||
| ] | ||
| # Local addresses intentionally different — they're VAs in another process. | ||
| local = [(0xAA000, 1024), (0xBB000, 2048), (0xCC000, 512)] | ||
|
|
||
| ok, msg = NixlTransferManager._validate_region_layout_match(source, local) | ||
| assert ok is True | ||
| assert msg == "" | ||
|
|
||
| def test_count_mismatch_does_not_match(self): | ||
| """Different region counts must be rejected.""" | ||
| source = [_region_desc(i, 0x10000 * (i + 1), 1024) for i in range(3)] | ||
| local = [(0xA000, 1024), (0xB000, 1024)] | ||
|
|
||
| ok, msg = NixlTransferManager._validate_region_layout_match(source, local) | ||
| assert ok is False | ||
| assert "region count mismatch" in msg | ||
| assert "3" in msg and "2" in msg | ||
|
|
||
| def test_size_mismatch_does_not_match(self): | ||
| """Same count but one region differs in size -> rejected.""" | ||
| source = [ | ||
| _region_desc(0, 0x10000, 1024), | ||
| _region_desc(1, 0x20000, 2048), | ||
| _region_desc(2, 0x30000, 512), | ||
| ] | ||
| local = [(0xA000, 1024), (0xB000, 9999), (0xC000, 512)] # index 1 diverges | ||
|
|
||
| ok, msg = NixlTransferManager._validate_region_layout_match(source, local) | ||
| assert ok is False | ||
| assert "size mismatch" in msg | ||
| assert "region 1" in msg | ||
| assert "2048" in msg | ||
| assert "9999" in msg | ||
|
|
||
| def test_size_mismatch_summary_caps_output(self): | ||
| """Many mismatches produce a bounded summary, not spam per region.""" | ||
| # 20 regions, all size-mismatched | ||
| source = [_region_desc(i, 0x1000 * (i + 1), 1024) for i in range(20)] | ||
| local = [(0x10000 + 0x1000 * i, 4096) for i in range(20)] | ||
|
|
||
| ok, msg = NixlTransferManager._validate_region_layout_match(source, local) | ||
| assert ok is False | ||
| # Summary should name the total but not list every one of the 20. | ||
| assert "20 region size mismatch" in msg | ||
| # Expect a "+N more" suffix indicating truncation. | ||
| assert "more" in msg | ||
|
|
||
| def test_empty_layouts_match_trivially(self): | ||
| """Two empty layouts are (vacuously) equal.""" | ||
| ok, msg = NixlTransferManager._validate_region_layout_match([], []) | ||
| assert ok is True | ||
| assert msg == "" | ||
|
|
||
| def test_regression_logged_failure_from_llama31_8b(self): | ||
| """Regression test matching the exact scenario from the 2026-04-20 log. | ||
|
|
||
| Source produced 223 regions, recv produced 219 regions, with specific | ||
| size mismatches at indices 84, 85, 88-92, 216-218. Under the old | ||
| code this proceeded anyway and NIXL rejected the transfer. Under | ||
| the fix it must raise RegionLayoutMismatchError. | ||
| """ | ||
| # Build two layouts with different counts (223 vs 219) | ||
| source = [_region_desc(i, 0x10000 * (i + 1), 33554432) for i in range(223)] | ||
| local = [(0xA0000 + i * 0x1000, 33554432) for i in range(219)] | ||
|
|
||
| ok, msg = NixlTransferManager._validate_region_layout_match(source, local) | ||
| assert ok is False | ||
| assert "region count mismatch" in msg | ||
| assert "223" in msg | ||
| assert "219" in msg | ||
|
|
||
|
|
||
| class TestRegionLayoutMismatchError: | ||
| """The exception used to signal layout disagreement to callers.""" | ||
|
|
||
| def test_is_exception(self): | ||
| err = RegionLayoutMismatchError("layouts differ") | ||
| assert isinstance(err, Exception) | ||
| assert "layouts differ" in str(err) |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Rename single-letter
l— Ruff E741.Static analysis flags
las ambiguous (easily confused with1/I). Rename to something likelocorlocal_szin the unpacking and f-string.🔧 Proposed fix
🧰 Tools
🪛 Ruff (0.15.10)
[error] 334-334: Ambiguous variable name:
l(E741)
🤖 Prompt for AI Agents