diff --git a/modelexpress_client/python/modelexpress/load_strategy/rdma_strategy.py b/modelexpress_client/python/modelexpress/load_strategy/rdma_strategy.py
index 8d475155..1ff4ef6c 100644
--- a/modelexpress_client/python/modelexpress/load_strategy/rdma_strategy.py
+++ b/modelexpress_client/python/modelexpress/load_strategy/rdma_strategy.py
@@ -15,7 +15,7 @@
 import torch.nn as nn
 
 from .base import LoadContext, LoadStrategy, SourceTransferError, register_tensors, publish_metadata
-from ..nixl_transfer import is_nixl_available
+from ..nixl_transfer import RegionLayoutMismatchError, is_nixl_available
 from ..tensor_utils import capture_tensor_attrs
 from ..transfer_safety import check_transfer_allowed
 from ..types import ManifestMismatchError, TensorDescriptor
@@ -270,6 +270,14 @@ def _receive_from_peer(
                 coalesce_transfers=coalesce,
                 remote_agent_name=remote_agent_name_override,
             )
+        except RegionLayoutMismatchError as e:
+            # Contiguous-region layouts differ between source and this worker
+            # (PyTorch CUDA allocator non-determinism). The source itself is
+            # healthy; raise ManifestMismatchError so the caller just tries
+            # the next candidate without marking the source STALE.
+            raise ManifestMismatchError(
+                f"region layout mismatch with source: {e}"
+            ) from e
         except Exception as e:
             raise SourceTransferError(f"RDMA receive failed: {e}") from e
         transfer_time = time.perf_counter() - transfer_start
diff --git a/modelexpress_client/python/modelexpress/nixl_transfer.py b/modelexpress_client/python/modelexpress/nixl_transfer.py
index f1245fc1..00f1a838 100644
--- a/modelexpress_client/python/modelexpress/nixl_transfer.py
+++ b/modelexpress_client/python/modelexpress/nixl_transfer.py
@@ -39,6 +39,27 @@ def is_nixl_available() -> bool:
     return NIXL_AVAILABLE
 
 
+class RegionLayoutMismatchError(Exception):
+    """Source and local contiguous-region layouts disagree.
+
+    The recv side registered a different set of contiguous memory regions
+    than the source (different region count, or same count but different
+    region sizes). This happens because PyTorch's CUDA caching allocator is
+    not deterministic across processes, so two workers loading the same
+    model weights can produce different region groupings.
+
+    The transfer cannot proceed in region mode because NIXL requires that
+    the N-th local region exactly matches the N-th remote region in size.
+    Callers should try the next source candidate (which may share our
+    layout) or fall through to a non-RDMA loading strategy.
+    """
+
+
+# Internal alias kept short for the raise site; re-exported via the public
+# name above for callers that want to catch it.
+_RegionLayoutMismatchError = RegionLayoutMismatchError
+
+
 class NixlTransferManager:
     """
     Manages a single NIXL agent and RDMA transfers for GPU tensors.
@@ -271,6 +292,61 @@ def _find_contiguous_regions(
 
         return regions
 
+    @staticmethod
+    def _validate_region_layout_match(
+        source_regions: list[TensorDescriptor],
+        local_regions: list[tuple[int, int]],
+    ) -> tuple[bool, str]:
+        """Check that two contiguous-region layouts agree on count and sizes.
+
+        Two layouts match iff they have the same number of regions and each
+        pair has the same size. (Addresses legitimately differ — they're
+        virtual addresses in different processes.)
+
+        Args:
+            source_regions: Region descriptors received from the source.
+            local_regions: ``(addr, size)`` tuples this worker registered.
+
+        Returns:
+            ``(True, "")`` when the layouts match; otherwise
+            ``(False, human_readable_summary)`` describing the first few
+            mismatches so callers can include it in an error message.
+        """
+        if len(source_regions) != len(local_regions):
+            return False, (
+                f"region count mismatch: source has {len(source_regions)}, "
+                f"local has {len(local_regions)} "
+                "(PyTorch CUDA allocator non-determinism produced different "
+                "contiguous-region groupings across processes)"
+            )
+
+        size_mismatches: list[tuple[int, int, int]] = []
+        for i, (src_region, (_local_addr, local_size)) in enumerate(
+            zip(source_regions, local_regions, strict=True)
+        ):
+            if src_region.size != local_size:
+                size_mismatches.append((i, src_region.size, local_size))
+
+        if size_mismatches:
+            # Log just the first few so errors stay readable.
+            head = size_mismatches[:5]
+            sample = ", ".join(
+                f"region {i}: source={s} local={l}" for i, s, l in head
+            )
+            suffix = (
+                f" (+{len(size_mismatches) - len(head)} more)"
+                if len(size_mismatches) > len(head)
+                else ""
+            )
+            return False, (
+                f"{len(size_mismatches)} region size mismatch(es) "
+                f"out of {len(source_regions)}: {sample}{suffix} "
+                "(PyTorch CUDA allocator non-determinism produced different "
+                "contiguous-region groupings across processes)"
+            )
+
+        return True, ""
+
     def fetch_remote_and_wait(
         self,
         remote_agent_name: str,
@@ -361,27 +437,38 @@ def receive_from_source(
 
             logger.info(f"Region-based transfer: {len(source_tensors)} source regions -> {len(self._registered_regions)} local regions")
 
-            # Validate region counts match
-            if len(source_tensors) != len(self._registered_regions):
-                logger.warning(
-                    f"Region count mismatch: source has {len(source_tensors)}, "
-                    f"local has {len(self._registered_regions)}. Proceeding with min."
-                )
+            # Validate the two region layouts are identical before matching.
+            #
+            # The contiguous-region layout is derived from each process's
+            # PyTorch CUDA allocator state, which is NOT deterministic across
+            # processes: fragmentation pattern, temporary buffers, and pinned-
+            # memory state vary between runs. Two workers that load the same
+            # model weights can therefore produce different region groupings.
+            #
+            # If source and local layouts disagree (region count differs, or
+            # any region size differs), naively pairing by index produces
+            # mismatched (src, local) pairs. NIXL will reject these with
+            # `makeXferReq: length mismatch ... NIXL_ERR_INVALID_PARAM` and
+            # abort the entire transfer. Previously we logged per-region
+            # WARNINGs but proceeded anyway; now we fail fast with a typed
+            # exception so the caller can try the next source candidate or
+            # fall through to a non-RDMA strategy.
+            layout_matches, mismatch_summary = self._validate_region_layout_match(
+                source_tensors, self._registered_regions,
+            )
+            if not layout_matches:
+                raise _RegionLayoutMismatchError(mismatch_summary)
 
-            # Build transfer lists by region index
+            # Build transfer lists by region index (layouts are validated equal)
             remote_descs = []
             local_descs = []  # Will be (addr, size, device_id) tuples
             total_bytes = 0
-            matched_count = min(len(source_tensors), len(self._registered_regions))
+            matched_count = len(source_tensors)
 
             for i in range(matched_count):
                 src_region = source_tensors[i]
                 local_addr, local_size = self._registered_regions[i]
 
-                # Verify sizes match (regions should be same size)
-                if src_region.size != local_size:
-                    logger.warning(f"Region {i} size mismatch: source={src_region.size}, local={local_size}")
-
                 remote_descs.append((src_region.addr, src_region.size, src_region.device_id))
                 local_descs.append((local_addr, local_size, self._device_id))
                 total_bytes += src_region.size
diff --git a/modelexpress_client/python/tests/test_nixl_transfer.py b/modelexpress_client/python/tests/test_nixl_transfer.py
new file mode 100644
index 00000000..2eb658fc
--- /dev/null
+++ b/modelexpress_client/python/tests/test_nixl_transfer.py
@@ -0,0 +1,121 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+"""Unit tests for nixl_transfer region-layout validation.
+
+These tests exercise only pure-Python logic and do NOT require NIXL, CUDA,
+or a GPU, so they can run in CI on any worker.
+"""
+
+from modelexpress.nixl_transfer import (
+    NixlTransferManager,
+    RegionLayoutMismatchError,
+)
+from modelexpress.types import TensorDescriptor
+
+
+def _region_desc(i: int, addr: int, size: int) -> TensorDescriptor:
+    """Build a region-style TensorDescriptor as emitted by the source side."""
+    return TensorDescriptor(
+        name=f"__region_{i}__",
+        addr=addr,
+        size=size,
+        device_id=0,
+        dtype="contiguous_region",
+    )
+
+
+class TestValidateRegionLayoutMatch:
+    """Tests for NixlTransferManager._validate_region_layout_match.
+
+    This guards against the bug where mismatched region layouts between
+    source and recv produced `(src, local)` pairs of unequal length that
+    NIXL rejected with NIXL_ERR_INVALID_PARAM after having silently logged
+    per-region WARNINGs.
+    """
+
+    def test_identical_layouts_match(self):
+        """Equal count and equal sizes (addresses may differ) -> match."""
+        source = [
+            _region_desc(0, 0x10000, 1024),
+            _region_desc(1, 0x20000, 2048),
+            _region_desc(2, 0x30000, 512),
+        ]
+        # Local addresses intentionally different — they're VAs in another process.
+        local = [(0xAA000, 1024), (0xBB000, 2048), (0xCC000, 512)]
+
+        ok, msg = NixlTransferManager._validate_region_layout_match(source, local)
+        assert ok is True
+        assert msg == ""
+
+    def test_count_mismatch_does_not_match(self):
+        """Different region counts must be rejected."""
+        source = [_region_desc(i, 0x10000 * (i + 1), 1024) for i in range(3)]
+        local = [(0xA000, 1024), (0xB000, 1024)]
+
+        ok, msg = NixlTransferManager._validate_region_layout_match(source, local)
+        assert ok is False
+        assert "region count mismatch" in msg
+        assert "3" in msg and "2" in msg
+
+    def test_size_mismatch_does_not_match(self):
+        """Same count but one region differs in size -> rejected."""
+        source = [
+            _region_desc(0, 0x10000, 1024),
+            _region_desc(1, 0x20000, 2048),
+            _region_desc(2, 0x30000, 512),
+        ]
+        local = [(0xA000, 1024), (0xB000, 9999), (0xC000, 512)]  # index 1 diverges
+
+        ok, msg = NixlTransferManager._validate_region_layout_match(source, local)
+        assert ok is False
+        assert "size mismatch" in msg
+        assert "region 1" in msg
+        assert "2048" in msg
+        assert "9999" in msg
+
+    def test_size_mismatch_summary_caps_output(self):
+        """Many mismatches produce a bounded summary, not spam per region."""
+        # 20 regions, all size-mismatched
+        source = [_region_desc(i, 0x1000 * (i + 1), 1024) for i in range(20)]
+        local = [(0x10000 + 0x1000 * i, 4096) for i in range(20)]
+
+        ok, msg = NixlTransferManager._validate_region_layout_match(source, local)
+        assert ok is False
+        # Summary should name the total but not list every one of the 20.
+        assert "20 region size mismatch" in msg
+        # Expect a "+N more" suffix indicating truncation.
+        assert "more" in msg
+
+    def test_empty_layouts_match_trivially(self):
+        """Two empty layouts are (vacuously) equal."""
+        ok, msg = NixlTransferManager._validate_region_layout_match([], [])
+        assert ok is True
+        assert msg == ""
+
+    def test_regression_logged_failure_from_llama31_8b(self):
+        """Regression test matching the exact scenario from the 2026-04-20 log.
+
+        Source produced 223 regions, recv produced 219 regions, with specific
+        size mismatches at indices 84, 85, 88-92, 216-218. Under the old
+        code this proceeded anyway and NIXL rejected the transfer. Under
+        the fix it must raise RegionLayoutMismatchError.
+        """
+        # Build two layouts with different counts (223 vs 219)
+        source = [_region_desc(i, 0x10000 * (i + 1), 33554432) for i in range(223)]
+        local = [(0xA0000 + i * 0x1000, 33554432) for i in range(219)]
+
+        ok, msg = NixlTransferManager._validate_region_layout_match(source, local)
+        assert ok is False
+        assert "region count mismatch" in msg
+        assert "223" in msg
+        assert "219" in msg
+
+
+class TestRegionLayoutMismatchError:
+    """The exception used to signal layout disagreement to callers."""
+
+    def test_is_exception(self):
+        err = RegionLayoutMismatchError("layouts differ")
+        assert isinstance(err, Exception)
+        assert "layouts differ" in str(err)