TL/NCCL: add user buffer registration via memmap #1260
TL/NCCL: add user buffer registration via memmap #1260 — wfaderhold21 wants to merge 4 commits into openucx:master from
Conversation
Greptile Overview
|
| Filename | Overview |
|---|---|
| src/components/tl/nccl/tl_nccl.h | Adds data structures and configuration for NCCL User Buffer Registration support. Clean header additions with proper version checks. |
| src/components/tl/nccl/tl_nccl.c | Adds configuration option ENABLE_UBR for NCCL User Buffer Registration. Minimal, straightforward changes. |
| src/components/tl/nccl/tl_nccl_context.c | Implements memory map/unmap/pack operations for NCCL UBR. Includes proper initialization checks and deregistration cleanup. IMPORT mode handling needs clarification. |
| src/components/tl/nccl/tl_nccl_coll.c | Implements lazy buffer registration for collectives with proper overflow checks, bounds validation, and variable-size collective handling. Addresses previous feedback. |
Last reviewed commit: d9b4756
| if (!new_comms) { | ||
| tl_error( | ||
| UCC_TL_TEAM_LIB(team), | ||
| "failed to allocate memory for registered comms array"); | ||
| /* Buffer is registered but we can't track it - this is a problem */ | ||
| return UCC_ERR_NO_MEMORY; | ||
| } | ||
| m_data->registered_comms = new_comms; | ||
|
|
||
| new_handles = (void **)ucc_realloc( | ||
| m_data->nccl_handles, new_max * sizeof(void *), "nccl_handles"); | ||
| if (!new_handles) { | ||
| tl_error( | ||
| UCC_TL_TEAM_LIB(team), | ||
| "failed to allocate memory for NCCL handles array"); | ||
| return UCC_ERR_NO_MEMORY; |
There was a problem hiding this comment.
Resource leak: if the ucc_realloc for new_comms succeeds but the second ucc_realloc for new_handles fails (lines 240-244), new_comms has already been assigned to m_data->registered_comms (line 236), yet the function returns UCC_ERR_NO_MEMORY without deregistering the NCCL buffer. This leaks the NCCL registration created at line 208.
| if (!new_comms) { | |
| tl_error( | |
| UCC_TL_TEAM_LIB(team), | |
| "failed to allocate memory for registered comms array"); | |
| /* Buffer is registered but we can't track it - this is a problem */ | |
| return UCC_ERR_NO_MEMORY; | |
| } | |
| m_data->registered_comms = new_comms; | |
| new_handles = (void **)ucc_realloc( | |
| m_data->nccl_handles, new_max * sizeof(void *), "nccl_handles"); | |
| if (!new_handles) { | |
| tl_error( | |
| UCC_TL_TEAM_LIB(team), | |
| "failed to allocate memory for NCCL handles array"); | |
| return UCC_ERR_NO_MEMORY; | |
| if (!new_handles) { | |
| /* Failed to grow handles array - must deregister to avoid leak */ | |
| ncclCommDeregister(team->nccl_comm, nccl_handle); | |
| tl_error( | |
| UCC_TL_TEAM_LIB(team), | |
| "failed to allocate memory for NCCL handles array"); | |
| return UCC_ERR_NO_MEMORY; | |
| } |
| status = ucc_tl_nccl_lazy_register_memh( | ||
| coll_args->args.src.info.buffer, | ||
| coll_args->args.src.info.count * | ||
| ucc_dt_size(coll_args->args.src.info.datatype), |
There was a problem hiding this comment.
The buffer size calculation is incorrect for variable-size collectives (ALLTOALLV, ALLGATHERV, GATHERV, SCATTERV). For these collectives, coll_args->args.src uses the info_v union member with a counts* array, not the scalar info.count. Accessing .info.count reads the counts pointer as an integer, which yields an incorrect buffer size. The code needs to check coll_args->args.coll_type and use the appropriate buffer size calculation for each collective type.
| status = ucc_tl_nccl_lazy_register_memh( | |
| coll_args->args.src.info.buffer, | |
| coll_args->args.src.info.count * | |
| ucc_dt_size(coll_args->args.src.info.datatype), | |
| // TODO: Need to calculate correct buffer size based on coll_type | |
| // For variable-size collectives (ALLTOALLV, ALLGATHERV, etc), | |
| // must sum counts array instead of using single count value | |
| status = ucc_tl_nccl_lazy_register_memh( | |
| coll_args->args.src.info.buffer, | |
| coll_args->args.src.info.count * | |
| ucc_dt_size(coll_args->args.src.info.datatype), |
| status = ucc_tl_nccl_lazy_register_memh( | ||
| coll_args->args.dst.info.buffer, | ||
| coll_args->args.dst.info.count * | ||
| ucc_dt_size(coll_args->args.dst.info.datatype), |
There was a problem hiding this comment.
The same buffer size calculation bug applies to the dst buffer: it is incorrect for variable-size collectives, which use the info_v.counts* array instead of the scalar info.count.
| buf_start = (uintptr_t)buffer; | ||
| buf_end = buf_start + length; |
There was a problem hiding this comment.
Check for potential integer overflow when buf_start + length exceeds UINTPTR_MAX. The code should validate that length <= UINTPTR_MAX - buf_start before the addition to prevent wraparound.
| buf_start = (uintptr_t)buffer; | |
| buf_end = buf_start + length; | |
| buf_start = (uintptr_t)buffer; | |
| if (length > UINTPTR_MAX - buf_start) { | |
| tl_error(UCC_TL_TEAM_LIB(team), "NCCL UBR: buffer size causes overflow"); | |
| return UCC_ERR_INVALID_PARAM; | |
| } | |
| buf_end = buf_start + length; |
|
|
||
| /* If UBR is not available/disabled or no TL data, return empty pack */ | ||
| if (!ctx->ubr_available || !tl_h || !tl_h->tl_data) { | ||
| tl_h->packed_size = 0; | ||
| *pack_buffer = NULL; | ||
| return UCC_OK; |
There was a problem hiding this comment.
Null deref on pack
In ucc_tl_nccl_memh_pack, the early-return condition includes !tl_h, but the block it guards still executes tl_h->packed_size = 0. If tl_h == NULL, this dereferences a null pointer and crashes. This can trigger whenever the iface is invoked with a NULL tl_h (a case the condition explicitly anticipates), so the guard must avoid dereferencing tl_h on that path.
| ucc_status_t ucc_tl_nccl_mem_map( | ||
| const ucc_base_context_t *context, ucc_mem_map_mode_t mode, | ||
| ucc_mem_map_memh_t *memh, ucc_mem_map_tl_t *tl_h) | ||
| { | ||
| return UCC_ERR_NOT_SUPPORTED; | ||
| ucc_tl_nccl_context_t *ctx = ucc_derived_of(context, ucc_tl_nccl_context_t); | ||
| ucc_tl_nccl_memh_data_t *m_data; | ||
|
|
||
| /* Check if UBR is available and enabled */ | ||
| if (!ctx->ubr_available) { | ||
| tl_debug( | ||
| ctx->super.super.lib, "NCCL UBR not available, skipping mem_map"); | ||
| return UCC_ERR_NOT_SUPPORTED; | ||
| } | ||
|
|
||
| /* Support both EXPORT and IMPORT modes for global memh */ | ||
| if (mode != UCC_MEM_MAP_MODE_EXPORT && mode != UCC_MEM_MAP_MODE_IMPORT) { | ||
| tl_debug(ctx->super.super.lib, | ||
| "NCCL UBR: unsupported mode %d", mode); | ||
| return UCC_ERR_NOT_SUPPORTED; | ||
| } | ||
|
|
||
| /* Reject zero-length buffers */ | ||
| if (memh->len == 0) { | ||
| tl_debug(ctx->super.super.lib, | ||
| "NCCL UBR: zero-length buffer, skipping mem_map"); | ||
| return UCC_ERR_NOT_SUPPORTED; | ||
| } | ||
|
|
||
| /* Allocate TL-specific memory handle data */ | ||
| m_data = (ucc_tl_nccl_memh_data_t *)ucc_calloc( | ||
| 1, sizeof(ucc_tl_nccl_memh_data_t), "tl_nccl_memh_data"); | ||
| if (!m_data) { | ||
| tl_error( | ||
| ctx->super.super.lib, "failed to allocate TL memory handle data"); | ||
| return UCC_ERR_NO_MEMORY; | ||
| } | ||
|
|
||
| /* Store buffer information - registration will happen lazily on first use */ | ||
| m_data->address = memh->address; | ||
| m_data->length = memh->len; | ||
| m_data->registered_comms = NULL; | ||
| m_data->nccl_handles = NULL; | ||
| m_data->num_comms = 0; | ||
| m_data->max_comms = 0; | ||
|
|
||
| /* Set TL handle data */ | ||
| tl_h->tl_data = m_data; | ||
| strncpy(tl_h->tl_name, "nccl", UCC_MEM_MAP_TL_NAME_LEN - 1); | ||
| tl_h->tl_name[UCC_MEM_MAP_TL_NAME_LEN - 1] = '\0'; | ||
|
|
There was a problem hiding this comment.
IMPORT memh uses garbage
ucc_tl_nccl_mem_map claims to support UCC_MEM_MAP_MODE_IMPORT, but it ignores the packed payload in memh->pack_buffer and instead stores memh->address/memh->len into m_data. In the core import path, those fields are explicitly treated as “likely garbage” for imported handles (src/core/ucc_context.c:1212-1214), so the imported NCCL TL handle will contain invalid address/length and later lazy registration may attempt ncclCommRegister on bogus pointers.
| /* Verify that the entire buffer is within the registered memory region */ | ||
| buf_start = (uintptr_t)buffer; | ||
| buf_end = buf_start + length; | ||
| region_start = (uintptr_t)m_data->address; | ||
| region_end = region_start + m_data->length; | ||
|
|
||
| if (length > (UINTPTR_MAX - buf_start)) { | ||
| tl_error(UCC_TL_TEAM_LIB(team), "NCCL UBR: buffer size causes overflow"); | ||
| return UCC_ERR_INVALID_PARAM; | ||
| } |
There was a problem hiding this comment.
Overflow check too late
In ucc_tl_nccl_lazy_register_memh, buf_end is computed as buf_start + length before checking length <= UINTPTR_MAX - buf_start. If the addition overflows, buf_end has already wrapped around (uintptr_t is unsigned, so the result wraps modulo 2^N rather than being undefined), and the subsequent bounds check can misbehave. The overflow guard needs to run before computing buf_end.
| /* For NCCL UBR, we only store metadata (address/length) for lazy registration. | ||
| * When ncclCommRegister is called later, it stores this metadata locally. | ||
| * The NCCL communicator handles IPC handle exchange internally during collective | ||
| * operations (via point-to-point proxy calls), so we don't need special IMPORT | ||
| * handling. We can use memh->address/memh->len directly in both EXPORT and IMPORT | ||
| * modes - the address should be valid in the current process context. */ | ||
| m_data->address = memh->address; | ||
| m_data->length = memh->len; |
There was a problem hiding this comment.
For IMPORT mode, verify whether using memh->address/memh->len directly is correct. The core code treats these as "likely garbage" for imported handles (src/core/ucc_context.c:1212-1214), suggesting TLs should extract data from pack_buffer. If NCCL's IPC model truly makes local addresses valid for IMPORT mode (different from typical TL behavior), add documentation explaining this design choice. Otherwise, implement extraction from pack_buffer similar to the core's approach in ucc_context.c:1224-1238.

What
Add support for NCCL user buffer registration via the UCC mem map interface.