From 64965e894e2f9824e28e7e83db154ac15eb9268e Mon Sep 17 00:00:00 2001 From: Tim Besard Date: Fri, 10 Apr 2026 07:19:34 +0200 Subject: [PATCH] Store CLArray offset in bytes instead of elements The element-based offset was lossy when materializing reinterpret on views with non-aligned offsets (e.g., reinterpreting a view of Int32 as Int64). The byte offset would get truncated by integer division when converting to the new element count. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/array.jl | 28 ++++++++++++++++------------ src/gpuarrays.jl | 2 +- test/array.jl | 9 +++++++++ 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/src/array.jl b/src/array.jl index 8c16ba97..b424f4e2 100644 --- a/src/array.jl +++ b/src/array.jl @@ -39,7 +39,7 @@ mutable struct CLArray{T, N, M} <: AbstractGPUArray{T, N} data::DataRef{Managed{M}} maxsize::Int # maximum data size; excluding any selector bytes - offset::Int # offset of the data in memory, in number of elements + offset::Int # offset of the data in memory, in bytes dims::Dims{N} @@ -288,14 +288,14 @@ function Base.unsafe_convert(::Type{Ptr{T}}, x::CLArray{T}) where {T} if !host_accessible(x) throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) end - return convert(Ptr{T}, x.data[]) + x.offset * Base.elsize(x) + return convert(Ptr{T}, x.data[]) + x.offset end function Base.unsafe_convert(::Type{CLPtr{T}}, x::CLArray{T}) where {T} if !device_accessible(x) throw(ArgumentError("cannot take the device address of a $(typeof(x))")) end - return convert(CLPtr{T}, x.data[]) + x.offset * Base.elsize(x) + return convert(CLPtr{T}, x.data[]) + x.offset end # when passing to OpenCL kernels with `clcall`, don't convert directly to a pointer, @@ -311,7 +311,7 @@ function Base.unsafe_convert(::Type{CLDeviceArray{T, N, AS.CrossWorkgroup}}, a::CLArray{T, N}) where {T, N} return CLDeviceArray{T, N, AS.CrossWorkgroup}( size(a), reinterpret(LLVMPtr{T, AS.CrossWorkgroup}, pointer(a)), - a.maxsize - a.offset * Base.elsize(a) + a.maxsize - a.offset ) end @@ -321,8 +321,12 @@ end synchronize(x::CLArray) = synchronize(x.data[]) typetagdata(a::Array, i = 1) = ccall(:jl_array_typetagdata, Ptr{UInt8}, (Any,), a) + i - 1 -typetagdata(a::CLArray, i = 1) = - convert(CLPtr{UInt8}, a.data[]) + a.maxsize + a.offset + i - 1 +function typetagdata(a::CLArray, i = 1) + # for zero-size element types (e.g. singleton unions), the byte offset + # is always zero, so the corresponding element offset is also zero + elem_offset = iszero(Base.elsize(a)) ? 0 : a.offset ÷ Base.elsize(a) + return convert(CLPtr{UInt8}, a.data[]) + a.maxsize + elem_offset + i - 1 +end function Base.copyto!( dest::CLArray{T}, doffs::Integer, src::Array{T}, soffs::Integer, @@ -392,18 +396,18 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr else if src isa CLArray && dst isa CLArray cl.enqueue_copy(convert(cl.Buffer, dst.data[]), - (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), + dst.offset + (dst_off - 1) * sizeof(T), convert(cl.Buffer, src.data[]), - (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), + src.offset + (src_off - 1) * sizeof(T), nbytes; blocking) elseif dst isa CLArray cl.enqueue_write(convert(cl.Buffer, dst.data[]), - (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), + dst.offset + (dst_off - 1) * sizeof(T), pointer(src, src_off), nbytes; blocking) elseif src isa CLArray cl.enqueue_read(pointer(dst, dst_off), convert(cl.Buffer, src.data[]), - (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), + src.offset + (src_off - 1) * sizeof(T), nbytes; blocking) end end @@ -450,7 +454,7 @@ function Base.fill!(A::DenseCLArray{T}, val) where {T} elseif memtype(A) <: cl.UnifiedMemory cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) else - cl.enqueue_fill(convert(cl.Buffer, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) + cl.enqueue_fill(convert(cl.Buffer, A.data[]), A.offset, convert(T, val), length(A)) end end end @@ -529,7 +533,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} elseif memtype(a) <: cl.UnifiedMemory cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) else - cl.enqueue_copy(convert(cl.Buffer, mem), 0, convert(cl.Buffer, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) + cl.enqueue_copy(convert(cl.Buffer, mem), 0, convert(cl.Buffer, a.data[]), a.offset, m*sizeof(T); blocking=false) end end end diff --git a/src/gpuarrays.jl b/src/gpuarrays.jl index 1390eedb..9c40a0c0 100644 --- a/src/gpuarrays.jl +++ b/src/gpuarrays.jl @@ -2,7 +2,7 @@ function GPUArrays.derive(::Type{T}, a::CLArray, dims::Dims{N}, offset::Int) where {T,N} ref = copy(a.data) - offset = (a.offset * Base.elsize(a)) ÷ sizeof(T) + offset + offset = a.offset + offset * sizeof(T) CLArray{T,N}(ref, dims; offset) end diff --git a/test/array.jl b/test/array.jl index f4ee531b..d2dfcf68 100644 --- a/test/array.jl +++ b/test/array.jl @@ -47,6 +47,15 @@ import Adapt fill!(view(xs, 2:2), 1) @test Array(xs) == [0, 1, 0] end + + @testset "reinterpret of view with non-aligned offset" begin + # reinterpreting a view to a larger element type where the byte offset + # is not a multiple of the new element size + a = CLArray(Int32[1,2,3,4,5,6,7,8,9]) + v = view(a, 2:7) # offset of 1 Int32 = 4 bytes + r = reinterpret(Int64, v) # Int64 = 8 bytes; 4 is not a multiple of 8 + @test Array(r) == reinterpret(Int64, @view Array(a)[2:7]) + end # TODO: Look into how to port the @sync if cl.USMBackend() in cl.supported_memory_backends(cl.device())