Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 16 additions & 12 deletions src/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ mutable struct CLArray{T, N, M} <: AbstractGPUArray{T, N}
data::DataRef{Managed{M}}

maxsize::Int # maximum data size; excluding any selector bytes
offset::Int # offset of the data in memory, in number of elements
offset::Int # offset of the data in memory, in bytes

dims::Dims{N}

Expand Down Expand Up @@ -288,14 +288,14 @@ function Base.unsafe_convert(::Type{Ptr{T}}, x::CLArray{T}) where {T}
if !host_accessible(x)
throw(ArgumentError("cannot take the CPU address of a $(typeof(x))"))
end
return convert(Ptr{T}, x.data[]) + x.offset * Base.elsize(x)
return convert(Ptr{T}, x.data[]) + x.offset
end

# Return the device pointer (`CLPtr{T}`) to the first element of `x`.
#
# Throws an `ArgumentError` when `device_accessible(x)` is false.
function Base.unsafe_convert(::Type{CLPtr{T}}, x::CLArray{T}) where {T}
    if !device_accessible(x)
        throw(ArgumentError("cannot take the device address of a $(typeof(x))"))
    end
    # `x.offset` is stored in bytes, so it is added to the base pointer as-is;
    # multiplying it by `Base.elsize(x)` would scale the offset a second time.
    return convert(CLPtr{T}, x.data[]) + x.offset
end

# when passing to OpenCL kernels with `clcall`, don't convert directly to a pointer,
Expand All @@ -311,7 +311,7 @@ function Base.unsafe_convert(::Type{CLDeviceArray{T, N, AS.CrossWorkgroup}},
a::CLArray{T, N}) where {T, N}
return CLDeviceArray{T, N, AS.CrossWorkgroup}(
size(a), reinterpret(LLVMPtr{T, AS.CrossWorkgroup}, pointer(a)),
a.maxsize - a.offset * Base.elsize(a)
a.maxsize - a.offset
)
end

Expand All @@ -321,8 +321,12 @@ end
# Synchronize a `CLArray` by forwarding to `synchronize` on the object held in
# its data reference.
function synchronize(x::CLArray)
    return synchronize(x.data[])
end

# Pointer to the type-tag byte of element `i` (1-based) of a host `Array`, as
# reported by the runtime's `jl_array_typetagdata`.
typetagdata(a::Array, i = 1) = ccall(:jl_array_typetagdata, Ptr{UInt8}, (Any,), a) + i - 1

# Device pointer to the type-tag byte of element `i` (1-based) of `a`.
#
# The tag bytes start `a.maxsize` bytes past the base pointer and are indexed
# per element, whereas `a.offset` is a byte offset into the data. The offset is
# therefore converted back to an element count before it is applied.
function typetagdata(a::CLArray, i = 1)
    elsz = Base.elsize(a)
    # for zero-size element types (e.g. singleton unions), the byte offset
    # is always zero, so the corresponding element offset is also zero;
    # the guard also avoids dividing by zero
    elem_offset = iszero(elsz) ? 0 : a.offset ÷ elsz
    return convert(CLPtr{UInt8}, a.data[]) + a.maxsize + elem_offset + i - 1
end

function Base.copyto!(
dest::CLArray{T}, doffs::Integer, src::Array{T}, soffs::Integer,
Expand Down Expand Up @@ -392,18 +396,18 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr
else
if src isa CLArray && dst isa CLArray
cl.enqueue_copy(convert(cl.Buffer, dst.data[]),
(dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T),
dst.offset + (dst_off - 1) * sizeof(T),
convert(cl.Buffer, src.data[]),
(src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T),
src.offset + (src_off - 1) * sizeof(T),
nbytes; blocking)
elseif dst isa CLArray
cl.enqueue_write(convert(cl.Buffer, dst.data[]),
(dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T),
dst.offset + (dst_off - 1) * sizeof(T),
pointer(src, src_off), nbytes; blocking)
elseif src isa CLArray
cl.enqueue_read(pointer(dst, dst_off),
convert(cl.Buffer, src.data[]),
(src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T),
src.offset + (src_off - 1) * sizeof(T),
nbytes; blocking)
end
end
Expand Down Expand Up @@ -450,7 +454,7 @@ function Base.fill!(A::DenseCLArray{T}, val) where {T}
elseif memtype(A) <: cl.UnifiedMemory
cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A))
else
cl.enqueue_fill(convert(cl.Buffer, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A))
cl.enqueue_fill(convert(cl.Buffer, A.data[]), A.offset, convert(T, val), length(A))
end
end
end
Expand Down Expand Up @@ -529,7 +533,7 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T}
elseif memtype(a) <: cl.UnifiedMemory
cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false)
else
cl.enqueue_copy(convert(cl.Buffer, mem), 0, convert(cl.Buffer, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false)
cl.enqueue_copy(convert(cl.Buffer, mem), 0, convert(cl.Buffer, a.data[]), a.offset, m*sizeof(T); blocking=false)
end
end
end
Expand Down
2 changes: 1 addition & 1 deletion src/gpuarrays.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

# Derive a new `CLArray{T,N}` of size `dims` that shares `a`'s underlying data.
#
# `offset` is scaled by `sizeof(T)` below, i.e. it is treated as a count of
# `T` elements relative to the start of `a`. The derived array's offset is kept
# in bytes, so the result does not need to be a multiple of `sizeof(T)` (e.g.
# when reinterpreting a view to a wider element type).
function GPUArrays.derive(::Type{T}, a::CLArray, dims::Dims{N}, offset::Int) where {T,N}
    ref = copy(a.data)
    # `a.offset` is already in bytes; only the incoming offset needs converting
    offset = a.offset + offset * sizeof(T)
    return CLArray{T,N}(ref, dims; offset)
end

Expand Down
9 changes: 9 additions & 0 deletions test/array.jl
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,15 @@ import Adapt
fill!(view(xs, 2:2), 1)
@test Array(xs) == [0, 1, 0]
end

@testset "reinterpret of view with non-aligned offset" begin
    # A view starting at the second Int32 begins 4 bytes into the buffer, which
    # is not a multiple of the 8-byte Int64 it is reinterpreted to.
    buf = CLArray(Int32[1, 2, 3, 4, 5, 6, 7, 8, 9])
    window = view(buf, 2:7)
    widened = reinterpret(Int64, window)

    # the same reinterpretation performed on a host copy is the reference
    expected = reinterpret(Int64, @view Array(buf)[2:7])
    @test Array(widened) == expected
end
# TODO: Look into how to port the @sync

if cl.USMBackend() in cl.supported_memory_backends(cl.device())
Expand Down
Loading