Skip to content

`reinterpret` of a tuple fails on GPUs #712

@simeonschaub

Description

@simeonschaub

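Calling `reinterpret` on a tuple generates GPU-incompatible code: the emitted IR contains dynamic dispatch (`ijl_apply_generic`, `ijl_invoke`) and a call into the Julia runtime (`jl_f_tuple`):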

julia> foo(x,y) = reinterpret(UInt64, (x,y))
foo (generic function with 1 method)

julia> OpenCL.code_llvm(foo, (Float32, Float32))
;  @ REPL[3]:1 within `foo`
define i64 @julia_foo_81905(float %"x::Float32", float %"y::Float32") local_unnamed_addr {
top:
  %jlcallframe10 = alloca [9 x ptr], align 8
  %"new::Tuple" = alloca [2 x float], align 4
  store float %"x::Float32", ptr %"new::Tuple", align 4
  %0 = getelementptr inbounds i8, ptr %"new::Tuple", i64 4
  store float %"y::Float32", ptr %0, align 4
; ┌ @ essentials.jl:736 within `reinterpret`
; │┌ @ reinterpretarray.jl:857 within `_reinterpret`
    %1 = call fastcc nonnull ptr @julia_packedsize_81937()
; ││ @ reinterpretarray.jl:859 within `_reinterpret`
    store ptr %1, ptr %jlcallframe10, align 8
    %2 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 1
    store ptr inttoptr (i64 139924789764000 to ptr), ptr %2, align 8
    %3 = call nonnull ptr @ijl_apply_generic(ptr inttoptr (i64 139924866051216 to ptr), ptr nonnull %jlcallframe10, i32 2)
    %.tag_addr = getelementptr inbounds i64, ptr %3, i64 -1
    %.tag = load atomic i64, ptr %.tag_addr unordered, align 8
    %4 = and i64 %.tag, -16
    %5 = inttoptr i64 %4 to ptr
    %exactly_isa = icmp eq ptr %5, inttoptr (i64 192 to ptr)
    br i1 %exactly_isa, label %pass, label %fail

L5:                                               ; preds = %pass
; ││ @ reinterpretarray.jl:862 within `_reinterpret`
; ││┌ @ refpointer.jl:147 within `Ref`
; │││┌ @ refvalue.jl:8 within `RefValue`
      %"new::RefValue.sroa.0.0.copyload" = load i64, ptr %"new::Tuple", align 1
; ││└└
; ││ @ reinterpretarray.jl:864 within `_reinterpret`
; ││┌ @ reinterpretarray.jl:811 within `struct_subpadding`
     %6 = load ptr, ptr addrspacecast (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @jl_small_typeof, i64 320) to ptr), align 8
     store ptr %6, ptr %jlcallframe10, align 8
     %7 = call nonnull ptr @ijl_invoke(ptr inttoptr (i64 139924884548304 to ptr), ptr nonnull %jlcallframe10, i32 1, ptr inttoptr (i64 139924885003056 to ptr))
; │││┌ @ reinterpretarray.jl:755 within `padding`
      store ptr inttoptr (i64 139924922404224 to ptr), ptr %jlcallframe10, align 8
      store ptr inttoptr (i64 139924789760416 to ptr), ptr %2, align 8
      %8 = call nonnull ptr @ijl_invoke(ptr inttoptr (i64 139924884548304 to ptr), ptr nonnull %jlcallframe10, i32 2, ptr inttoptr (i64 139924884994992 to ptr))
; │││└
     %9 = call fastcc i8 @julia____81955(ptr %7, ptr %8)
; ││└
    %10 = and i8 %9, 1
    %.not = icmp eq i8 %10, 0
    br i1 %.not, label %L22, label %L31

L22:                                              ; preds = %L5
; ││ @ reinterpretarray.jl:874 within `_reinterpret`
    %11 = call fastcc i64 @julia__reinterpret_padding_81925(ptr nocapture readonly %"new::Tuple")
    br label %L31

L24:                                              ; preds = %pass
    %12 = load ptr, ptr addrspacecast (ptr addrspace(1) getelementptr inbounds (i8, ptr addrspace(1) @jl_small_typeof, i64 320) to ptr), align 8
    store ptr inttoptr (i64 139924981334912 to ptr), ptr %jlcallframe10, align 8
    store ptr %12, ptr %2, align 8
    %13 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 2
    store ptr inttoptr (i64 139924954237824 to ptr), ptr %13, align 8
    %14 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 3
    store ptr inttoptr (i64 139924922404224 to ptr), ptr %14, align 8
    %15 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 4
    store ptr inttoptr (i64 139924981334864 to ptr), ptr %15, align 8
    %16 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 5
    store ptr inttoptr (i64 139924789764000 to ptr), ptr %16, align 8
    %17 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 6
    store ptr inttoptr (i64 139924954237824 to ptr), ptr %17, align 8
    %18 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 7
    store ptr %1, ptr %18, align 8
    %19 = getelementptr inbounds ptr, ptr %jlcallframe10, i64 8
    store ptr inttoptr (i64 139924981334832 to ptr), ptr %19, align 8
    %jl_f_tuple_ret = call nonnull ptr @jl_f_tuple(ptr null, ptr nonnull %jlcallframe10, i32 9)
; ││ @ reinterpretarray.jl:859 within `_reinterpret`
    call fastcc void @gpu_report_exception()
    call fastcc void @gpu_signal_exception()
    call void @llvm.trap()
    unreachable

L31:                                              ; preds = %L22, %L5
    %value_phi = phi i64 [ %11, %L22 ], [ %"new::RefValue.sroa.0.0.copyload", %L5 ]
; │└
   ret i64 %value_phi

fail:                                             ; preds = %top
; │┌ @ reinterpretarray.jl:859 within `_reinterpret`
    call fastcc void @gpu_report_exception()
    call fastcc void @gpu_signal_exception()
    call void @llvm.trap()
    unreachable

pass:                                             ; preds = %top
    %jl_false = load ptr, ptr addrspace(1) @jl_false, align 8
    %20 = icmp eq ptr %3, %jl_false
    br i1 %20, label %L24, label %L5
; └└
}

The same function compiles to efficient code on the CPU, with no runtime calls, only bitcasts and integer shifts:

julia> code_llvm(foo, (Float32, Float32))
; Function Signature: foo(Float32, Float32)
;  @ REPL[3]:1 within `foo`
define i64 @julia_foo_85335(float %"x::Float32", float %"y::Float32") #0 {
top:
; ┌ @ essentials.jl:736 within `reinterpret`
; │┌ @ reinterpretarray.jl:862 within `_reinterpret`
; ││┌ @ refpointer.jl:147 within `Ref`
; │││┌ @ refvalue.jl:8 within `RefValue`
      %0 = bitcast float %"x::Float32" to i32
      %1 = bitcast float %"y::Float32" to i32
; ││└└
; ││ @ reinterpretarray.jl:871 within `_reinterpret`
; ││┌ @ refvalue.jl:59 within `getindex`
; │││┌ @ Base_compiler.jl:54 within `getproperty`
      %"new::RefValue3.sroa.5.0.insert.ext10" = zext i32 %1 to i64
      %"new::RefValue3.sroa.5.0.insert.shift11" = shl nuw i64 %"new::RefValue3.sroa.5.0.insert.ext10", 32
      %"new::RefValue3.sroa.0.0.insert.ext7" = zext i32 %0 to i64
      %"new::RefValue3.sroa.0.0.insert.insert9" = or disjoint i64 %"new::RefValue3.sroa.5.0.insert.shift11", %"new::RefValue3.sroa.0.0.insert.ext7"
      ret i64 %"new::RefValue3.sroa.0.0.insert.insert9"
; └└└└
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions