Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions CUDATools/src/profile.jl
Original file line number Diff line number Diff line change
Expand Up @@ -934,8 +934,13 @@ function Base.show(io::IO, results::ProfileResults)
println(io, "\nDevice-side activity: GPU was busy for $(format_time(device_time)) ($(format_percentage(device_ratio)) of the trace)")
end

# add memory throughput information
device = merge(device, (; throughput=device.size ./ device.time))
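# Add memory throughput information. CUPTI's timestamp resolution
# can round very short events down to 0 ns, so guard against the
# resulting Inf throughput (which would later trip up format_bytes).
# Entries with a missing size or a non-finite time are likewise
# reported as `missing` rather than as a number.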
throughput = map(device.size, device.time) do s, t
(s === missing || !isfinite(t) || t == 0) ? missing : s / t
end
device = merge(device, (; throughput))

if isempty(device.id)
println(io, "\nNo device-side activity was recorded.")
Expand Down
13 changes: 8 additions & 5 deletions test/core/device/intrinsics/math.jl
Original file line number Diff line number Diff line change
Expand Up @@ -120,18 +120,21 @@ using SpecialFunctions
end
end

# NVPTX has no sub.<rnd> intrinsic; sub_<rnd>(x,y) reuses add_<rnd>(x,-y).
# For non-rn modes LLVM keeps the rounded add; for rn (the default) it
# may fold back to a plain `sub`.
# NVPTX has no `llvm.nvvm.sub.<rnd>` intrinsic, so sub_<rnd>(x,y) is
# implemented as add_<rnd>(x,-y). PTX itself does accept rounding
# modifiers on `sub`, so the backend may fold back to a real
# `sub.<rnd>.<suffix>` (LLVM 22) or keep the rounded add (older LLVM).
# For `rn` (the default rounding mode) the `.rn` modifier may be elided
# entirely, leaving a plain `add.<suffix>` or `sub.<suffix>`.
for rnd in (:rn, :rz, :rm, :rp)
f = getfield(CUDA, Symbol(:sub_, rnd))
for (T, suffix) in ((Float32, "f32"), (Float64, "f64"))
kernel = (out, x, y) -> (out[] = f(x, y); nothing)
buf = CuArray{T}(undef, 1)
ptx = sprint(io->(@device_code_ptx io=io @cuda launch=false kernel(buf, T(1), T(1))))
accepted = rnd === :rn ?
("add.rn.$(suffix)", "add.$(suffix)", "sub.$(suffix)") :
("add.$(rnd).$(suffix)",)
("add.rn.$(suffix)", "add.$(suffix)",
"sub.rn.$(suffix)", "sub.$(suffix)") :
("add.$(rnd).$(suffix)", "sub.$(rnd).$(suffix)")
@test any(s -> occursin(s, ptx), accepted)
end
end
Expand Down