Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
f1b2b39
Update mergesort bench for multi-gpu
byeongjee Jan 16, 2024
da23d41
Print device name on gpu selection
byeongjee Jan 16, 2024
bb76ef5
Move CtxMap to dep/CtxSet
byeongjee Jan 17, 2024
7a6f8cf
Refactor using CtxSetFn
byeongjee Jan 17, 2024
1e5c807
Update HybridBasis to conform to new gpu task interface
byeongjee Jan 17, 2024
96494b6
Update raytracer bench for multi-gpu setting
byeongjee Jan 17, 2024
cdd6341
Update mandelbrot bench for multi gpu
byeongjee Jan 17, 2024
8b32f7e
Rename misc
byeongjee Jan 18, 2024
3803133
Fix bug in raytracer multi gpu benchmark
byeongjee Jan 18, 2024
05c94db
Add GpuData functor
byeongjee Jan 19, 2024
9181a5d
Update Quickhull bench for multi gpu
byeongjee Jan 19, 2024
a6badf2
Update primes bench for multi gpu
byeongjee Jan 19, 2024
de7b445
Rename variables for consistency
byeongjee Jan 19, 2024
b8473ab
Fix bugs in quickhull multi gpu implementation
byeongjee Jan 22, 2024
d692d01
Update GpuData.free to return unit
byeongjee Jan 22, 2024
1ed8697
Update CtxSet.free to return unit
byeongjee Jan 22, 2024
cd85297
Fix typing bug in HybridBasis
byeongjee Jan 22, 2024
a10d6e7
Update kmeans hybrid algorithm for multi gpu
byeongjee Jan 22, 2024
ccafcbe
Update Intervaltree benchmark for multi gpu
byeongjee Jan 22, 2024
af802d1
Let devices CLA configurable
byeongjee Jan 24, 2024
2b84e9c
Fix bug in ctx cleanup with profiling
byeongjee Jan 24, 2024
271424b
Fix Futhark context initialization message for consistency
byeongjee Jan 24, 2024
43d8075
Use CtxSet in FutMandelbrot for consistency
byeongjee Jan 24, 2024
721ceab
Format CtxSet
byeongjee Jan 24, 2024
b63dab5
Use CtxSet.getOne for gpu benchmarks
byeongjee Jan 24, 2024
5e6ea2e
Fix missing device configuration in quickhull bench
byeongjee May 19, 2024
852d129
Fix dep ordering
byeongjee May 21, 2024
6cbc910
Use store/restore to pass data between gpu for raytracer
byeongjee May 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions bench/build-intervaltree/FutSort.sml
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@ struct
(* NOTE: need to pass the `vals` array here, see note in HybridSort where
* FutSort.sort is called
*)
fun init (vals: Int32.int Seq.t) =
fun init (vals: Int32.int Seq.t) (device: string)=
let
val (data, start, len) = ArraySlice.base vals
val f = _import "fut_init" public : i32 array * i64 * i64 -> fut_context;
val f = _import "fut_init" public : string * i32 array * i64 * i64 -> fut_context;
in
f (data, start, len)
f (device, data, start, len)
end

fun cleanup x =
Expand Down
26 changes: 19 additions & 7 deletions bench/build-intervaltree/HybridSort.sml
Original file line number Diff line number Diff line change
Expand Up @@ -26,29 +26,41 @@ struct
fun cmpWith vals (i, j) =
Int32.compare (Seq.nth vals (Int32.toInt i), Seq.nth vals (Int32.toInt j))

(* Look up the GPU context registered for `device` in `ctxSet`.
 *
 * `ctxSet` is a sequence of (device name, context) pairs. The first pair
 * whose name equals `device` is selected, its name is printed, and its
 * context is returned.
 *
 * Raises Fail, naming the requested device, when no pair matches. This
 * replaces relying on whatever Seq.first does with an empty sequence, so a
 * scheduler/device-list mismatch is reported with the offending name.
 *)
fun chooseCtx ctxSet device =
  let
    (* All entries registered under the requested device name. *)
    val matches = Seq.filter (fn (d, _) => d = device) ctxSet
  in
    if Seq.length matches = 0 then
      raise Fail
        ("HybridSort.chooseCtx: no context for device '" ^ device ^ "'")
    else
      let
        (* Keep the original behaviour of taking the first match. *)
        val (name, ctx) = Seq.nth matches 0
        val _ = print ("chosen device: " ^ name ^ "\n")
      in
        ctx
      end
  end

(* sort idxs where the comparison 'i < j' is defined by vals[i] < vals[j] *)
fun sort ctx (vals: Int32.int Seq.t) (idxs: Int32.int Seq.t) =
fun sort ctxSet (vals: Int32.int Seq.t) (idxs: Int32.int Seq.t) =
if Seq.length idxs <= quickThresh then
Quicksort.sort (cmpWith vals) idxs
else
let
val half = Real.ceil (split * Real.fromInt (Seq.length idxs))
val left = Seq.take idxs half
val right = Seq.drop idxs half
val (left', right') = ForkJoin.par (fn _ => sort ctx vals left, fn _ =>
sortChoose ctx vals right)
val (left', right') =
ForkJoin.par (fn _ => sort ctxSet vals left, fn _ =>
sortChoose ctxSet vals right)
in
Merge.merge (cmpWith vals) (left', right')
end

and sortChoose ctx vals idxs =
and sortChoose ctxSet vals idxs =
if Seq.length idxs >= gpuMinThresh then
ForkJoin.choice
{ prefer_cpu = fn _ => sort ctx vals idxs
{ prefer_cpu = fn _ => sort ctxSet vals idxs
(* NOTE: vals needs to be initialized on GPU beforehand... *)
, prefer_gpu = fn _ => FutSort.sort ctx (*vals*) idxs
, prefer_gpu = fn device => let val ctx = chooseCtx ctxSet device
in FutSort.sort ctx (*vals*) idxs
end
}
else
sort ctx vals idxs
sort ctxSet vals idxs

end
5 changes: 3 additions & 2 deletions bench/build-intervaltree/glue.c
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ struct fut_context
struct futhark_i32_1d *vals;
};

void *fut_init(int32_t *vals, int64_t start, int64_t len)
void *fut_init(unsigned char *device, int32_t *vals, int64_t start, int64_t len)
{
struct timer_t t;
timer_begin(&t, "fut_init");

struct futhark_context_config *cfg = futhark_context_config_new();
futhark_context_config_set_device(cfg, device);
timer_report_tick(&t, "futhark_context_config_new");

struct futhark_context *ctx = futhark_context_new(cfg);
Expand Down Expand Up @@ -68,7 +69,7 @@ struct sort_pack
int32_t *input;
int64_t start;
int64_t len;

int32_t *output;
};

Expand Down
18 changes: 12 additions & 6 deletions bench/build-intervaltree/main.sml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@ structure CLA = CommandLineArgs
val n = CLA.parseInt "n" 5000000
val impl = CLA.parseString "impl" "hybrid"
val reportSize = CLA.parseFlag "report-size"
val devices = String.fields (fn c => c = #",")
(CommandLineArgs.parseString "devices" "")

val _ = print ("n " ^ Int.toString n ^ "\n")
val _ = print ("impl " ^ impl ^ "\n")
val _ = print
("report-size? " ^ (if reportSize then "true" else "false") ^ "\n")
val _ = print ("devices " ^ String.concatWith ", " devices ^ "\n")


fun cmpWith vals (i, j) =
Expand All @@ -16,8 +19,8 @@ fun cmpWith vals (i, j) =

val sorter =
case impl of
"hybrid" => (fn c => fn v => fn i => HybridSort.sortChoose c v i)
| "cpu" => (fn c => fn v => fn i => Mergesort.sort (cmpWith v) i)
"hybrid" => (fn ctxSet => fn v => fn i => HybridSort.sortChoose ctxSet v i)
| "cpu" => (fn ctxSet => fn v => fn i => Mergesort.sort (cmpWith v) i)
| _ => Util.die ("unknown impl '" ^ impl ^ "'")


Expand Down Expand Up @@ -59,7 +62,10 @@ fun query tree seed =

val segs = Seq.tabulate (fn i => randSeg (2 * i)) n
val segs_xs = Seq.map #1 segs
val futctx = FutSort.init segs_xs

val ctxSet =
Seq.map (fn device => (device, FutSort.init segs_xs device))
(Seq.fromList devices)

val _ =
if not reportSize then
Expand Down Expand Up @@ -117,9 +123,9 @@ fun makeIntervalMap () =
Int32.compare (#1 (getSeg i), #1 (getSeg j))
val idxs' = Mergesort.sort cmp idxs *)

(* val idxs' = HybridSort.sort futctx segs_xs idxs *)
(* val idxs' = HybridSort.sort ctxSet segs_xs idxs *)

val idxs' = sorter futctx segs_xs idxs
val idxs' = sorter ctxSet segs_xs idxs
in
doitSorted idxs'
end
Expand Down Expand Up @@ -154,4 +160,4 @@ val _ =
in print ("size of tree (bytes): " ^ LargeInt.toString sz ^ "\n")
end

val _ = FutSort.cleanup futctx
val _ = Seq.map (fn (_, ctx) => FutSort.cleanup ctx) ctxSet
85 changes: 52 additions & 33 deletions bench/kmeans/main.sml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@ val k = CLA.parseInt "k" 5

val profile = CommandLineArgs.parseFlag "profile"

val devices = String.fields (fn c => c = #",")
(CommandLineArgs.parseString "devices" "")

val max_iterations = CLA.parseInt "max-iterations" 10

val impl = CommandLineArgs.parseString "impl" "cpu"
Expand Down Expand Up @@ -56,17 +59,19 @@ val () = print ("K: " ^ Int.toString k ^ "\n")
val () = print ("Points: " ^ Int.toString (Points.length points) ^ "\n")
val () = print ("Max iterations: " ^ Int.toString max_iterations ^ "\n")

structure CtxSet = CtxSetFn (structure F = Futhark)
val () = print "Initialising Futhark context... "
val ctx = Futhark.Context.new
((Futhark.Config.cache (SOME "futhark.cache")
o Futhark.Config.profiling profile) Futhark.Config.default)
val ctxSet = CtxSet.fromList devices
val ctx = CtxSet.getOne ctxSet
val () = print "Done!\n"

structure FutharkPoints = GpuData(type t = Futhark.Real64Array2.array)

fun futharkPoints (points: Points.t) =
fun futharkPoints (points: Points.t) ctx =
Futhark.Real64Array2.new ctx (Points.toSeq points) (Points.length points, d)

val points_fut = futharkPoints points
val points_fut_set = FutharkPoints.initialize ctxSet (futharkPoints points)
val points_fut = FutharkPoints.choose points_fut_set "#1"

fun tt a b =
Time.fmt 4 (Time.- (b, a))
Expand All @@ -81,37 +86,51 @@ val bench =
let
fun gpuHistogram centroids =
let
val centroids_fut =
Futhark.Real64Array2.new ctx (Points.toSeq centroids)
(Points.length centroids, d)
val centroids_fut_set =
FutharkPoints.initialize ctxSet (fn ctx =>
Futhark.Real64Array2.new ctx (Points.toSeq centroids)
(Points.length centroids, d))
in
{ kernel = fn (start, stop) =>
{ kernel = fn device =>
let
val t1 = Time.now ()
val (counts_fut, hist_fut) =
Futhark.Entry.histogram ctx
(points_fut, centroids_fut, start, stop - start)
val () = Futhark.Context.sync ctx

val t2 = Time.now ()
val counts_arr = Futhark.Int64Array1.values counts_fut
val hist_arr = Futhark.Real64Array2.values hist_fut
val () = Futhark.Int64Array1.free counts_fut
val () = Futhark.Real64Array2.free hist_fut
val t3 = Time.now ()
val result = ArraySlice.full (Array.tabulate (k, fn c =>
Array.tabulate (d + 1, fn i =>
if i = 0 then Real.fromInt (Array.sub (counts_arr, c))
else Array.sub (hist_arr, c * d + (i - 1)))))
val t4 = Time.now ()
val ctx = CtxSet.choose ctxSet device
val centroids_fut =
FutharkPoints.choose centroids_fut_set device
val points_fut = FutharkPoints.choose points_fut_set device
in
(* print
("gpu kmeans (" ^ Int.toString (stop - start) ^ "): "
^ tt t1 t2 ^ "+" ^ tt t2 t3 ^ "+" ^ tt t3 t4 ^ "s\n"); *)
result
fn (start, stop) =>
let
val t1 = Time.now ()
val (counts_fut, hist_fut) =
Futhark.Entry.histogram ctx
(points_fut, centroids_fut, start, stop - start)
val () = Futhark.Context.sync ctx

val t2 = Time.now ()
val counts_arr = Futhark.Int64Array1.values counts_fut
val hist_arr = Futhark.Real64Array2.values hist_fut
val () = Futhark.Int64Array1.free counts_fut
val () = Futhark.Real64Array2.free hist_fut
val t3 = Time.now ()
val result =
ArraySlice.full (Array.tabulate (k, fn c =>
Array.tabulate (d + 1, fn i =>
if i = 0 then
Real.fromInt (Array.sub (counts_arr, c))
else
Array.sub (hist_arr, c * d + (i - 1)))))
val t4 = Time.now ()
in
(* print
("gpu kmeans (" ^ Int.toString (stop - start) ^ "): "
^ tt t1 t2 ^ "+" ^ tt t2 t3 ^ "+" ^ tt t3 t4 ^ "s\n"); *)
result
end
end

, after = fn () => Futhark.Real64Array2.free centroids_fut
, after = fn () =>
FutharkPoints.free centroids_fut_set
Futhark.Real64Array2.free
}
end
in
Expand Down Expand Up @@ -161,7 +180,7 @@ val bench =
fun centroidsChunk arg =
ForkJoin.choice
{ prefer_cpu = fn () => Kmeans.centroidsChunkCPU points arg
, prefer_gpu = fn () => centroidsChunkGPU arg
, prefer_gpu = fn device => centroidsChunkGPU arg
}
in
Kmeans.kmeansHybrid centroidsChunk k max_iterations points
Expand All @@ -181,7 +200,7 @@ val () =
if profile then (writeFile "futhark.json" (Futhark.Context.report ctx))
else ()

val () = Futhark.Context.free ctx
val () = CtxSet.free ctxSet

val () = print ("kmeans iterations: " ^ Int.toString kmeans_iters ^ "\n")
val _ =
Expand Down
40 changes: 26 additions & 14 deletions bench/kmeans/sml/Hist.sml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ structure Hist:
sig

type grain = int
type device_identifier = Device.device_identifier

(* returns the result of each bin
*
Expand Down Expand Up @@ -49,7 +50,7 @@ sig
, hi: int
, get_bin: int -> int
, modify_bin: int -> 'a -> unit
, gpu: (int * int) -> 'a Seq.t
, gpu: device_identifier -> (int * int) -> 'a Seq.t
}
-> 'a Seq.t

Expand All @@ -66,14 +67,14 @@ sig
, hi: int
, get_bin: int -> int
, modify_bin: int -> 'a -> unit
, gpu: (int * int) -> 'a Seq.t
, gpu: device_identifier -> (int * int) -> 'a Seq.t
}
-> 'a Seq.t

end =
struct

type grain = int
type device_identifier = Device.device_identifier


fun hist grain {combine, neutral, num_bins} {lo, hi, get_bin, get_elem} =
Expand Down Expand Up @@ -149,7 +150,12 @@ struct

fun inplace_hist_hybrid cpu_grain gpu_grain gpu_split
(hist_args as {combine: 'a * 'a -> 'a, fresh_neutral: unit -> 'a, num_bins})
{lo, hi, get_bin, modify_bin, gpu: int * int -> 'a Seq.t} : 'a Seq.t =
{ lo
, hi
, get_bin
, modify_bin
, gpu: device_identifier -> int * int -> 'a Seq.t
} : 'a Seq.t =
let
fun fresh_acc () =
Array.tabulate (num_bins, fn _ => fresh_neutral ())
Expand Down Expand Up @@ -183,21 +189,27 @@ struct
big_block blo bhi
end

, fn (b1, b2) =>
let
val blo = lo + b1 * gpu_grain
val bhi = Int.min (hi, lo + b2 * gpu_grain)
in
gpu (blo, bhi)
end
, fn device =>
fn (b1, b2) =>
let
val blo = lo + b1 * gpu_grain
val bhi = Int.min (hi, lo + b2 * gpu_grain)
in
gpu device (blo, bhi)
end
)
end


fun inplace_hist_hybrid_two_level cpu_grain gpu_grain gpu_split
(hist_args as
{combine_inplace: 'a * 'a -> unit, fresh_neutral: unit -> 'a, num_bins})
{lo, hi, get_bin, modify_bin, gpu: int * int -> 'a Seq.t} : 'a Seq.t =
{ lo
, hi
, get_bin
, modify_bin
, gpu: device_identifier -> int * int -> 'a Seq.t
} : 'a Seq.t =
let
fun fresh_acc () =
Seq.tabulate (fn _ => fresh_neutral ()) (num_bins)
Expand Down Expand Up @@ -253,7 +265,7 @@ struct
else
ForkJoin.choice
{ prefer_cpu = fn _ => loop start stop
, prefer_gpu = fn _ => gpu (start, stop)
, prefer_gpu = fn device => gpu device (start, stop)
}


Expand All @@ -271,7 +283,7 @@ struct
in
ForkJoin.choice
{ prefer_cpu = fn _ => loop start stop
, prefer_gpu = fn _ => gpu (start, stop)
, prefer_gpu = fn device => gpu device (start, stop)
}
end
else
Expand Down
Loading