diff --git a/Project.toml b/Project.toml index 868d0a9..593bfdf 100644 --- a/Project.toml +++ b/Project.toml @@ -23,8 +23,9 @@ LLVM = "9" julia = "1.10" [extras] +Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" [targets] -test = ["Pkg", "Test"] +test = ["Adapt", "Pkg", "Test"] diff --git a/test/general_routine.jl b/test/general_routine.jl index d65b009..b7a392d 100644 --- a/test/general_routine.jl +++ b/test/general_routine.jl @@ -3,4 +3,17 @@ include("tests/$TEST_BACKEND/access_fences.jl") include("tests/$TEST_BACKEND/vectorization_test.jl") include("tests/shfl.jl") -include("tests/vectorization_custom_test.jl") \ No newline at end of file +include("tests/vectorization_custom_test.jl") + +# Memory ordering tests: Enable by launching julia with 'TEST_MEMORY_ORDERING=true julia' +if get(ENV, "TEST_MEMORY_ORDERING", "false") == "true" + if Base.JLOptions().check_bounds != 0 + @warn """ + Bounds checking not set to 'auto' (current value: $(Base.JLOptions().check_bounds)) + Memory ordering tests may show inaccurate results and are skipped + Run with `TEST_MEMORY_ORDERING=true julia --project -e 'using Pkg; Pkg.test(julia_args=[\"--check-bounds=auto\"])'` + """ + else + include("tests/memory_ordering.jl") + end +end diff --git a/test/tests/litmus/load_buffer.jl b/test/tests/litmus/load_buffer.jl new file mode 100644 index 0000000..dfe9c9f --- /dev/null +++ b/test/tests/litmus/load_buffer.jl @@ -0,0 +1,251 @@ +# Litmus Test: Load Buffer (LB) +# +# Tests if loads can be buffered and re-ordered on different threads. 
#
# Pattern:
#   Workgroup 0 Thread 0              Workgroup 1 Thread 0
#   0.1: let r0 = atomicLoad(y)       1.1: let r1 = atomicLoad(x)
#   0.2: atomicStore(x, 1)            1.2: atomicStore(y, 1)
#
# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/lb/load-buffer.wgsl
#
# (See end of file for complete example)

using Adapt
using KernelAbstractions
using KernelIntrinsics
using Test

# Load Buffer litmus kernel.
#
# Every work-item `i` (1-based global linear index) derives two 0-based ids:
#   * `id_0`, which equals `i - 1`, and
#   * `id_1`, a `perm1`-permuted local id inside workgroup `new_workgroup`
#     (with the default `test_wg = 2` this is always the *other* workgroup).
# Acting as `id_0` it loads `y_0` and then stores to `x_0`; acting as `id_1` it loads
# `x_1` and then stores to `y_1`. `y` locations live at even offsets, `x` locations at
# odd, `perm2`-permuted offsets of `test_locations`.
# `r0` is written to `results_r0[id_0 + 1]`, `r1` to `results_r1[id_1 + 1]`.
#
# `RELAXED = true` uses `@access Relaxed` for all four accesses; otherwise loads use
# `Acquire` and stores use `Release`. The order of the four accesses is the litmus
# pattern itself and must not be changed.
#
# The kernel runs with `inbounds=true`: the caller is responsible for sizing the arrays
# so that every computed index is valid.
@kernel inbounds=true function test_load_buffer(
    test_locations::AbstractArray{T},
    results_r0::AbstractArray{T},
    results_r1::AbstractArray{T},
    n_pairs::T,
    ::Val{RELAXED}=Val(true),
    ::Val{wgXSize}=Val(256),   # workgroupXSize
    ::Val{test_wg}=Val(2),     # testing_workgroups
    ::Val{perm1}=Val(419),     # permute_first
    ::Val{perm2}=Val(1031),    # permute_second
    ::Val{stride}=Val(1)       # mem_stride
) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}

    i = @index(Global, Linear)

    if i <= n_pairs
        local_invocation_id = (i - 1) % wgXSize
        shuffled_workgroup = (i - 1) ÷ wgXSize
        total_ids = wgXSize * test_wg
        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)

        # Load Buffer pattern: Load, Store, Load, Store
        y_0 = (id_0) * stride * 2
        x_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1 # location_offset
        x_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1 # location_offset
        y_1 = (id_1) * stride * 2

        if RELAXED
            r0 = @access Relaxed test_locations[y_0 + 1]
            @access Relaxed test_locations[x_0 + 1] = T(1)
            r1 = @access Relaxed test_locations[x_1 + 1]
            @access Relaxed test_locations[y_1 + 1] = T(1)
        else
            r0 = @access Acquire test_locations[y_0 + 1]
            @access Release test_locations[x_0 + 1] = T(1)
            r1 = @access Acquire test_locations[x_1 + 1]
            @access Release test_locations[y_1 + 1] = T(1)
        end

        results_r1[id_1 + 1] = r1
        results_r0[id_0 + 1] = r0
    end
end

"""
    run_test_load_buffer(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=false)

Launch the `test_load_buffer` kernel `n_iterations` times on `backend` and count the
observed `(r0, r1)` outcomes.

`backend` is passed to `adapt`, to the kernel constructor and to
`KernelAbstractions.synchronize`.

# Keywords
- `n_iterations`: number of kernel launches; all buffers are zeroed before each launch.
- `n_pairs`: number of work-items. It must be 512 (workgroup size 256 times 2 testing
  workgroups): the kernel is launched with its default parameters and runs with
  `inbounds=true`, so any other value makes it index outside the result buffers or pair
  work-items incorrectly.
- `RELAXED`: `true` selects `@access Relaxed`, `false` selects `Acquire`/`Release`.
- `VERBOSE`: print a banner and a summary table.

# Returns
`(seq0, seq1, interleaved, weak, total)`, the counts of `(r0, r1)` equal to `(1, 0)`,
`(0, 1)`, `(0, 0)` and `(1, 1)` respectively, followed by their sum.

# Throws
- `ArgumentError` if `n_pairs` is not 512.
"""
function run_test_load_buffer(
    backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=false
)
    # Must match the kernel defaults (wgXSize = 256, test_wg = 2) used by the launch below.
    workgroup_size = 256
    testing_workgroups = 2
    total_ids = workgroup_size * testing_workgroups
    if n_pairs != total_ids
        throw(ArgumentError(
            "n_pairs must be $total_ids (workgroup size * testing workgroups), got $n_pairs",
        ))
    end

    if VERBOSE
        println("\n" * "-" ^ 60)
        if RELAXED
            println("Litmus Test: Load Buffer (@access Relaxed)\n")
        else
            println("Litmus Test: Load Buffer (@access Acquire/Release)\n")
        end
    end

    test_locations = adapt(backend, zeros(Int32, 2048))
    results_r0 = adapt(backend, zeros(Int32, n_pairs))
    results_r1 = adapt(backend, zeros(Int32, n_pairs))

    total_seq0 = 0
    total_seq1 = 0
    total_interleaved = 0
    total_weak = 0

    for _ in 1:n_iterations
        fill!(test_locations, Int32(0))
        fill!(results_r0, Int32(0))
        fill!(results_r1, Int32(0))

        test_load_buffer(backend)(
            test_locations, results_r0, results_r1,
            Int32(n_pairs), Val(RELAXED);
            ndrange=n_pairs, workgroupsize=workgroup_size
        )
        KernelAbstractions.synchronize(backend)

        # Copy each result buffer to the host once per launch instead of once per pair.
        r0_host = Array(results_r0)
        r1_host = Array(results_r1)

        # Both registers of a test instance are read at the same slot id_0 + 1,
        # where id_0 = i - 1 for work-item i (WGSL reads both r0 and r1 from id_0).
        for (r0, r1) in zip(r0_host, r1_host)
            # Load Buffer outcome classification (GPUHarbor categories)
            if r0 == 1 && r1 == 0
                total_seq0 += 1
            elseif r0 == 0 && r1 == 1
                total_seq1 += 1
            elseif r0 == 0 && r1 == 0
                total_interleaved += 1
            elseif r0 == 1 && r1 == 1
                total_weak += 1
            end
        end
    end

    total = total_seq0 + total_seq1 + total_interleaved + total_weak

    if VERBOSE
        println(" ╔═══════════════════════════════════════════════════╗")
        println(" ║ RESULTS ($total total tests) ║")
        println(" ╠═══════════════════════════════════════════════════╣")
        println(" ║ r0=1, r1=0: (seq0) $(lpad(total_seq0, 10)) ($(lpad(round(100*total_seq0/total, digits=2), 5))%) ║")
        println(" ║ r0=0, r1=1: (seq1) $(lpad(total_seq1, 10)) ($(lpad(round(100*total_seq1/total, digits=2), 5))%) ║")
        println(" ║ r0=0, r1=0: (interleaved) $(lpad(total_interleaved, 10)) ($(lpad(round(100*total_interleaved/total, digits=2), 5))%) ║")
        println(" ║ r0=1, r1=1: (WEAK/LB) $(lpad(total_weak, 10)) ($(lpad(round(100*total_weak/total, digits=2), 5))%) ║")
        println(" ╚═══════════════════════════════════════════════════╝")
    end
    return (total_seq0, total_seq1, total_interleaved, total_weak, total)
end

#=
As illustration we will represent the different memory operations on threads 0 and 256.

Here are the different memory locations calculated as per the kernel.

┌─────┬──────┬──────┬──────┬──────┬──────┬──────┐
│  i  │ id_0 │ id_1 │ y_0  │ x_0  │ x_1  │ y_1  │
├─────┼──────┼──────┼──────┼──────┼──────┼──────┤
│  1  │  0   │ 256  │  0   │  1   │ 513  │ 512  │
├─────┼──────┼──────┼──────┼──────┼──────┼──────┤
│ 257 │ 256  │  0   │ 512  │ 513  │  1   │  0   │
└─────┴──────┴──────┴──────┴──────┴──────┴──────┘

We represent the different addresses using boxes:
  Memory addresses in test_locations : 0 1 512 513
  Memory addresses in results_r0     : 0 256
  Memory addresses in results_r1     : 0 256

We will illustrate the case where all operations on thread 0 happen before thread 256 and are ordered

Step 1: Thread (i=1) loads r0 from location y_0=0
┌─────┬─────┬─────┬─────┐
│  0  │  0  │  0  │  0  │
└─────┴─────┴─────┴─────┘
   ↓
  r_0

Step 2: Thread (i=1) stores 1 to location x_0=1
┌─────┬─────┬─────┬─────┐
│  0  │  1  │  0  │  0  │
└─────┴─────┴─────┴─────┘
         ↑
         1

Step 3: Thread (i=1) loads r1 from location x_1=513
┌─────┬─────┬─────┬─────┐
│  0  │  1  │  0  │  0  │
└─────┴─────┴─────┴─────┘
                     ↓
                    r_1

Step 4: Thread (i=1) stores 1 to location y_1=512
┌─────┬─────┬─────┬─────┐
│  0  │  1  │  1  │  0  │
└─────┴─────┴─────┴─────┘
               ↑
               1

Step 5: Thread (i=1) stores r1 to location id_1=256 of results_r1
┌─────┬─────┐
│  0  │  0  │
└─────┴─────┘
         ↑
        r_1

Step 6: Thread (i=1) stores r0 to location id_0=0 of results_r0
┌─────┬─────┐
│  0  │  0  │
└─────┴─────┘
   ↑
  r_0

Step 7: Thread (i=257) loads r0 from location y_0=512
┌─────┬─────┬─────┬─────┐
│  0  │  1  │  1  │  0  │
└─────┴─────┴─────┴─────┘
               ↓
              r_0

Step 8: Thread (i=257) stores 1 to location x_0=513
┌─────┬─────┬─────┬─────┐
│  0  │  1  │  1  │  1  │
└─────┴─────┴─────┴─────┘
                     ↑
                     1

Step 9: Thread (i=257) loads r1 from location x_1=1
┌─────┬─────┬─────┬─────┐
│  0  │  1  │  1  │  1  │
└─────┴─────┴─────┴─────┘
         ↓
        r_1

Step 10: Thread (i=257) stores 1 to location y_1=0
┌─────┬─────┬─────┬─────┐
│  1  │  1  │  1  │  1  │
└─────┴─────┴─────┴─────┘
   ↑
   1

Step 11: Thread (i=257) stores r1 to location id_1=0 of results_r1
┌─────┬─────┐
│  1  │  0  │
└─────┴─────┘
   ↑
  r_1

Step 12: Thread (i=257) stores r0 to location id_0=256 of results_r0
┌─────┬─────┐
│  0  │  1  │
└─────┴─────┘
         ↑
        r_0

Step 13: results_r0 and results_r1 are read at location id_0=0 for categorization
  r_0_test = 0
  r_1_test = 1

This is categorized as 'sequential'


Here, if Step 10 happened before Step 1 (reordering) then
  r_0_test = 1
  r_1_test = 1

This is categorized as 'weak'.
=#

# >>> new file: test/tests/litmus/message_passing.jl
# Litmus Test: Message Passing (MP)
#
# Tests if two stores in one thread can be re-ordered according to loads on a second thread.
#
# Pattern:
#   Workgroup 0 Thread 0              Workgroup 1 Thread 0
#   0.1: atomicStore(x, 1)            1.1: let r0 = atomicLoad(y)
#   0.2: atomicStore(y, 1)            1.2: let r1 = atomicLoad(x)
#
# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/mp/message-passing.wgsl

using Adapt
using KernelAbstractions
using KernelIntrinsics
using Test

# Message Passing litmus kernel.
#
# Every work-item `i` (1-based global linear index) derives `id_0 = i - 1` and a
# `perm1`-permuted partner id `id_1` in workgroup `new_workgroup` (the other workgroup
# for the default `test_wg = 2`). Acting as `id_0` it stores 1 to `x_0` and then to
# `y_0`; acting as `id_1` it loads `y_1` and then `x_1`. `x` locations live at even
# offsets, `y` locations at odd, `perm2`-permuted offsets of `test_locations`.
# Both registers are written at slot `id_1 + 1` of their result array.
#
# `RELAXED = true` uses `@access Relaxed`; otherwise stores use `Release` and loads use
# `Acquire`. The order of the four accesses is the litmus pattern and must not change.
#
# The kernel runs with `inbounds=true`: the caller must size the arrays so that every
# computed index is valid.
@kernel inbounds=true function test_message_passing(
    test_locations::AbstractArray{T},
    results_r0::AbstractArray{T},
    results_r1::AbstractArray{T},
    n_pairs::T,
    ::Val{RELAXED}=Val(true),
    ::Val{wgXSize}=Val(256),
    ::Val{test_wg}=Val(2),
    ::Val{perm1}=Val(419),
    ::Val{perm2}=Val(1031),
    ::Val{stride}=Val(1)
) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}

    i = @index(Global, Linear)

    if i <= n_pairs
        local_invocation_id = (i - 1) % wgXSize
        shuffled_workgroup = (i - 1) ÷ wgXSize
        total_ids = wgXSize * test_wg
        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)

        # Message Passing pattern: Store, Store, Load, Load
        x_0 = (id_0) * stride * 2
        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1 # location_offset
        x_1 = (id_1) * stride * 2
        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1 # location_offset

        if RELAXED
            @access Relaxed test_locations[x_0 + 1] = T(1)
            @access Relaxed test_locations[y_0 + 1] = T(1)
            r0 = @access Relaxed test_locations[y_1 + 1]
            r1 = @access Relaxed test_locations[x_1 + 1]
        else
            @access Release test_locations[x_0 + 1] = T(1)
            @access Release test_locations[y_0 + 1] = T(1)
            r0 = @access Acquire test_locations[y_1 + 1]
            r1 = @access Acquire test_locations[x_1 + 1]
        end

        results_r1[id_1 + 1] = r1
        results_r0[id_1 + 1] = r0
    end
end

"""
    run_test_message_passing(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=false)

Launch the `test_message_passing` kernel `n_iterations` times on `backend` and count the
observed `(r0, r1)` outcomes.

`backend` is passed to `adapt`, to the kernel constructor and to
`KernelAbstractions.synchronize`.

# Keywords
- `n_iterations`: number of kernel launches; all buffers are zeroed before each launch.
- `n_pairs`: number of work-items. It must be 512 (workgroup size 256 times 2 testing
  workgroups): the kernel is launched with its default parameters and runs with
  `inbounds=true`, so any other value makes it index outside the result buffers or pair
  work-items incorrectly.
- `RELAXED`: `true` selects `@access Relaxed`, `false` selects `Release`/`Acquire`.
- `VERBOSE`: print a banner and a summary table.

# Returns
`(n00, n01, n10, n11, total)`, the counts of `(r0, r1)` equal to `(0, 0)`, `(0, 1)`,
`(1, 0)` and `(1, 1)` respectively, followed by their sum. `(1, 0)` is the weak outcome.

# Throws
- `ArgumentError` if `n_pairs` is not 512.
"""
function run_test_message_passing(
    backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=false
)
    # Must match the kernel defaults (wgXSize = 256, test_wg = 2) used by the launch below.
    workgroup_size = 256
    testing_workgroups = 2
    total_ids = workgroup_size * testing_workgroups
    if n_pairs != total_ids
        throw(ArgumentError(
            "n_pairs must be $total_ids (workgroup size * testing workgroups), got $n_pairs",
        ))
    end

    if VERBOSE
        println("\n" * "-" ^ 60)
        if RELAXED
            println("Litmus Test: Message Passing (@access Relaxed)\n")
        else
            println("Litmus Test: Message Passing (@access Acquire/Release)\n")
        end
    end

    test_locations = adapt(backend, zeros(Int32, 2048))
    results_r0 = adapt(backend, zeros(Int32, n_pairs))
    results_r1 = adapt(backend, zeros(Int32, n_pairs))

    total_r0_0_r1_0 = 0
    total_r0_0_r1_1 = 0
    total_r0_1_r1_0 = 0
    total_r0_1_r1_1 = 0

    for _ in 1:n_iterations
        fill!(test_locations, Int32(0))
        fill!(results_r0, Int32(0))
        fill!(results_r1, Int32(0))

        test_message_passing(backend)(
            test_locations, results_r0, results_r1,
            Int32(n_pairs), Val(RELAXED);
            ndrange=n_pairs, workgroupsize=workgroup_size
        )
        KernelAbstractions.synchronize(backend)

        # Copy each result buffer to the host once per launch instead of once per pair.
        r0_host = Array(results_r0)
        r1_host = Array(results_r1)

        # Both registers of a test instance share the same slot, so the two host
        # arrays are walked in lockstep.
        for (r0, r1) in zip(r0_host, r1_host)
            # Message Passing outcome classification (GPUHarbor categories)
            if r0 == 0 && r1 == 0
                total_r0_0_r1_0 += 1
            elseif r0 == 0 && r1 == 1
                total_r0_0_r1_1 += 1
            elseif r0 == 1 && r1 == 0
                total_r0_1_r1_0 += 1
            elseif r0 == 1 && r1 == 1
                total_r0_1_r1_1 += 1
            end
        end
    end

    total = total_r0_0_r1_0 + total_r0_0_r1_1 + total_r0_1_r1_0 + total_r0_1_r1_1

    if VERBOSE
        println(" ╔═══════════════════════════════════════════════════╗")
        println(" ║ RESULTS ($total total tests) ║")
        println(" ╠═══════════════════════════════════════════════════╣")
        println(" ║ r0=0, r1=0: (sequential) $(lpad(total_r0_0_r1_0, 10)) ($(lpad(round(100*total_r0_0_r1_0/total, digits=2), 5))%) ║")
        println(" ║ r0=1, r1=1: (sequential) $(lpad(total_r0_1_r1_1, 10)) ($(lpad(round(100*total_r0_1_r1_1/total, digits=2), 5))%) ║")
        println(" ║ r0=0, r1=1: (interleaved) $(lpad(total_r0_0_r1_1, 10)) ($(lpad(round(100*total_r0_0_r1_1/total, digits=2), 5))%) ║")
        println(" ║ r0=1, r1=0: (WEAK) $(lpad(total_r0_1_r1_0, 10)) ($(lpad(round(100*total_r0_1_r1_0/total, digits=2), 5))%) ║")
        println(" ╚═══════════════════════════════════════════════════╝")
    end
    return (total_r0_0_r1_0, total_r0_0_r1_1, total_r0_1_r1_0, total_r0_1_r1_1, total)
end

# >>> new file: test/tests/litmus/read.jl
# Litmus Test: Read
#
# Tests if two stores in one thread can be re-ordered according to a store and a load on a second thread
#
# Pattern:
#   Workgroup 0 Thread 0              Workgroup 1 Thread 0
#   0.1: atomicStore(x, 1)            1.1: atomicStore(y, 2)
#   0.2: atomicStore(y, 1)            1.2: let r0 = atomicLoad(x)
#
# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/read/read.wgsl

using Adapt
using KernelAbstractions
using KernelIntrinsics
using Test

# Read litmus kernel.
#
# Every work-item `i` derives `id_0 = i - 1` and a `perm1`-permuted partner id `id_1`
# in workgroup `new_workgroup`. Acting as `id_0` it stores 1 to `x_0` and then 1 to
# `y_0`; acting as `id_1` it stores 2 to `y_1` and then loads `x_1` into `r0`, which is
# written to `results_r0[id_1 + 1]`. `x` locations live at even offsets, `y` locations
# at odd, `perm2`-permuted offsets of `test_locations`.
#
# `RELAXED = true` uses `@access Relaxed`; otherwise stores use `Release` and the load
# uses `Acquire`. The order of the four accesses is the litmus pattern and must not
# change. The kernel runs with `inbounds=true`.
@kernel inbounds=true function test_read(
    test_locations::AbstractArray{T},
    results_r0::AbstractArray{T},
    n_pairs::T,
    ::Val{RELAXED}=Val(true),
    ::Val{wgXSize}=Val(256),
    ::Val{test_wg}=Val(2),
    ::Val{perm1}=Val(419),
    ::Val{perm2}=Val(1031),
    ::Val{stride}=Val(1)
) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}

    i = @index(Global, Linear)

    if i <= n_pairs
        local_invocation_id = (i - 1) % wgXSize
        shuffled_workgroup = (i - 1) ÷ wgXSize
        total_ids = wgXSize * test_wg
        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)

        # Read pattern: Store, Store, Store, Load
        x_0 = (id_0) * stride * 2
        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1
        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1
        x_1 = (id_1) * stride * 2

        if RELAXED
            @access Relaxed test_locations[x_0 + 1] = T(1)
            @access Relaxed test_locations[y_0 + 1] = T(1)
            @access Relaxed test_locations[y_1 + 1] = T(2)
            r0 = @access Relaxed test_locations[x_1 + 1]
        else
            @access Release test_locations[x_0 + 1] = T(1)
            @access Release test_locations[y_0 + 1] = T(1)
            @access Release test_locations[y_1 + 1] = T(2)
            r0 = @access Acquire test_locations[x_1 + 1]
        end

        results_r0[id_1 + 1] = r0
    end
end

"""
    run_test_read(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=false)

Launch the `test_read` kernel `n_iterations` times on `backend` and classify every test
instance by its `r0` register and the final value left in its `y` location.

`backend` is passed to `adapt`, to the kernel constructor and to
`KernelAbstractions.synchronize`.

# Keywords
- `n_iterations`: number of kernel launches; all buffers are zeroed before each launch.
- `n_pairs`: number of work-items. It must be 512 (workgroup size 256 times 2 testing
  workgroups): the kernel is launched with its default parameters and runs with
  `inbounds=true`, so any other value makes it index outside the result buffer or pair
  work-items incorrectly.
- `RELAXED`: `true` selects `@access Relaxed`, `false` selects `Release`/`Acquire`.
- `VERBOSE`: print a banner and a summary table.

# Returns
`(r0_1_y_2, r0_0_y_1, r0_1_y_1, r0_0_y_2, total)`, the counts of `(r0, y)` equal to
`(1, 2)`, `(0, 1)`, `(1, 1)` and `(0, 2)` respectively, followed by their sum.
`(0, 2)` is the weak outcome.

# Throws
- `ArgumentError` if `n_pairs` is not 512.
"""
function run_test_read(
    backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=false
)
    # Must match the kernel defaults (wgXSize, test_wg, perm2, stride) used by the launch.
    workgroup_size = 256
    testing_workgroups = 2
    permute_second = 1031
    mem_stride = 1
    total_ids = workgroup_size * testing_workgroups
    if n_pairs != total_ids
        throw(ArgumentError(
            "n_pairs must be $total_ids (workgroup size * testing workgroups), got $n_pairs",
        ))
    end

    if VERBOSE
        println("\n" * "-" ^ 60)
        if RELAXED
            println("Litmus Test: Read (@access Relaxed)\n")
        else
            println("Litmus Test: Read (@access Acquire/Release)\n")
        end
    end

    test_locations = adapt(backend, zeros(Int32, 2048))
    results_r0 = adapt(backend, zeros(Int32, n_pairs))

    r0_1_y_2 = 0
    r0_0_y_1 = 0
    r0_1_y_1 = 0
    r0_0_y_2 = 0

    for _ in 1:n_iterations
        fill!(test_locations, Int32(0))
        fill!(results_r0, Int32(0))

        test_read(backend)(
            test_locations, results_r0,
            Int32(n_pairs), Val(RELAXED);
            ndrange=n_pairs, workgroupsize=workgroup_size
        )
        KernelAbstractions.synchronize(backend)

        # Copy both buffers to the host once per launch instead of once per pair.
        test_cpu = Array(test_locations)
        r0_host = Array(results_r0)

        # id_0 is the 0-based id of work-item i = id_0 + 1 (WGSL reads r0 from id_0).
        for id_0 in 0:(n_pairs - 1)
            r0 = r0_host[id_0 + 1]

            # 1-based index of this instance's y_0 location (same formula as the kernel)
            y_0_addr = ((id_0 * permute_second) % total_ids) * mem_stride * 2 + 1 + 1

            # Read the actual final value at y_0 (GPUHarbor's *y)
            y_final = test_cpu[y_0_addr]

            # Categorize based on both r0 and final y_0 value (GPUHarbor categories)
            if r0 == 1 && y_final == 2
                r0_1_y_2 += 1
            elseif r0 == 0 && y_final == 1
                r0_0_y_1 += 1
            elseif r0 == 1 && y_final == 1
                r0_1_y_1 += 1
            elseif r0 == 0 && y_final == 2
                r0_0_y_2 += 1
            end
        end
    end

    total = r0_1_y_2 + r0_0_y_1 + r0_1_y_1 + r0_0_y_2

    if VERBOSE
        println(" ╔═══════════════════════════════════════════════════╗")
        println(" ║ RESULTS ($total total tests) ║")
        println(" ╠═══════════════════════════════════════════════════╣")
        println(" ║ r0=1, y=2: (sequential) $(lpad(r0_1_y_2, 10)) ($(lpad(round(100*r0_1_y_2/total, digits=2), 5))%) ║")
        println(" ║ r0=0, y=1: (sequential) $(lpad(r0_0_y_1, 10)) ($(lpad(round(100*r0_0_y_1/total, digits=2), 5))%) ║")
        println(" ║ r0=1, y=1: (interleaved) $(lpad(r0_1_y_1, 10)) ($(lpad(round(100*r0_1_y_1/total, digits=2), 5))%) ║")
        println(" ║ r0=0, y=2: (WEAK) $(lpad(r0_0_y_2, 10)) ($(lpad(round(100*r0_0_y_2/total, digits=2), 5))%) ║")
        println(" ╚═══════════════════════════════════════════════════╝")
    end
    return (r0_1_y_2, r0_0_y_1, r0_1_y_1, r0_0_y_2, total)
end

# >>> new file: test/tests/litmus/store.jl
# Litmus Test: Store
#
# Tests if two stores in one thread can be re-ordered according to a
# store and a load on a second thread.
#
# Pattern:
#   Workgroup 0 Thread 0              Workgroup 1 Thread 0
#   0.1: atomicStore(x, 2)            1.1: let r0 = atomicLoad(y)
#   0.2: atomicStore(y, 1)            1.2: atomicStore(x, 1)
#
# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/store/store.wgsl

using Adapt
using KernelAbstractions
using KernelIntrinsics
using Test

# Store litmus kernel.
#
# Every work-item `i` derives `id_0 = i - 1` and a `perm1`-permuted partner id `id_1`
# in workgroup `new_workgroup`. Acting as `id_0` it stores 2 to `x_0` and then 1 to
# `y_0`; acting as `id_1` it loads `y_1` into `r0` and then stores 1 to `x_1`. `r0` is
# written to `results_r0[id_1 + 1]`. `x` locations live at even offsets, `y` locations
# at odd, `perm2`-permuted offsets of `test_locations`.
#
# `RELAXED = true` uses `@access Relaxed`; otherwise stores use `Release` and the load
# uses `Acquire`. The order of the four accesses is the litmus pattern and must not
# change. The kernel runs with `inbounds=true`.
@kernel inbounds=true function test_store(
    test_locations::AbstractArray{T},
    results_r0::AbstractArray{T},
    n_pairs::T,
    ::Val{RELAXED}=Val(true),
    ::Val{wgXSize}=Val(256),   # workgroupXSize
    ::Val{test_wg}=Val(2),     # testing_workgroups
    ::Val{perm1}=Val(419),     # permute_first
    ::Val{perm2}=Val(1031),    # permute_second
    ::Val{stride}=Val(1)       # mem_stride
) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}

    i = @index(Global, Linear)

    if i <= n_pairs
        local_invocation_id = (i - 1) % wgXSize
        shuffled_workgroup = (i - 1) ÷ wgXSize
        total_ids = wgXSize * test_wg
        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)

        # Store pattern: Store, Store, Load, Store
        x_0 = (id_0) * stride * 2
        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1
        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1
        x_1 = (id_1) * stride * 2

        if RELAXED
            @access Relaxed test_locations[x_0 + 1] = T(2)
            @access Relaxed test_locations[y_0 + 1] = T(1)
            r0 = @access Relaxed test_locations[y_1 + 1]
            @access Relaxed test_locations[x_1 + 1] = T(1)
        else
            @access Release test_locations[x_0 + 1] = T(2)
            @access Release test_locations[y_0 + 1] = T(1)
            r0 = @access Acquire test_locations[y_1 + 1]
            @access Release test_locations[x_1 + 1] = T(1)
        end

        results_r0[id_1 + 1] = r0
    end
end

"""
    run_test_store(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=false)

Launch the `test_store` kernel `n_iterations` times on `backend` and classify every test
instance by its `r0` register and the final value left in its `x` location.

`backend` is passed to `adapt`, to the kernel constructor and to
`KernelAbstractions.synchronize`.

# Keywords
- `n_iterations`: number of kernel launches; all buffers are zeroed before each launch.
- `n_pairs`: number of work-items. It must be 512 (workgroup size 256 times 2 testing
  workgroups): the kernel is launched with its default parameters and runs with
  `inbounds=true`, so any other value makes it index outside the result buffer or pair
  work-items incorrectly.
- `RELAXED`: `true` selects `@access Relaxed`, `false` selects `Release`/`Acquire`.
- `VERBOSE`: print a banner and a summary table.

# Returns
`(r0_1_x_1, r0_0_x_2, r0_0_x_1, r0_1_x_2, total)`, the counts of `(r0, x)` equal to
`(1, 1)`, `(0, 2)`, `(0, 1)` and `(1, 2)` respectively, followed by their sum.
`(1, 2)` is the weak outcome.

# Throws
- `ArgumentError` if `n_pairs` is not 512.
"""
function run_test_store(
    backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=false
)
    # Must match the kernel defaults (wgXSize = 256, test_wg = 2, stride = 1).
    workgroup_size = 256
    testing_workgroups = 2
    mem_stride = 1
    total_ids = workgroup_size * testing_workgroups
    if n_pairs != total_ids
        throw(ArgumentError(
            "n_pairs must be $total_ids (workgroup size * testing workgroups), got $n_pairs",
        ))
    end

    if VERBOSE
        println("\n" * "-" ^ 60)
        if RELAXED
            println("Litmus Test: Store (@access Relaxed)\n")
        else
            println("Litmus Test: Store (@access Acquire/Release)\n")
        end
    end

    test_locations = adapt(backend, zeros(Int32, 2048))
    results_r0 = adapt(backend, zeros(Int32, n_pairs))

    r0_1_x_1 = 0
    r0_0_x_2 = 0
    r0_0_x_1 = 0
    r0_1_x_2 = 0

    for _ in 1:n_iterations
        fill!(test_locations, Int32(0))
        fill!(results_r0, Int32(0))

        test_store(backend)(
            test_locations, results_r0,
            Int32(n_pairs), Val(RELAXED);
            ndrange=n_pairs, workgroupsize=workgroup_size
        )
        KernelAbstractions.synchronize(backend)

        # Copy both buffers to the host once per launch instead of once per pair.
        test_cpu = Array(test_locations)
        r0_host = Array(results_r0)

        # id_0 is the 0-based id of work-item i = id_0 + 1 (WGSL reads r0 from id_0).
        for id_0 in 0:(n_pairs - 1)
            r0 = r0_host[id_0 + 1]

            # 1-based index of x_0 (GPUHarbor reads the final value from x_0, not x_1!)
            x_0_addr = id_0 * mem_stride * 2 + 1

            # Read the actual final value at x_0 (this is what GPUHarbor calls *x)
            x_final = test_cpu[x_0_addr]

            # Categorize based on both r0 and final x_0 value (GPUHarbor categories)
            if r0 == 1 && x_final == 1
                r0_1_x_1 += 1
            elseif r0 == 0 && x_final == 2
                r0_0_x_2 += 1
            elseif r0 == 0 && x_final == 1
                r0_0_x_1 += 1
            elseif r0 == 1 && x_final == 2
                r0_1_x_2 += 1
            end
        end
    end

    total = r0_1_x_1 + r0_0_x_2 + r0_0_x_1 + r0_1_x_2

    if VERBOSE
        println(" ╔═══════════════════════════════════════════════════╗")
        println(" ║ RESULTS ($total total tests) ║")
        println(" ╠═══════════════════════════════════════════════════╣")
        println(" ║ r0=1, x=1: (sequential) $(lpad(r0_1_x_1, 10)) ($(lpad(round(100*r0_1_x_1/total, digits=2), 5))%) ║")
        println(" ║ r0=0, x=2: (sequential) $(lpad(r0_0_x_2, 10)) ($(lpad(round(100*r0_0_x_2/total, digits=2), 5))%) ║")
        println(" ║ r0=0, x=1: (interleaved) $(lpad(r0_0_x_1, 10)) ($(lpad(round(100*r0_0_x_1/total, digits=2), 5))%) ║")
        println(" ║ r0=1, x=2: (WEAK) $(lpad(r0_1_x_2, 10)) ($(lpad(round(100*r0_1_x_2/total, digits=2), 5))%) ║")
        println(" ╚═══════════════════════════════════════════════════╝")
    end
    return (r0_1_x_1, r0_0_x_2, r0_0_x_1, r0_1_x_2, total)
end

# >>> new file: test/tests/litmus/store_buffer.jl
# Litmus Test: Store Buffer (SB)
#
# Tests if stores can be buffered and re-ordered on different threads.
# /!\ A release/acquire barrier is not enough to disallow this behavior.
#
# Pattern:
#   Workgroup 0 Thread 0              Workgroup 1 Thread 0
#   0.1: atomicStore(x, 1)            1.1: atomicStore(y, 1)
#   0.2: let r0 = atomicLoad(y)       1.2: let r1 = atomicLoad(x)
#
# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/sb/store-buffer.wgsl

using Adapt
using KernelAbstractions
using KernelIntrinsics
using Test

# Store Buffer litmus kernel.
#
# Every work-item `i` derives `id_0 = i - 1` and a `perm1`-permuted partner id `id_1`
# in workgroup `new_workgroup`. Acting as `id_0` it stores 1 to `x_0` and then loads
# `y_0` into `r0`; acting as `id_1` it stores 1 to `y_1` and then loads `x_1` into `r1`.
# `r0` is written to `results_r0[id_0 + 1]`, `r1` to `results_r1[id_1 + 1]`.
# `x` locations live at even offsets, `y` locations at odd, `perm2`-permuted offsets.
#
# `RELAXED = true` uses `@access Relaxed`; otherwise stores use `Release` and loads use
# `Acquire`. The order of the four accesses is the litmus pattern and must not change.
# The kernel runs with `inbounds=true`.
@kernel inbounds=true function test_store_buffer(
    test_locations::AbstractArray{T},
    results_r0::AbstractArray{T},
    results_r1::AbstractArray{T},
    n_pairs::T,
    ::Val{RELAXED}=Val(true),
    ::Val{wgXSize}=Val(256),
    ::Val{test_wg}=Val(2),
    ::Val{perm1}=Val(419),
    ::Val{perm2}=Val(1031),
    ::Val{stride}=Val(1)
) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}

    i = @index(Global, Linear)

    if i <= n_pairs
        local_invocation_id = (i - 1) % wgXSize
        shuffled_workgroup = (i - 1) ÷ wgXSize
        total_ids = wgXSize * test_wg
        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)

        # Store Buffer pattern: Store, Load, Store, Load
        x_0 = (id_0) * stride * 2
        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1
        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1
        x_1 = (id_1) * stride * 2

        if RELAXED
            @access Relaxed test_locations[x_0 + 1] = T(1)
            r0 = @access Relaxed test_locations[y_0 + 1]
            @access Relaxed test_locations[y_1 + 1] = T(1)
            r1 = @access Relaxed test_locations[x_1 + 1]
        else
            @access Release test_locations[x_0 + 1] = T(1)
            r0 = @access Acquire test_locations[y_0 + 1]
            @access Release test_locations[y_1 + 1] = T(1)
            r1 = @access Acquire test_locations[x_1 + 1]
        end

        results_r1[id_1 + 1] = r1
        results_r0[id_0 + 1] = r0
    end
end

"""
    run_test_store_buffer(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=true)

Launch the `test_store_buffer` kernel `n_iterations` times on `backend` and count the
observed `(r0, r1)` outcomes.

`backend` is passed to `adapt`, to the kernel constructor and to
`KernelAbstractions.synchronize`.

# Keywords
- `n_iterations`: number of kernel launches; all buffers are zeroed before each launch.
- `n_pairs`: number of work-items. It must be 512 (workgroup size 256 times 2 testing
  workgroups): the kernel is launched with its default parameters and runs with
  `inbounds=true`, so any other value makes it index outside the result buffers or pair
  work-items incorrectly.
- `RELAXED`: `true` selects `@access Relaxed`, `false` selects `Release`/`Acquire`.
- `VERBOSE`: print a banner and a summary table (defaults to `true` for this test).

# Returns
`(seq0, seq1, interleaved, weak, total)`, the counts of `(r0, r1)` equal to `(1, 0)`,
`(0, 1)`, `(1, 1)` and `(0, 0)` respectively, followed by their sum.

# Throws
- `ArgumentError` if `n_pairs` is not 512.
"""
function run_test_store_buffer(
    backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=true
)
    # Must match the kernel defaults (wgXSize = 256, test_wg = 2) used by the launch below.
    workgroup_size = 256
    testing_workgroups = 2
    total_ids = workgroup_size * testing_workgroups
    if n_pairs != total_ids
        throw(ArgumentError(
            "n_pairs must be $total_ids (workgroup size * testing workgroups), got $n_pairs",
        ))
    end

    if VERBOSE
        println("\n" * "-" ^ 60)
        if RELAXED
            println("Litmus Test: Store Buffer (@access Relaxed)\n")
        else
            println("Litmus Test: Store Buffer (@access Acquire/Release)\n")
        end
    end

    test_locations = adapt(backend, zeros(Int32, 2048))
    results_r0 = adapt(backend, zeros(Int32, n_pairs))
    results_r1 = adapt(backend, zeros(Int32, n_pairs))

    total_seq0 = 0
    total_seq1 = 0
    total_interleaved = 0
    total_weak = 0

    for _ in 1:n_iterations
        fill!(test_locations, Int32(0))
        fill!(results_r0, Int32(0))
        fill!(results_r1, Int32(0))

        test_store_buffer(backend)(
            test_locations, results_r0, results_r1,
            Int32(n_pairs), Val(RELAXED);
            ndrange=n_pairs, workgroupsize=workgroup_size
        )
        KernelAbstractions.synchronize(backend)

        # Copy each result buffer to the host once per launch instead of once per pair.
        r0_host = Array(results_r0)
        r1_host = Array(results_r1)

        # Both registers of a test instance are read at the same slot id_0 + 1
        # (WGSL reads both r0 and r1 from id_0).
        for (r0, r1) in zip(r0_host, r1_host)
            # Store Buffer outcome classification (GPUHarbor categories)
            if r0 == 1 && r1 == 0
                total_seq0 += 1
            elseif r0 == 0 && r1 == 1
                total_seq1 += 1
            elseif r0 == 1 && r1 == 1
                total_interleaved += 1
            elseif r0 == 0 && r1 == 0
                total_weak += 1
            end
        end
    end

    total = total_seq0 + total_seq1 + total_interleaved + total_weak

    if VERBOSE
        println(" ╔═══════════════════════════════════════════════════╗")
        println(" ║ RESULTS ($total total tests) ║")
        println(" ╠═══════════════════════════════════════════════════╣")
        println(" ║ r0=1, r1=0: (seq0) $(lpad(total_seq0, 10)) ($(lpad(round(100*total_seq0/total, digits=2), 5))%) ║")
        println(" ║ r0=0, r1=1: (seq1) $(lpad(total_seq1, 10)) ($(lpad(round(100*total_seq1/total, digits=2), 5))%) ║")
        println(" ║ r0=1, r1=1: (interleaved) $(lpad(total_interleaved, 10)) ($(lpad(round(100*total_interleaved/total, digits=2), 5))%) ║")
        println(" ║ r0=0, r1=0: (WEAK/SB) $(lpad(total_weak, 10)) ($(lpad(round(100*total_weak/total, digits=2), 5))%) ║")
        println(" ╚═══════════════════════════════════════════════════╝")
    end
    return (total_seq0, total_seq1, total_interleaved, total_weak, total)
end

# >>> new file: test/tests/litmus/write_2plus2w.jl
# Litmus Test: 2+2W (Write)
#
# Tests if two stores in two threads can both be re-ordered.
#
# Pattern:
#   Workgroup 0 Thread 0              Workgroup 1 Thread 0
#   0.1: atomicStore(x, 2)            1.1: atomicStore(y, 2)
#   0.2: atomicStore(y, 1)            1.2: atomicStore(x, 1)
#
# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/2+2/2+2-write.wgsl

using Adapt
using KernelAbstractions
using KernelIntrinsics
using Test

# 2+2W litmus kernel.
#
# Every work-item `i` derives `id_0 = i - 1` and a `perm1`-permuted partner id `id_1`
# in workgroup `new_workgroup`. Acting as `id_0` it stores 2 to `x_0` and then 1 to
# `y_0`; acting as `id_1` it stores 2 to `y_1` and then 1 to `x_1`. There are no loads
# and no result buffers: the outcome is read back from `test_locations` by the host.
# `x` locations live at even offsets, `y` locations at odd, `perm2`-permuted offsets.
#
# `RELAXED = true` uses `@access Relaxed`; otherwise every store uses `Release`.
# The order of the four stores is the litmus pattern and must not change.
# The kernel runs with `inbounds=true`.
@kernel inbounds=true function test_write_2plus2w(
    test_locations::AbstractArray{T},
    n_pairs::T,
    ::Val{RELAXED}=Val(true),
    ::Val{wgXSize}=Val(256),
    ::Val{test_wg}=Val(2),
    ::Val{perm1}=Val(419),
    ::Val{perm2}=Val(1031),
    ::Val{stride}=Val(1)
) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}

    i = @index(Global, Linear)

    if i <= n_pairs
        local_invocation_id = (i - 1) % wgXSize
        shuffled_workgroup = (i - 1) ÷ wgXSize
        total_ids = wgXSize * test_wg
        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)

        # 2+2W pattern: Store, Store, Store, Store (no loads)
        x_0 = (id_0) * stride * 2
        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1
        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1
        x_1 = (id_1) * stride * 2

        if RELAXED
            @access Relaxed test_locations[x_0 + 1] = T(2)
            @access Relaxed test_locations[y_0 + 1] = T(1)
            @access Relaxed test_locations[y_1 + 1] = T(2)
            @access Relaxed test_locations[x_1 + 1] = T(1)
        else
            @access Release test_locations[x_0 + 1] = T(2)
            @access Release test_locations[y_0 + 1] = T(1)
            @access Release test_locations[y_1 + 1] = T(2)
            @access Release test_locations[x_1 + 1] = T(1)
        end
    end
end

"""
    run_test_2plus2w(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=true)

Launch the `test_write_2plus2w` kernel `n_iterations` times on `backend` and classify
every test instance by the final values left in its `x` and `y` locations.

`backend` is passed to `adapt`, to the kernel constructor and to
`KernelAbstractions.synchronize`.

# Keywords
- `n_iterations`: number of kernel launches; the buffer is zeroed before each launch.
- `n_pairs`: number of work-items. It must be 512 (workgroup size 256 times 2 testing
  workgroups): with fewer work-items some locations are never written and would be
  counted by the catch-all branch as weak; with more, work-items are paired incorrectly
  or index outside `test_locations`.
- `RELAXED`: `true` selects `@access Relaxed`, `false` selects `@access Release`.
- `VERBOSE`: print a banner and a summary table (defaults to `true` for this test).

# Returns
`(x1_y2, x2_y1, x1_y1, x2_y2, total)`. The first three count `(x, y)` equal to `(1, 2)`,
`(2, 1)` and `(1, 1)`; `x2_y2` counts every other combination and is the weak outcome;
`total` is their sum.

# Throws
- `ArgumentError` if `n_pairs` is not 512.
"""
function run_test_2plus2w(
    backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=true
)
    # Must match the kernel defaults (wgXSize, test_wg, perm2, stride) used by the launch.
    workgroup_size = 256
    testing_workgroups = 2
    permute_second = 1031
    mem_stride = 1
    total_ids = workgroup_size * testing_workgroups
    if n_pairs != total_ids
        throw(ArgumentError(
            "n_pairs must be $total_ids (workgroup size * testing workgroups), got $n_pairs",
        ))
    end

    if VERBOSE
        println("\n" * "-" ^ 60)
        if RELAXED
            println("Litmus Test: 2+2W Write (@access Relaxed)\n")
        else
            println("Litmus Test: 2+2W Write (@access Release)\n")
        end
    end

    test_locations = adapt(backend, zeros(Int32, 2048))

    x1_y2 = 0
    x2_y1 = 0
    x1_y1 = 0
    x2_y2 = 0

    for _ in 1:n_iterations
        fill!(test_locations, Int32(0))

        test_write_2plus2w(backend)(
            test_locations,
            Int32(n_pairs), Val(RELAXED);
            ndrange=n_pairs, workgroupsize=workgroup_size
        )
        KernelAbstractions.synchronize(backend)

        test_cpu = Array(test_locations)

        # id_0 is the 0-based id of work-item i = id_0 + 1; it locates x_0 and y_0.
        for id_0 in 0:(n_pairs - 1)
            x_0_addr = id_0 * mem_stride * 2 + 1
            y_0_addr = ((id_0 * permute_second) % total_ids) * mem_stride * 2 + 1 + 1

            # Read the actual final values at x_0 and y_0 (GPUHarbor's *x and *y)
            mem_x_0 = test_cpu[x_0_addr]
            mem_y_0 = test_cpu[y_0_addr]

            # Categorize based on (mem_x_0, mem_y_0) pairs (GPUHarbor categories)
            if mem_x_0 == 1 && mem_y_0 == 2
                x1_y2 += 1
            elseif mem_x_0 == 2 && mem_y_0 == 1
                x2_y1 += 1
            elseif mem_x_0 == 1 && mem_y_0 == 1
                x1_y1 += 1
            else
                # Catch-all: with every location written this can only be (2, 2).
                x2_y2 += 1
            end
        end
    end

    total = x1_y2 + x2_y1 + x1_y1 + x2_y2

    if VERBOSE
        println(" ╔═══════════════════════════════════════════════════╗")
        println(" ║ RESULTS ($total iterations) ║")
        println(" ╠═══════════════════════════════════════════════════╣")
        println(" ║ x=1, y=2: (sequential) $(lpad(x1_y2, 10)) ($(lpad(round(100*x1_y2/total, digits=2), 5))%) ║")
        println(" ║ x=2, y=1: (sequential) $(lpad(x2_y1, 10)) ($(lpad(round(100*x2_y1/total, digits=2), 5))%) ║")
        println(" ║ x=1, y=1: (interleaved) $(lpad(x1_y1, 10)) ($(lpad(round(100*x1_y1/total, digits=2), 5))%) ║")
        println(" ║ x=2, y=2: (weak) $(lpad(x2_y2, 10)) ($(lpad(round(100*x2_y2/total, digits=2), 5))%) ║")
        println(" ╚═══════════════════════════════════════════════════╝")
    end
    return (x1_y2, x2_y1, x1_y1, x2_y2, total)
end

# >>> new file: test/tests/memory_ordering.jl
using Test

# Memory ordering test verbosity: Enable with VERBOSE_MEMORY_ORDERING=true
const VERBOSE = get(ENV, "VERBOSE_MEMORY_ORDERING", "false") == "true"

include("litmus/message_passing.jl")
include("litmus/store.jl")
include("litmus/read.jl")
include("litmus/load_buffer.jl")
include("litmus/store_buffer.jl")
include("litmus/write_2plus2w.jl")

# Each test set runs the litmus test twice. The Relaxed run is informational only (its
# table is printed when VERBOSE is set); only the Acquire/Release run is asserted on,
# except for Store Buffer where the two runs are compared.
# `backend` is not defined in this file; it is expected to be defined by the including
# test script.
@testset "Memory Ordering Litmus Tests" begin
    @testset "Message Passing" begin
        run_test_message_passing(
            backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=VERBOSE)
        # Weak outcome (r0=1, r1=0) is the third returned count.
        _, _, weak_strong, _, _ = run_test_message_passing(
            backend; n_iterations=100, n_pairs=512, RELAXED=false, VERBOSE=VERBOSE)

        @test weak_strong == 0
    end

    @testset "Store" begin
        run_test_store(
            backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=VERBOSE)
        _, _, _, weak_strong, _ = run_test_store(
            backend; n_iterations=100, n_pairs=512, RELAXED=false, VERBOSE=VERBOSE)

        @test weak_strong == 0
    end

    @testset "Read" begin
        run_test_read(
            backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=VERBOSE)
        _, _, _, weak_strong, _ = run_test_read(
            backend; n_iterations=100, n_pairs=512, RELAXED=false, VERBOSE=VERBOSE)

        # NOTE(review): in the C/C++11 model a release store followed by an acquire load
        # may still be reordered, so release/acquire alone does not forbid this weak
        # outcome. Confirm that `@access Release/Acquire` is strong enough on every
        # target backend, otherwise this assertion can fail spuriously.
        @test weak_strong == 0
    end

    @testset "Load Buffer" begin
        run_test_load_buffer(
            backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=VERBOSE)
        _, _, _, weak_strong, _ = run_test_load_buffer(
            backend; n_iterations=100, n_pairs=512, RELAXED=false, VERBOSE=VERBOSE)

        @test weak_strong == 0
    end

    @testset "Store Buffer" begin
        _, _, _, weak_relaxed, _ = run_test_store_buffer(
            backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=VERBOSE)
        _, _, _, weak_strong, _ = run_test_store_buffer(
            backend; n_iterations=100, n_pairs=512, RELAXED=false, VERBOSE=VERBOSE)

        # Store Buffer: Acquire/Release does NOT prevent all weak behaviors
        # (see GPUHarbor: "release/acquire barrier is not enough to disallow this behavior")
        # We only verify that Acquire/Release reduces weak vs Relaxed
        @test weak_strong <= weak_relaxed
    end

    @testset "2+2 Write" begin
        run_test_2plus2w(
            backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=VERBOSE)
        _, _, _, weak_strong, _ = run_test_2plus2w(
            backend; n_iterations=100, n_pairs=512, RELAXED=false, VERBOSE=VERBOSE)

        # NOTE(review): with stores only there is no acquire to synchronize with, so in
        # the C/C++11 model release ordering alone does not forbid x=2, y=2. Confirm
        # that `@access Release` is strong enough on every target backend.
        @test weak_strong == 0
    end
end