diff --git a/Project.toml b/Project.toml
index 868d0a9..593bfdf 100644
--- a/Project.toml
+++ b/Project.toml
@@ -23,8 +23,9 @@ LLVM = "9"
 julia = "1.10"
 
 [extras]
+Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 
 [targets]
-test = ["Pkg", "Test"]
+test = ["Adapt", "Pkg", "Test"]
diff --git a/test/general_routine.jl b/test/general_routine.jl
index d65b009..b7a392d 100644
--- a/test/general_routine.jl
+++ b/test/general_routine.jl
@@ -3,4 +3,17 @@ include("tests/$TEST_BACKEND/access_fences.jl")
 include("tests/$TEST_BACKEND/vectorization_test.jl")
 
 include("tests/shfl.jl")
-include("tests/vectorization_custom_test.jl")
\ No newline at end of file
+include("tests/vectorization_custom_test.jl")
+
+# Opt-in memory ordering tests ('TEST_MEMORY_ORDERING=true julia'); they need check-bounds left at 'auto' (0)
+if get(ENV, "TEST_MEMORY_ORDERING", "false") == "true"
+    if Base.JLOptions().check_bounds == 0
+        include("tests/memory_ordering.jl")
+    else
+        @warn """
+            Bounds checking not set to 'auto' (current value: $(Base.JLOptions().check_bounds))
+                     Memory ordering tests may show inaccurate results and are skipped
+                     Run with `TEST_MEMORY_ORDERING=true julia --project -e 'using Pkg; Pkg.test(julia_args=[\"--check-bounds=auto\"])'`
+            """
+    end
+end
diff --git a/test/tests/litmus/load_buffer.jl b/test/tests/litmus/load_buffer.jl
new file mode 100644
index 0000000..dfe9c9f
--- /dev/null
+++ b/test/tests/litmus/load_buffer.jl
@@ -0,0 +1,251 @@
+# Litmus Test: Load Buffer (LB)
+#
+# Tests if loads can be buffered and re-ordered on different threads.
+#
+# Pattern: 
+#   Workgroup 0 Thread 0                Workgroup 1 Thread 0
+#       0.1: let r0 = atomicLoad(y)         1.1: let r1 = atomicLoad(x)
+#       0.2: atomicStore(x, 1)              1.2: atomicStore(y, 1)
+#
+# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/lb/load-buffer.wgsl
+#
+# (See end of file for complete example)
+
+using Adapt
+using KernelAbstractions
+using KernelIntrinsics
+using Test
+
+# Load Buffer (LB) litmus kernel.
+#
+# Arguments:
+#   test_locations - buffer holding the x/y test locations
+#   results_r0     - receives r0, written at index id_0 + 1
+#   results_r1     - receives r1, written at index id_1 + 1
+#   n_pairs        - work-items whose global index exceeds this do nothing
+#   Val{RELAXED}   - true: every access uses `@access Relaxed`;
+#                    false: loads use `@access Acquire`, stores use `@access Release`
+#   Val{wgXSize}, Val{test_wg}, Val{perm1}, Val{perm2}, Val{stride}
+#                  - compile-time constants driving the id/address shuffling below
+#
+# NOTE(review): declared with inbounds=true; the launch site presumably has to size all
+# three buffers so that every index computed here is valid - confirm against the caller.
+@kernel inbounds=true function test_load_buffer(
+    test_locations::AbstractArray{T},
+    results_r0::AbstractArray{T},
+    results_r1::AbstractArray{T},
+    n_pairs::T,
+    ::Val{RELAXED}=Val(true),
+    ::Val{wgXSize}=Val(256),    # workgroupXSize
+    ::Val{test_wg}=Val(2),      # testing_workgroups
+    ::Val{perm1}=Val(419),      # permute_first
+    ::Val{perm2}=Val(1031),     # permute_second
+    ::Val{stride}=Val(1)        # mem_stride
+) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}
+
+    # Global linear index of this work-item; used as 1-based below (i - 1 is the 0-based id)
+    i = @index(Global, Linear)
+
+    if i <= n_pairs
+        # Split the 0-based id into a position inside a group of wgXSize ids and the group number
+        local_invocation_id = (i - 1) % wgXSize
+        shuffled_workgroup = (i - 1) ÷ wgXSize
+        total_ids = wgXSize * test_wg
+        # id_0 recombines the two parts, i.e. it equals i - 1
+        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
+        # Partner id: shifted by 1 + (local id mod (test_wg - 1)) groups, modulo test_wg,
+        # with the local position permuted by perm1
+        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
+        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)
+
+        # Load Buffer pattern: Load, Store, Load, Store
+        # 0-based offsets: y uses even slots (id * stride * 2), x uses odd slots of the
+        # perm2-permuted id; the "+ 1" at each access converts to 1-based indexing
+        y_0 = (id_0) * stride * 2
+        x_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1  # location_offset
+        x_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1  # location_offset
+        y_1 = (id_1) * stride * 2
+
+        # The program order of these four accesses is the litmus pattern itself; do not reorder
+        if RELAXED
+            r0 = @access Relaxed test_locations[y_0 + 1]
+            @access Relaxed test_locations[x_0 + 1] = T(1)
+            r1 = @access Relaxed test_locations[x_1 + 1]
+            @access Relaxed test_locations[y_1 + 1] = T(1)
+        else
+            r0 = @access Acquire test_locations[y_0 + 1]
+            @access Release test_locations[x_0 + 1] = T(1)
+            r1 = @access Acquire test_locations[x_1 + 1]
+            @access Release test_locations[y_1 + 1] = T(1)
+        end
+
+        # r1 is recorded under the partner id, r0 under this work-item's own id
+        results_r1[id_1 + 1] = r1
+        results_r0[id_0 + 1] = r0
+    end
+end
+
+"""
+    run_test_load_buffer(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=false)
+
+Launch the `test_load_buffer` kernel `n_iterations` times on `backend` and tally the
+observed `(r0, r1)` outcomes over all launches.
+
+# Keywords
+- `n_iterations`: number of kernel launches; all buffers are zeroed before each one.
+- `n_pairs`: `ndrange` of every launch and number of results classified per launch.
+- `RELAXED`: forwarded to the kernel as `Val(RELAXED)`.
+- `VERBOSE`: print a banner and a summary table.
+
+# Returns
+`(total_seq0, total_seq1, total_interleaved, total_weak, total)`: the number of `(r0, r1)`
+pairs equal to `(1, 0)`, `(0, 1)`, `(0, 0)` and `(1, 1)` respectively, followed by their sum.
+
+# Throws
+- `ArgumentError` if `n_pairs` is negative.
+"""
+function run_test_load_buffer(backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=false)
+    n_pairs >= 0 || throw(ArgumentError("n_pairs must be non-negative, got $n_pairs"))
+
+    if VERBOSE
+        println("\n" * "-" ^ 60)
+        if RELAXED
+            println("Litmus Test: Load Buffer (@access Relaxed)\n")
+        else
+            println("Litmus Test: Load Buffer (@access Acquire/Release)\n")
+        end
+    end
+
+    # Launch geometry; must agree with the kernel's default Val parameters
+    # (wgXSize = 256, test_wg = 2).
+    workgroupXSize = 256
+    testing_workgroups = 2
+    total_ids = workgroupXSize * testing_workgroups
+
+    # The kernel is declared with inbounds=true, so every index it can produce must be
+    # valid: id_0 spans 0:n_pairs-1 while id_1 spans 0:total_ids-1 whatever n_pairs is.
+    # With the default n_pairs = 512 these sizes equal the historical 2048 / n_pairs.
+    n_results = max(n_pairs, total_ids)
+    test_locations = adapt(backend, zeros(Int32, max(2048, 2 * n_results)))
+    results_r0 = adapt(backend, zeros(Int32, n_results))
+    results_r1 = adapt(backend, zeros(Int32, n_results))
+
+    total_seq0 = 0
+    total_seq1 = 0
+    total_interleaved = 0
+    total_weak = 0
+
+    for _ in 1:n_iterations
+        fill!(test_locations, Int32(0))
+        fill!(results_r0, Int32(0))
+        fill!(results_r1, Int32(0))
+
+        test_load_buffer(backend)(
+            test_locations, results_r0, results_r1,
+            Int32(n_pairs), Val(RELAXED);
+            ndrange=n_pairs, workgroupsize=workgroupXSize
+        )
+        KernelAbstractions.synchronize(backend)
+
+        # Materialize each result buffer as a host Array once per launch, not once per pair
+        host_r0 = Array(results_r0)
+        host_r1 = Array(results_r1)
+
+        for i in 1:n_pairs
+            # Work-item i has id_0 = i - 1, so both results are read at 1-based index i
+            r0 = host_r0[i]
+            r1 = host_r1[i]
+
+            # Load Buffer outcome classification (GPUHarbor categories)
+            if r0 == 1 && r1 == 0
+                total_seq0 += 1
+            elseif r0 == 0 && r1 == 1
+                total_seq1 += 1
+            elseif r0 == 0 && r1 == 0
+                total_interleaved += 1
+            elseif r0 == 1 && r1 == 1
+                total_weak += 1
+            end
+        end
+    end
+
+    total = total_seq0 + total_seq1 + total_interleaved + total_weak
+
+    if VERBOSE
+        println("  ╔═══════════════════════════════════════════════════╗")
+        println("  ║ RESULTS ($total total tests)                       ║")
+        println("  ╠═══════════════════════════════════════════════════╣")
+        println("  ║ r0=1, r1=0: (seq0)         $(lpad(total_seq0, 10)) ($(lpad(round(100*total_seq0/total, digits=2), 5))%)    ║")
+        println("  ║ r0=0, r1=1: (seq1)         $(lpad(total_seq1, 10)) ($(lpad(round(100*total_seq1/total, digits=2), 5))%)    ║")
+        println("  ║ r0=0, r1=0: (interleaved)  $(lpad(total_interleaved, 10)) ($(lpad(round(100*total_interleaved/total, digits=2), 5))%)    ║")
+        println("  ║ r0=1, r1=1: (WEAK/LB)      $(lpad(total_weak, 10)) ($(lpad(round(100*total_weak/total, digits=2), 5))%)    ║")
+        println("  ╚═══════════════════════════════════════════════════╝")
+    end
+    return (total_seq0, total_seq1, total_interleaved, total_weak, total)
+end
+
+"""
+As illustration we will represent the different memory operations on threads 0 and 256.
+
+Here are the different memory locations calculated as per the kernel.
+
+┌─────┬──────┬──────┬──────┬──────┬──────┬──────┐
+│  i  │ id_0 │ id_1 │ y_0  │ x_0  │ x_1  │ y_1  │
+├─────┼──────┼──────┼──────┼──────┼──────┼──────┤
+│  1  │   0  │ 256  │   0  │   1  │ 513  │ 512  │
+├─────┼──────┼──────┼──────┼──────┼──────┼──────┤
+│ 257 │  256 │   0  │ 512  │ 513  │   1  │   0  │
+└─────┴──────┴──────┴──────┴──────┴──────┴──────┘
+
+We represent the different addresses with boxes, in the following order:
+    Memory addresses in test_locations :  0     1    512   513
+    Memory addresses in results_r0     :  0     256
+    Memory addresses in results_r1     :  0     256
+
+We will illustrate the case where all operations on thread 0 happen before thread 256 and are ordered
+
+Step 1: Thread (i=1) loads r0 from location y_0=0
+┌─────┬─────┬─────┬─────┐
+│  0  │  0  │  0  │  0  │
+└─────┴─────┴─────┴─────┘
+   ↓
+  r_0
+
+Step 2: Thread (i=1) stores 1 to location x_0=1
+┌─────┬─────┬─────┬─────┐
+│  0  │  1  │  0  │  0  │
+└─────┴─────┴─────┴─────┘
+         ↑
+         1
+
+Step 3: Thread (i=1) loads r1 from location x_1=513
+┌─────┬─────┬─────┬─────┐
+│  0  │  1  │  0  │  0  │
+└─────┴─────┴─────┴─────┘
+                     ↓
+                    r_1
+
+Step 4: Thread (i=1) stores 1 to location y_1=512
+┌─────┬─────┬─────┬─────┐
+│  0  │  1  │  1  │  0  │
+└─────┴─────┴─────┴─────┘
+               ↑
+               1
+
+Step 5: Thread (i=1) stores r1 to location id_1=256 of results_r1
+┌─────┬─────┐
+│  0  │  0  │
+└─────┴─────┘
+         ↑
+        r_1
+
+Step 6: Thread (i=1) stores r0 to location id_0=0 of results_r0
+┌─────┬─────┐
+│  0  │  0  │
+└─────┴─────┘
+   ↑
+  r_0
+
+Step 7: Thread (i=257) loads r0 from location y_0=512
+┌─────┬─────┬─────┬─────┐
+│  0  │  1  │  1  │  0  │
+└─────┴─────┴─────┴─────┘
+               ↓
+              r_0
+
+Step 8: Thread (i=257) stores 1 to location x_0=513
+┌─────┬─────┬─────┬─────┐
+│  0  │  1  │  1  │  1  │
+└─────┴─────┴─────┴─────┘
+                     ↑
+                     1
+
+Step 9: Thread (i=257) loads r1 from location x_1=1
+┌─────┬─────┬─────┬─────┐
+│  0  │  1  │  1  │  1  │
+└─────┴─────┴─────┴─────┘
+         ↓
+        r_1
+    
+Step 10: Thread (i=257) stores 1 to location y_1=0
+┌─────┬─────┬─────┬─────┐
+│  1  │  1  │  1  │  1  │
+└─────┴─────┴─────┴─────┘
+   ↑
+   1
+             
+Step 11: Thread (i=257) stores r1 to location id_1=0 of results_r1
+┌─────┬─────┐
+│  1  │  0  │
+└─────┴─────┘
+   ↑
+  r_1
+
+Step 12: Thread (i=257) stores r0 to location id_0=256 of results_r0
+┌─────┬─────┐
+│  0  │  1  │
+└─────┴─────┘
+         ↑
+        r_0
+
+Step 13: results_r0 and results_r1 are read at location id_0=0 for categorization
+    r_0_test = 0
+    r_1_test = 1
+
+This is categorized as 'sequential' (the seq1 bucket: r0=0, r1=1).
+
+
+Here, if Step 10 happened before Step 1 (reordering) then 
+    r_0_test = 1
+    r_1_test = 1
+
+This is categorized as 'weak'.
+"""
diff --git a/test/tests/litmus/message_passing.jl b/test/tests/litmus/message_passing.jl
new file mode 100644
index 0000000..273aeb8
--- /dev/null
+++ b/test/tests/litmus/message_passing.jl
@@ -0,0 +1,131 @@
+# Litmus Test: Message Passing (MP)
+#
+# Tests if two stores in one thread can be re-ordered according to loads on a second thread.
+#
+# Pattern: 
+#   Workgroup 0 Thread 0                Workgroup 1 Thread 0
+#       0.1: atomicStore(x, 1)              1.1: let r0 = atomicLoad(y)
+#       0.2: atomicStore(y, 1)              1.2: let r1 = atomicLoad(x)
+#
+# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/mp/message-passing.wgsl
+
+using Adapt
+using KernelAbstractions
+using KernelIntrinsics
+using Test
+
+# Message Passing (MP) litmus kernel.
+#
+# Arguments:
+#   test_locations - buffer holding the x/y test locations
+#   results_r0     - receives r0, written at index id_1 + 1
+#   results_r1     - receives r1, written at index id_1 + 1
+#   n_pairs        - work-items whose global index exceeds this do nothing
+#   Val{RELAXED}   - true: every access uses `@access Relaxed`;
+#                    false: stores use `@access Release`, loads use `@access Acquire`
+#   Val{wgXSize}, Val{test_wg}, Val{perm1}, Val{perm2}, Val{stride}
+#                  - compile-time constants driving the id/address shuffling below
+#
+# NOTE(review): declared with inbounds=true; the launch site presumably has to size all
+# three buffers so that every index computed here is valid - confirm against the caller.
+@kernel inbounds=true function test_message_passing(
+    test_locations::AbstractArray{T},
+    results_r0::AbstractArray{T},
+    results_r1::AbstractArray{T},
+    n_pairs::T,
+    ::Val{RELAXED}=Val(true),
+    ::Val{wgXSize}=Val(256),
+    ::Val{test_wg}=Val(2),
+    ::Val{perm1}=Val(419),
+    ::Val{perm2}=Val(1031),
+    ::Val{stride}=Val(1)
+) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}
+
+    # Global linear index of this work-item; used as 1-based below (i - 1 is the 0-based id)
+    i = @index(Global, Linear)
+
+    if i <= n_pairs
+        # Split the 0-based id into a position inside a group of wgXSize ids and the group number
+        local_invocation_id = (i - 1) % wgXSize
+        shuffled_workgroup = (i - 1) ÷ wgXSize
+        total_ids = wgXSize * test_wg
+        # id_0 recombines the two parts, i.e. it equals i - 1
+        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
+        # Partner id: shifted by 1 + (local id mod (test_wg - 1)) groups, modulo test_wg,
+        # with the local position permuted by perm1
+        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
+        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)
+
+        # Message Passing pattern: Store, Store, Load, Load
+        # 0-based offsets: x uses even slots (id * stride * 2), y uses odd slots of the
+        # perm2-permuted id; the "+ 1" at each access converts to 1-based indexing
+        x_0 = (id_0) * stride * 2
+        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1  # location_offset
+        x_1 = (id_1) * stride * 2
+        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1  # location_offset
+
+        # The program order of these four accesses is the litmus pattern itself; do not reorder
+        if RELAXED
+            @access Relaxed test_locations[x_0 + 1] = T(1)
+            @access Relaxed test_locations[y_0 + 1] = T(1)
+            r0 = @access Relaxed test_locations[y_1 + 1]
+            r1 = @access Relaxed test_locations[x_1 + 1]
+        else
+            @access Release test_locations[x_0 + 1] = T(1)
+            @access Release test_locations[y_0 + 1] = T(1)
+            r0 = @access Acquire test_locations[y_1 + 1]
+            r1 = @access Acquire test_locations[x_1 + 1]
+        end
+
+        # Both loaded values are recorded under the partner id
+        results_r1[id_1 + 1] = r1
+        results_r0[id_1 + 1] = r0
+    end
+end
+
+"""
+    run_test_message_passing(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=false)
+
+Launch the `test_message_passing` kernel `n_iterations` times on `backend` and tally the
+observed `(r0, r1)` outcomes over all launches.
+
+# Keywords
+- `n_iterations`: number of kernel launches; all buffers are zeroed before each one.
+- `n_pairs`: `ndrange` of every launch and number of results classified per launch.
+- `RELAXED`: forwarded to the kernel as `Val(RELAXED)`.
+- `VERBOSE`: print a banner and a summary table.
+
+# Returns
+`(total_r0_0_r1_0, total_r0_0_r1_1, total_r0_1_r1_0, total_r0_1_r1_1, total)`: the number
+of `(r0, r1)` pairs equal to `(0, 0)`, `(0, 1)`, `(1, 0)` and `(1, 1)` respectively,
+followed by their sum.
+
+# Throws
+- `ArgumentError` if `n_pairs` is negative.
+"""
+function run_test_message_passing(backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=false)
+    n_pairs >= 0 || throw(ArgumentError("n_pairs must be non-negative, got $n_pairs"))
+
+    if VERBOSE
+        println("\n" * "-" ^ 60)
+        if RELAXED
+            println("Litmus Test: Message Passing (@access Relaxed)\n")
+        else
+            println("Litmus Test: Message Passing (@access Acquire/Release)\n")
+        end
+    end
+
+    # Launch geometry; must agree with the kernel's default Val parameters
+    # (wgXSize = 256, test_wg = 2).
+    workgroupXSize = 256
+    testing_workgroups = 2
+    total_ids = workgroupXSize * testing_workgroups
+
+    # The kernel is declared with inbounds=true, so every index it can produce must be
+    # valid: id_0 spans 0:n_pairs-1 while id_1 spans 0:total_ids-1 whatever n_pairs is.
+    # With the default n_pairs = 512 these sizes equal the historical 2048 / n_pairs.
+    n_results = max(n_pairs, total_ids)
+    test_locations = adapt(backend, zeros(Int32, max(2048, 2 * n_results)))
+    results_r0 = adapt(backend, zeros(Int32, n_results))
+    results_r1 = adapt(backend, zeros(Int32, n_results))
+
+    total_r0_0_r1_0 = 0
+    total_r0_0_r1_1 = 0
+    total_r0_1_r1_0 = 0
+    total_r0_1_r1_1 = 0
+
+    for _ in 1:n_iterations
+        fill!(test_locations, Int32(0))
+        fill!(results_r0, Int32(0))
+        fill!(results_r1, Int32(0))
+
+        test_message_passing(backend)(
+            test_locations, results_r0, results_r1,
+            Int32(n_pairs), Val(RELAXED);
+            ndrange=n_pairs, workgroupsize=workgroupXSize
+        )
+        KernelAbstractions.synchronize(backend)
+
+        # Materialize each result buffer as a host Array once per launch, not once per pair
+        host_r0 = Array(results_r0)
+        host_r1 = Array(results_r1)
+
+        for i in 1:n_pairs
+            # Results are read at id_0 = i - 1, i.e. at 1-based index i
+            r0 = host_r0[i]
+            r1 = host_r1[i]
+
+            # Message Passing outcome classification (GPUHarbor categories)
+            if r0 == 0 && r1 == 0
+                total_r0_0_r1_0 += 1
+            elseif r0 == 0 && r1 == 1
+                total_r0_0_r1_1 += 1
+            elseif r0 == 1 && r1 == 0
+                total_r0_1_r1_0 += 1
+            elseif r0 == 1 && r1 == 1
+                total_r0_1_r1_1 += 1
+            end
+        end
+    end
+
+    total = total_r0_0_r1_0 + total_r0_0_r1_1 + total_r0_1_r1_0 + total_r0_1_r1_1
+
+    if VERBOSE
+        println("  ╔═══════════════════════════════════════════════════╗")
+        println("  ║ RESULTS ($total total tests)                       ║")
+        println("  ╠═══════════════════════════════════════════════════╣")
+        println("  ║ r0=0, r1=0: (sequential)    $(lpad(total_r0_0_r1_0, 10)) ($(lpad(round(100*total_r0_0_r1_0/total, digits=2), 5))%)   ║")
+        println("  ║ r0=1, r1=1: (sequential)    $(lpad(total_r0_1_r1_1, 10)) ($(lpad(round(100*total_r0_1_r1_1/total, digits=2), 5))%)   ║")
+        println("  ║ r0=0, r1=1: (interleaved)   $(lpad(total_r0_0_r1_1, 10)) ($(lpad(round(100*total_r0_0_r1_1/total, digits=2), 5))%)   ║")
+        println("  ║ r0=1, r1=0: (WEAK)          $(lpad(total_r0_1_r1_0, 10)) ($(lpad(round(100*total_r0_1_r1_0/total, digits=2), 5))%)   ║")
+        println("  ╚═══════════════════════════════════════════════════╝")
+    end
+    return (total_r0_0_r1_0, total_r0_0_r1_1, total_r0_1_r1_0, total_r0_1_r1_1, total)
+end
\ No newline at end of file
diff --git a/test/tests/litmus/read.jl b/test/tests/litmus/read.jl
new file mode 100644
index 0000000..ed937c5
--- /dev/null
+++ b/test/tests/litmus/read.jl
@@ -0,0 +1,136 @@
+# Litmus Test: Read
+#
+# Tests if two stores in one thread can be re-ordered according to a store and a load on a second thread
+#
+# Pattern:
+#   Workgroup 0 Thread 0                Workgroup 1 Thread 0
+#       0.1: atomicStore(x, 1)              1.1: atomicStore(y, 2)
+#       0.2: atomicStore(y, 1)              1.2: let r0 = atomicLoad(x)
+#
+# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/read/read.wgsl
+
+using Adapt
+using KernelAbstractions
+using KernelIntrinsics
+using Test
+
+# Read litmus kernel.
+#
+# Arguments:
+#   test_locations - buffer holding the x/y test locations
+#   results_r0     - receives r0, written at index id_1 + 1
+#   n_pairs        - work-items whose global index exceeds this do nothing
+#   Val{RELAXED}   - true: every access uses `@access Relaxed`;
+#                    false: stores use `@access Release`, the load uses `@access Acquire`
+#   Val{wgXSize}, Val{test_wg}, Val{perm1}, Val{perm2}, Val{stride}
+#                  - compile-time constants driving the id/address shuffling below
+#
+# NOTE(review): declared with inbounds=true; the launch site presumably has to size both
+# buffers so that every index computed here is valid - confirm against the caller.
+@kernel inbounds=true function test_read(
+    test_locations::AbstractArray{T},
+    results_r0::AbstractArray{T},
+    n_pairs::T,
+    ::Val{RELAXED}=Val(true),
+    ::Val{wgXSize}=Val(256),
+    ::Val{test_wg}=Val(2),
+    ::Val{perm1}=Val(419),
+    ::Val{perm2}=Val(1031),
+    ::Val{stride}=Val(1)
+) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}
+
+    # Global linear index of this work-item; used as 1-based below (i - 1 is the 0-based id)
+    i = @index(Global, Linear)
+
+    if i <= n_pairs
+        # Split the 0-based id into a position inside a group of wgXSize ids and the group number
+        local_invocation_id = (i - 1) % wgXSize
+        shuffled_workgroup = (i - 1) ÷ wgXSize
+        total_ids = wgXSize * test_wg
+        # id_0 recombines the two parts, i.e. it equals i - 1
+        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
+        # Partner id: shifted by 1 + (local id mod (test_wg - 1)) groups, modulo test_wg,
+        # with the local position permuted by perm1
+        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
+        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)
+
+        # Read pattern: Store, Store, Store, Load
+        # 0-based offsets: x uses even slots (id * stride * 2), y uses odd slots of the
+        # perm2-permuted id; the "+ 1" at each access converts to 1-based indexing
+        x_0 = (id_0) * stride * 2
+        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1
+        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1
+        x_1 = (id_1) * stride * 2
+
+        # The program order of these four accesses is the litmus pattern itself; do not reorder
+        if RELAXED
+            @access Relaxed test_locations[x_0 + 1] = T(1)
+            @access Relaxed test_locations[y_0 + 1] = T(1)
+            @access Relaxed test_locations[y_1 + 1] = T(2)
+            r0 = @access Relaxed test_locations[x_1 + 1]
+        else
+            @access Release test_locations[x_0 + 1] = T(1)
+            @access Release test_locations[y_0 + 1] = T(1)
+            @access Release test_locations[y_1 + 1] = T(2)
+            r0 = @access Acquire test_locations[x_1 + 1]
+        end
+
+        # The loaded value is recorded under the partner id
+        results_r0[id_1 + 1] = r0
+    end
+end
+
+"""
+    run_test_read(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=false)
+
+Launch the `test_read` kernel `n_iterations` times on `backend` and tally the observed
+outcomes, where each outcome pairs the recorded `r0` with the final value left at the
+matching `y_0` location of `test_locations`.
+
+# Keywords
+- `n_iterations`: number of kernel launches; all buffers are zeroed before each one.
+- `n_pairs`: `ndrange` of every launch and number of results classified per launch.
+- `RELAXED`: forwarded to the kernel as `Val(RELAXED)`.
+- `VERBOSE`: print a banner and a summary table.
+
+# Returns
+`(r0_1_y_2, r0_0_y_1, r0_1_y_1, r0_0_y_2, total)`: the number of `(r0, y)` pairs equal to
+`(1, 2)`, `(0, 1)`, `(1, 1)` and `(0, 2)` respectively, followed by their sum.
+
+# Throws
+- `ArgumentError` if `n_pairs` is negative.
+"""
+function run_test_read(backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=false)
+    n_pairs >= 0 || throw(ArgumentError("n_pairs must be non-negative, got $n_pairs"))
+
+    if VERBOSE
+        println("\n" * "-" ^ 60)
+        if RELAXED
+            println("Litmus Test: Read (@access Relaxed)\n")
+        else
+            println("Litmus Test: Read (@access Acquire/Release)\n")
+        end
+    end
+
+    # Launch geometry and address constants; must agree with the kernel's default Val
+    # parameters (wgXSize = 256, test_wg = 2, perm2 = 1031, stride = 1).
+    workgroupXSize = 256
+    testing_workgroups = 2
+    permute_second = 1031
+    mem_stride = 1
+    total_ids = workgroupXSize * testing_workgroups
+
+    # The kernel is declared with inbounds=true, so every index it can produce must be
+    # valid: id_0 spans 0:n_pairs-1 while id_1 spans 0:total_ids-1 whatever n_pairs is.
+    # With the default n_pairs = 512 these sizes equal the historical 2048 / n_pairs.
+    n_results = max(n_pairs, total_ids)
+    test_locations = adapt(backend, zeros(Int32, max(2048, 2 * n_results)))
+    results_r0 = adapt(backend, zeros(Int32, n_results))
+
+    r0_1_y_2 = 0
+    r0_0_y_1 = 0
+    r0_1_y_1 = 0
+    r0_0_y_2 = 0
+
+    for _ in 1:n_iterations
+        fill!(test_locations, Int32(0))
+        fill!(results_r0, Int32(0))
+
+        test_read(backend)(
+            test_locations, results_r0,
+            Int32(n_pairs), Val(RELAXED);
+            ndrange=n_pairs, workgroupsize=workgroupXSize
+        )
+        KernelAbstractions.synchronize(backend)
+
+        # Materialize both buffers as host Arrays once per launch, not once per pair
+        test_cpu = Array(test_locations)
+        host_r0 = Array(results_r0)
+
+        for i in 1:n_pairs
+            # Results are read at id_0 = i - 1, i.e. at 1-based index i
+            id_0 = i - 1
+            r0 = host_r0[i]
+
+            # 1-based address of y_0 for this id (same formula as the kernel, plus 1)
+            y_0_addr = ((id_0 * permute_second) % total_ids) * mem_stride * 2 + 1 + 1
+
+            # Read the actual final value at y_0 (GPUHarbor's *y)
+            y_final = test_cpu[y_0_addr]
+
+            # Categorize based on both r0 and final y_0 value (GPUHarbor categories)
+            if r0 == 1 && y_final == 2
+                r0_1_y_2 += 1
+            elseif r0 == 0 && y_final == 1
+                r0_0_y_1 += 1
+            elseif r0 == 1 && y_final == 1
+                r0_1_y_1 += 1
+            elseif r0 == 0 && y_final == 2
+                r0_0_y_2 += 1
+            end
+        end
+    end
+
+    total = r0_1_y_2 + r0_0_y_1 + r0_1_y_1 + r0_0_y_2
+
+    if VERBOSE
+        println("  ╔═══════════════════════════════════════════════════╗")
+        println("  ║ RESULTS ($total total tests)                       ║")
+        println("  ╠═══════════════════════════════════════════════════╣")
+        println("  ║ r0=1, y=2: (sequential)    $(lpad(r0_1_y_2, 10)) ($(lpad(round(100*r0_1_y_2/total, digits=2), 5))%)    ║")
+        println("  ║ r0=0, y=1: (sequential)    $(lpad(r0_0_y_1, 10)) ($(lpad(round(100*r0_0_y_1/total, digits=2), 5))%)    ║")
+        println("  ║ r0=1, y=1: (interleaved)   $(lpad(r0_1_y_1, 10)) ($(lpad(round(100*r0_1_y_1/total, digits=2), 5))%)    ║")
+        println("  ║ r0=0, y=2: (WEAK)          $(lpad(r0_0_y_2, 10)) ($(lpad(round(100*r0_0_y_2/total, digits=2), 5))%)    ║")
+        println("  ╚═══════════════════════════════════════════════════╝")
+    end
+    return (r0_1_y_2, r0_0_y_1, r0_1_y_1, r0_0_y_2, total)
+end
diff --git a/test/tests/litmus/store.jl b/test/tests/litmus/store.jl
new file mode 100644
index 0000000..82f3c9a
--- /dev/null
+++ b/test/tests/litmus/store.jl
@@ -0,0 +1,135 @@
+# Litmus Test: Store
+#
+# Tests if two stores in one thread can be re-ordered according to a
+# store and a load on a second thread.
+#
+# Pattern:
+#   Workgroup 0 Thread 0                Workgroup 1 Thread 0
+#       0.1: atomicStore(x, 2)              1.1: let r0 = atomicLoad(y)
+#       0.2: atomicStore(y, 1)              1.2: atomicStore(x, 1)
+#
+# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/store/store.wgsl
+
+using Adapt
+using KernelAbstractions
+using KernelIntrinsics
+using Test
+
+# Store litmus kernel.
+#
+# Arguments:
+#   test_locations - buffer holding the x/y test locations
+#   results_r0     - receives r0, written at index id_1 + 1
+#   n_pairs        - work-items whose global index exceeds this do nothing
+#   Val{RELAXED}   - true: every access uses `@access Relaxed`;
+#                    false: stores use `@access Release`, the load uses `@access Acquire`
+#   Val{wgXSize}, Val{test_wg}, Val{perm1}, Val{perm2}, Val{stride}
+#                  - compile-time constants driving the id/address shuffling below
+#
+# NOTE(review): declared with inbounds=true; the launch site presumably has to size both
+# buffers so that every index computed here is valid - confirm against the caller.
+@kernel inbounds=true function test_store(
+    test_locations::AbstractArray{T},
+    results_r0::AbstractArray{T},
+    n_pairs::T,
+    ::Val{RELAXED}=Val(true),
+    ::Val{wgXSize}=Val(256),    # workgroupXSize
+    ::Val{test_wg}=Val(2),      # testing_workgroups
+    ::Val{perm1}=Val(419),      # permute_first
+    ::Val{perm2}=Val(1031),     # permute_second
+    ::Val{stride}=Val(1)        # mem_stride
+) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}
+
+    # Global linear index of this work-item; used as 1-based below (i - 1 is the 0-based id)
+    i = @index(Global, Linear)
+
+    if i <= n_pairs
+        # Split the 0-based id into a position inside a group of wgXSize ids and the group number
+        local_invocation_id = (i - 1) % wgXSize
+        shuffled_workgroup = (i - 1) ÷ wgXSize
+        total_ids = wgXSize * test_wg
+        # id_0 recombines the two parts, i.e. it equals i - 1
+        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
+        # Partner id: shifted by 1 + (local id mod (test_wg - 1)) groups, modulo test_wg,
+        # with the local position permuted by perm1
+        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
+        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)
+
+        # Store pattern: Store, Store, Load, Store
+        # 0-based offsets: x uses even slots (id * stride * 2), y uses odd slots of the
+        # perm2-permuted id; the "+ 1" at each access converts to 1-based indexing
+        x_0 = (id_0) * stride * 2
+        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1
+        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1
+        x_1 = (id_1) * stride * 2
+
+        # The program order of these four accesses is the litmus pattern itself; do not reorder
+        if RELAXED
+            @access Relaxed test_locations[x_0 + 1] = T(2)
+            @access Relaxed test_locations[y_0 + 1] = T(1)
+            r0 = @access Relaxed test_locations[y_1 + 1]
+            @access Relaxed test_locations[x_1 + 1] = T(1)
+        else
+            @access Release test_locations[x_0 + 1] = T(2)
+            @access Release test_locations[y_0 + 1] = T(1)
+            r0 = @access Acquire test_locations[y_1 + 1]
+            @access Release test_locations[x_1 + 1] = T(1)
+        end
+
+        # The loaded value is recorded under the partner id
+        results_r0[id_1 + 1] = r0
+    end
+end
+
+"""
+    run_test_store(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=false)
+
+Launch the `test_store` kernel `n_iterations` times on `backend` and tally the observed
+outcomes, where each outcome pairs the recorded `r0` with the final value left at the
+matching `x_0` location of `test_locations`.
+
+# Keywords
+- `n_iterations`: number of kernel launches; all buffers are zeroed before each one.
+- `n_pairs`: `ndrange` of every launch and number of results classified per launch.
+- `RELAXED`: forwarded to the kernel as `Val(RELAXED)`.
+- `VERBOSE`: print a banner and a summary table.
+
+# Returns
+`(r0_1_x_1, r0_0_x_2, r0_0_x_1, r0_1_x_2, total)`: the number of `(r0, x)` pairs equal to
+`(1, 1)`, `(0, 2)`, `(0, 1)` and `(1, 2)` respectively, followed by their sum.
+
+# Throws
+- `ArgumentError` if `n_pairs` is negative.
+"""
+function run_test_store(backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=false)
+    n_pairs >= 0 || throw(ArgumentError("n_pairs must be non-negative, got $n_pairs"))
+
+    if VERBOSE
+        println("\n" * "-" ^ 60)
+        if RELAXED
+            println("Litmus Test: Store (@access Relaxed)\n")
+        else
+            println("Litmus Test: Store (@access Acquire/Release)\n")
+        end
+    end
+
+    # Launch geometry and address constants; must agree with the kernel's default Val
+    # parameters (wgXSize = 256, test_wg = 2, stride = 1).
+    workgroupXSize = 256
+    testing_workgroups = 2
+    mem_stride = 1
+    total_ids = workgroupXSize * testing_workgroups
+
+    # The kernel is declared with inbounds=true, so every index it can produce must be
+    # valid: id_0 spans 0:n_pairs-1 while id_1 spans 0:total_ids-1 whatever n_pairs is.
+    # With the default n_pairs = 512 these sizes equal the historical 2048 / n_pairs.
+    n_results = max(n_pairs, total_ids)
+    test_locations = adapt(backend, zeros(Int32, max(2048, 2 * n_results)))
+    results_r0 = adapt(backend, zeros(Int32, n_results))
+
+    r0_1_x_1 = 0
+    r0_0_x_2 = 0
+    r0_0_x_1 = 0
+    r0_1_x_2 = 0
+
+    for _ in 1:n_iterations
+        fill!(test_locations, Int32(0))
+        fill!(results_r0, Int32(0))
+
+        test_store(backend)(
+            test_locations, results_r0,
+            Int32(n_pairs), Val(RELAXED);
+            ndrange=n_pairs, workgroupsize=workgroupXSize
+        )
+        KernelAbstractions.synchronize(backend)
+
+        # Materialize both buffers as host Arrays once per launch, not once per pair
+        test_cpu = Array(test_locations)
+        host_r0 = Array(results_r0)
+
+        for i in 1:n_pairs
+            # Results are read at id_0 = i - 1, i.e. at 1-based index i
+            id_0 = i - 1
+            r0 = host_r0[i]
+
+            # 1-based address of x_0 for this id (GPUHarbor reads from x_0, not x_1!)
+            x_0_addr = id_0 * mem_stride * 2 + 1
+
+            # Read the actual final value at x_0 (this is what GPUHarbor calls *x)
+            x_final = test_cpu[x_0_addr]
+
+            # Categorize based on both r0 and final x_0 value (GPUHarbor categories)
+            if r0 == 1 && x_final == 1
+                r0_1_x_1 += 1
+            elseif r0 == 0 && x_final == 2
+                r0_0_x_2 += 1
+            elseif r0 == 0 && x_final == 1
+                r0_0_x_1 += 1
+            elseif r0 == 1 && x_final == 2
+                r0_1_x_2 += 1
+            end
+        end
+    end
+
+    total = r0_1_x_1 + r0_0_x_2 + r0_0_x_1 + r0_1_x_2
+
+    if VERBOSE
+        println("  ╔═══════════════════════════════════════════════════╗")
+        println("  ║ RESULTS ($total total tests)                       ║")
+        println("  ╠═══════════════════════════════════════════════════╣")
+        println("  ║ r0=1, x=1: (sequential)   $(lpad(r0_1_x_1, 10)) ($(lpad(round(100*r0_1_x_1/total, digits=2), 5))%)     ║")
+        println("  ║ r0=0, x=2: (sequential)   $(lpad(r0_0_x_2, 10)) ($(lpad(round(100*r0_0_x_2/total, digits=2), 5))%)     ║")
+        println("  ║ r0=0, x=1: (interleaved)  $(lpad(r0_0_x_1, 10)) ($(lpad(round(100*r0_0_x_1/total, digits=2), 5))%)     ║")
+        println("  ║ r0=1, x=2: (WEAK)         $(lpad(r0_1_x_2, 10)) ($(lpad(round(100*r0_1_x_2/total, digits=2), 5))%)     ║")
+        println("  ╚═══════════════════════════════════════════════════╝")
+    end
+    return (r0_1_x_1, r0_0_x_2, r0_0_x_1, r0_1_x_2, total)
+end
diff --git a/test/tests/litmus/store_buffer.jl b/test/tests/litmus/store_buffer.jl
new file mode 100644
index 0000000..8f42d76
--- /dev/null
+++ b/test/tests/litmus/store_buffer.jl
@@ -0,0 +1,132 @@
+# Litmus Test: Store Buffer (SB)
+#
+# Tests if stores can be buffered and re-ordered on different threads.
+# /!\ A release/acquire barrier is not enough to disallow this behavior.
+#
+# Pattern:
+#   Workgroup 0 Thread 0                Workgroup 1 Thread 0
+#       0.1: atomicStore(x, 1)              1.1: atomicStore(y, 1)
+#       0.2: let r0 = atomicLoad(y)         1.2: let r1 = atomicLoad(x)
+#
+# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/sb/store-buffer.wgsl
+
+using Adapt
+using KernelAbstractions
+using KernelIntrinsics
+using Test
+
+# Store Buffer (SB) litmus kernel.
+#
+# Arguments:
+#   test_locations - buffer holding the x/y test locations
+#   results_r0     - receives r0, written at index id_0 + 1
+#   results_r1     - receives r1, written at index id_1 + 1
+#   n_pairs        - work-items whose global index exceeds this do nothing
+#   Val{RELAXED}   - true: every access uses `@access Relaxed`;
+#                    false: stores use `@access Release`, loads use `@access Acquire`
+#   Val{wgXSize}, Val{test_wg}, Val{perm1}, Val{perm2}, Val{stride}
+#                  - compile-time constants driving the id/address shuffling below
+#
+# NOTE(review): declared with inbounds=true; the launch site presumably has to size all
+# three buffers so that every index computed here is valid - confirm against the caller.
+@kernel inbounds=true function test_store_buffer(
+    test_locations::AbstractArray{T},
+    results_r0::AbstractArray{T},
+    results_r1::AbstractArray{T},
+    n_pairs::T,
+    ::Val{RELAXED}=Val(true),
+    ::Val{wgXSize}=Val(256),
+    ::Val{test_wg}=Val(2),
+    ::Val{perm1}=Val(419),
+    ::Val{perm2}=Val(1031),
+    ::Val{stride}=Val(1)
+) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}
+
+    # Global linear index of this work-item; used as 1-based below (i - 1 is the 0-based id)
+    i = @index(Global, Linear)
+
+    if i <= n_pairs
+        # Split the 0-based id into a position inside a group of wgXSize ids and the group number
+        local_invocation_id = (i - 1) % wgXSize
+        shuffled_workgroup = (i - 1) ÷ wgXSize
+        total_ids = wgXSize * test_wg
+        # id_0 recombines the two parts, i.e. it equals i - 1
+        id_0 = shuffled_workgroup * wgXSize + local_invocation_id
+        # Partner id: shifted by 1 + (local id mod (test_wg - 1)) groups, modulo test_wg,
+        # with the local position permuted by perm1
+        new_workgroup = (shuffled_workgroup + 1 + (local_invocation_id % (test_wg - 1))) % test_wg
+        id_1 = new_workgroup * wgXSize + ((local_invocation_id * perm1) % wgXSize)
+
+        # Store Buffer pattern: Store, Load, Store, Load
+        # 0-based offsets: x uses even slots (id * stride * 2), y uses odd slots of the
+        # perm2-permuted id; the "+ 1" at each access converts to 1-based indexing
+        x_0 = (id_0) * stride * 2
+        y_0 = ((id_0 * perm2) % total_ids) * stride * 2 + 1
+        y_1 = ((id_1 * perm2) % total_ids) * stride * 2 + 1
+        x_1 = (id_1) * stride * 2
+
+        # The program order of these four accesses is the litmus pattern itself; do not reorder
+        if RELAXED
+            @access Relaxed test_locations[x_0 + 1] = T(1)
+            r0 = @access Relaxed test_locations[y_0 + 1]
+            @access Relaxed test_locations[y_1 + 1] = T(1)
+            r1 = @access Relaxed test_locations[x_1 + 1]
+        else
+            @access Release test_locations[x_0 + 1] = T(1)
+            r0 = @access Acquire test_locations[y_0 + 1]
+            @access Release test_locations[y_1 + 1] = T(1)
+            r1 = @access Acquire test_locations[x_1 + 1]
+        end
+
+        # r1 is recorded under the partner id, r0 under this work-item's own id
+        results_r1[id_1 + 1] = r1
+        results_r0[id_0 + 1] = r0
+    end
+end
+
+"""
+    run_test_store_buffer(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=true)
+
+Launch the `test_store_buffer` kernel `n_iterations` times on `backend` and tally the
+observed `(r0, r1)` outcomes over all launches.
+
+# Keywords
+- `n_iterations`: number of kernel launches; all buffers are zeroed before each one.
+- `n_pairs`: `ndrange` of every launch and number of results classified per launch.
+- `RELAXED`: forwarded to the kernel as `Val(RELAXED)`.
+- `VERBOSE`: print a banner and a summary table (enabled by default here).
+
+# Returns
+`(total_seq0, total_seq1, total_interleaved, total_weak, total)`: the number of `(r0, r1)`
+pairs equal to `(1, 0)`, `(0, 1)`, `(1, 1)` and `(0, 0)` respectively, followed by their sum.
+
+# Throws
+- `ArgumentError` if `n_pairs` is negative.
+"""
+# NOTE(review): VERBOSE defaults to true here, which looks unintentional for a test
+# runner; the default is kept so existing callers see the same output - confirm and align.
+function run_test_store_buffer(backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=true)
+    n_pairs >= 0 || throw(ArgumentError("n_pairs must be non-negative, got $n_pairs"))
+
+    if VERBOSE
+        println("\n" * "-" ^ 60)
+        if RELAXED
+            println("Litmus Test: Store Buffer (@access Relaxed)\n")
+        else
+            println("Litmus Test: Store Buffer (@access Acquire/Release)\n")
+        end
+    end
+
+    # Launch geometry; must agree with the kernel's default Val parameters
+    # (wgXSize = 256, test_wg = 2).
+    workgroupXSize = 256
+    testing_workgroups = 2
+    total_ids = workgroupXSize * testing_workgroups
+
+    # The kernel is declared with inbounds=true, so every index it can produce must be
+    # valid: id_0 spans 0:n_pairs-1 while id_1 spans 0:total_ids-1 whatever n_pairs is.
+    # With the default n_pairs = 512 these sizes equal the historical 2048 / n_pairs.
+    n_results = max(n_pairs, total_ids)
+    test_locations = adapt(backend, zeros(Int32, max(2048, 2 * n_results)))
+    results_r0 = adapt(backend, zeros(Int32, n_results))
+    results_r1 = adapt(backend, zeros(Int32, n_results))
+
+    total_seq0 = 0
+    total_seq1 = 0
+    total_interleaved = 0
+    total_weak = 0
+
+    for _ in 1:n_iterations
+        fill!(test_locations, Int32(0))
+        fill!(results_r0, Int32(0))
+        fill!(results_r1, Int32(0))
+
+        test_store_buffer(backend)(
+            test_locations, results_r0, results_r1,
+            Int32(n_pairs), Val(RELAXED);
+            ndrange=n_pairs, workgroupsize=workgroupXSize
+        )
+        KernelAbstractions.synchronize(backend)
+
+        # Materialize each result buffer as a host Array once per launch, not once per pair
+        host_r0 = Array(results_r0)
+        host_r1 = Array(results_r1)
+
+        for i in 1:n_pairs
+            # Work-item i has id_0 = i - 1, so both results are read at 1-based index i
+            r0 = host_r0[i]
+            r1 = host_r1[i]
+
+            # Store Buffer outcome classification (GPUHarbor categories)
+            if r0 == 1 && r1 == 0
+                total_seq0 += 1
+            elseif r0 == 0 && r1 == 1
+                total_seq1 += 1
+            elseif r0 == 1 && r1 == 1
+                total_interleaved += 1
+            elseif r0 == 0 && r1 == 0
+                total_weak += 1
+            end
+        end
+    end
+
+    total = total_seq0 + total_seq1 + total_interleaved + total_weak
+
+    if VERBOSE
+        println("  ╔═══════════════════════════════════════════════════╗")
+        println("  ║ RESULTS ($total total tests)                       ║")
+        println("  ╠═══════════════════════════════════════════════════╣")
+        println("  ║ r0=1, r1=0: (seq0)         $(lpad(total_seq0, 10)) ($(lpad(round(100*total_seq0/total, digits=2), 5))%)    ║")
+        println("  ║ r0=0, r1=1: (seq1)         $(lpad(total_seq1, 10)) ($(lpad(round(100*total_seq1/total, digits=2), 5))%)    ║")
+        println("  ║ r0=1, r1=1: (interleaved)  $(lpad(total_interleaved, 10)) ($(lpad(round(100*total_interleaved/total, digits=2), 5))%)    ║")
+        println("  ║ r0=0, r1=0: (WEAK/SB)      $(lpad(total_weak, 10)) ($(lpad(round(100*total_weak/total, digits=2), 5))%)    ║")
+        println("  ╚═══════════════════════════════════════════════════╝")
+    end
+    return (total_seq0, total_seq1, total_interleaved, total_weak, total)
+end
diff --git a/test/tests/litmus/write_2plus2w.jl b/test/tests/litmus/write_2plus2w.jl
new file mode 100644
index 0000000..f641d18
--- /dev/null
+++ b/test/tests/litmus/write_2plus2w.jl
@@ -0,0 +1,130 @@
+# Litmus Test: 2+2W (Write)
+#
+# Tests if two stores in two threads can both be re-ordered.
+#
+# Pattern:
+#   Workgroup 0 Thread 0                Workgroup 1 Thread 0
+#       0.1: atomicStore(x, 2)              1.1: atomicStore(y, 2)
+#       0.2: atomicStore(y, 1)              1.2: atomicStore(x, 1)
+#
+# Based on https://github.com/reeselevine/webgpu-litmus/blob/main/shaders/2+2/2+2-write.wgsl
+
+using Adapt
+using KernelAbstractions
+using KernelIntrinsics
+using Test
+
+# 2+2W litmus kernel: invocation i <= n_pairs stores 2 to its own x and 1 to its own y,
+# then 2 to a partner id's y and 1 to its x; RELAXED picks @access Relaxed or Release.
+@kernel inbounds=true function test_write_2plus2w(
+    test_locations::AbstractArray{T},
+    n_pairs::T,
+    ::Val{RELAXED}=Val(true),
+    ::Val{wgXSize}=Val(256),
+    ::Val{test_wg}=Val(2),
+    ::Val{perm1}=Val(419),
+    ::Val{perm2}=Val(1031),
+    ::Val{stride}=Val(1)
+) where {T, RELAXED, wgXSize, test_wg, perm1, perm2, stride}
+    i = @index(Global, Linear)
+    if i <= n_pairs
+        n_ids = wgXSize * test_wg
+        lane_idx = (i - 1) % wgXSize
+        group_idx = (i - 1) ÷ wgXSize
+        own_id = group_idx * wgXSize + lane_idx
+        # Partner id: group index shifted and wrapped mod test_wg, lane scaled by perm1.
+        partner_group = (group_idx + 1 + (lane_idx % (test_wg - 1))) % test_wg
+        partner_id = partner_group * wgXSize + ((lane_idx * perm1) % wgXSize)
+        # Zero-based slots: x at even offsets, y at odd offsets scattered by perm2.
+        x_own = (own_id) * stride * 2
+        y_own = ((own_id * perm2) % n_ids) * stride * 2 + 1
+        y_partner = ((partner_id * perm2) % n_ids) * stride * 2 + 1
+        x_partner = (partner_id) * stride * 2
+        # The store order below is the litmus pattern itself; do not reorder.
+        if !RELAXED
+            @access Release test_locations[x_own + 1] = T(2)
+            @access Release test_locations[y_own + 1] = T(1)
+            @access Release test_locations[y_partner + 1] = T(2)
+            @access Release test_locations[x_partner + 1] = T(1)
+        else
+            @access Relaxed test_locations[x_own + 1] = T(2)
+            @access Relaxed test_locations[y_own + 1] = T(1)
+            @access Relaxed test_locations[y_partner + 1] = T(2)
+            @access Relaxed test_locations[x_partner + 1] = T(1)
+        end
+    end
+end
+
+"""
+    run_test_2plus2w(backend; n_iterations=100, n_pairs=512, RELAXED=true, VERBOSE=true)
+
+Launch `test_write_2plus2w` on `backend` `n_iterations` times with `n_pairs` invocations
+and classify the final `(x, y)` values of each invocation's own pair. `RELAXED` selects
+`@access Relaxed` (true) or `@access Release` (false); `VERBOSE` prints a result table.
+
+Returns `(x1_y2, x2_y1, x1_y1, x2_y2, total)`. `x2_y2` is the weak outcome and also
+absorbs any pair matching none of the other three categories.
+"""
+function run_test_2plus2w(backend; n_iterations::Int=100, n_pairs::Int=512, RELAXED=true, VERBOSE=true)
+    if VERBOSE
+        println("\n" * "-" ^ 60)
+        println("Litmus Test: 2+2W Write (@access $(RELAXED ? "Relaxed" : "Release"))\n")
+    end
+
+    # Host-side copies of the kernel's default Val parameters (stride is 1).
+    workgroupXSize = 256
+    testing_workgroups = 2
+    permute_second = 1031
+    total_ids = workgroupXSize * testing_workgroups
+
+    # The kernel runs with inbounds=true and stores to index 2 * (i - 1) + 1, so grow the
+    # buffer beyond the original 2048 slots when n_pairs would otherwise overrun it.
+    test_locations = adapt(backend, zeros(Int32, max(2048, 2 * n_pairs)))
+
+    x1_y2 = x2_y1 = x1_y1 = x2_y2 = 0
+
+    for _ in 1:n_iterations
+        fill!(test_locations, Int32(0))
+        test_write_2plus2w(backend)(
+            test_locations,
+            Int32(n_pairs), Val(RELAXED);
+            ndrange=n_pairs, workgroupsize=workgroupXSize
+        )
+        KernelAbstractions.synchronize(backend)
+        test_cpu = Array(test_locations)
+
+        for i in 1:n_pairs
+            # workgroup * size + lane of invocation i collapses to i - 1.
+            id_0 = i - 1
+            x_0_addr = id_0 * 2 + 1
+            y_0_addr = ((id_0 * permute_second) % total_ids) * 2 + 1 + 1
+            mem_x_0 = test_cpu[x_0_addr]
+            mem_y_0 = test_cpu[y_0_addr]
+            # Categorize based on (mem_x_0, mem_y_0) pairs (GPUHarbor categories)
+            if mem_x_0 == Int32(1) && mem_y_0 == Int32(2)
+                x1_y2 += 1
+            elseif mem_x_0 == Int32(2) && mem_y_0 == Int32(1)
+                x2_y1 += 1
+            elseif mem_x_0 == Int32(1) && mem_y_0 == Int32(1)
+                x1_y1 += 1
+            else
+                x2_y2 += 1
+            end
+        end
+    end
+
+    total = x1_y2 + x2_y1 + x1_y1 + x2_y2
+    if VERBOSE
+        # Share of `total` in percent; 0.0 rather than NaN when nothing was tested.
+        pct(c) = lpad(total == 0 ? 0.0 : round(100*c/total, digits=2), 5)
+        println("  ╔═══════════════════════════════════════════════════╗")
+        println("  ║" * rpad(" RESULTS ($total total tests)", 51) * "║")
+        println("  ╠═══════════════════════════════════════════════════╣")
+        println("  ║ x=1, y=2: (sequential)     $(lpad(x1_y2, 10)) ($(pct(x1_y2))%)    ║")
+        println("  ║ x=2, y=1: (sequential)     $(lpad(x2_y1, 10)) ($(pct(x2_y1))%)    ║")
+        println("  ║ x=1, y=1: (interleaved)    $(lpad(x1_y1, 10)) ($(pct(x1_y1))%)    ║")
+        println("  ║ x=2, y=2: (weak)           $(lpad(x2_y2, 10)) ($(pct(x2_y2))%)    ║")
+        println("  ╚═══════════════════════════════════════════════════╝")
+    end
+    return (x1_y2, x2_y1, x1_y1, x2_y2, total)
+end
diff --git a/test/tests/memory_ordering.jl b/test/tests/memory_ordering.jl
new file mode 100644
index 0000000..4fe7ce6
--- /dev/null
+++ b/test/tests/memory_ordering.jl
@@ -0,0 +1,70 @@
+using Test
+
+# Verbose litmus-test output is opt-in via the environment: VERBOSE_MEMORY_ORDERING=true
+const VERBOSE = (get(ENV, "VERBOSE_MEMORY_ORDERING", "false") == "true")
+
+include("litmus/message_passing.jl")
+include("litmus/store.jl")
+include("litmus/read.jl")
+include("litmus/load_buffer.jl")
+include("litmus/store_buffer.jl")
+include("litmus/write_2plus2w.jl")
+
+# Each litmus test runs with RELAXED=true and again with RELAXED=false; only the weak
+# count of the RELAXED=false run is asserted (Store Buffer compares the two runs).
+@testset "Memory Ordering Litmus Tests" begin
+    # Keyword arguments shared by every litmus run.
+    common = (n_iterations=100, n_pairs=512, VERBOSE=VERBOSE)
+
+    @testset "Message Passing" begin
+        # Weak count: third element of this test's result tuple.
+        weak_relaxed = run_test_message_passing(backend; common..., RELAXED=true)[3]
+        weak_strong = run_test_message_passing(backend; common..., RELAXED=false)[3]
+
+        @test weak_strong == 0
+    end
+
+    @testset "Store" begin
+        # Weak count: fourth element of the result tuple.
+        weak_relaxed = run_test_store(backend; common..., RELAXED=true)[4]
+        weak_strong = run_test_store(backend; common..., RELAXED=false)[4]
+
+        @test weak_strong == 0
+    end
+
+    @testset "Read" begin
+        # Weak count: fourth element of the result tuple.
+        weak_relaxed = run_test_read(backend; common..., RELAXED=true)[4]
+        weak_strong = run_test_read(backend; common..., RELAXED=false)[4]
+
+        @test weak_strong == 0
+    end
+
+    @testset "Load Buffer" begin
+        # Weak count: fourth element of the result tuple.
+        weak_relaxed = run_test_load_buffer(backend; common..., RELAXED=true)[4]
+        weak_strong = run_test_load_buffer(backend; common..., RELAXED=false)[4]
+
+        @test weak_strong == 0
+    end
+
+    @testset "Store Buffer" begin
+        # Weak count: fourth element of the result tuple.
+        weak_relaxed = run_test_store_buffer(backend; common..., RELAXED=true)[4]
+        weak_strong = run_test_store_buffer(backend; common..., RELAXED=false)[4]
+
+        # Store Buffer: Acquire/Release does NOT prevent all weak behaviors
+        # (see GPUHarbor: "release/acquire barrier is not enough to disallow this behavior")
+        # so only require that the stronger ordering shows no more weak outcomes than Relaxed.
+        # NOTE(review): both counts are sampled, so this comparison may vary run to run.
+        @test weak_strong <= weak_relaxed
+    end
+
+    @testset "2+2 Write" begin
+        # Fourth element is x2_y2, the weak (x=2, y=2) count.
+        weak_relaxed = run_test_2plus2w(backend; common..., RELAXED=true)[4]
+        weak_strong = run_test_2plus2w(backend; common..., RELAXED=false)[4]
+
+        @test weak_strong == 0
+    end
+end