From dab3e06fbce2b2fe639271f8fa6ce9965bbefbd6 Mon Sep 17 00:00:00 2001 From: "Lee, Sang Ik" Date: Mon, 8 Jun 2026 16:43:46 +0000 Subject: [PATCH] [CRI][MXFP] Add xegpu lane level dpas_mx tests. --- .../Dialect/XeGPU/CRI/SIMT/lit.local.cfg | 6 + .../CRI/SIMT/xegpu_dpas_mx_prepacked_bf8.mlir | 130 +++++++++++++++ .../SIMT/xegpu_dpas_mx_prepacked_e2m1.mlir | 153 ++++++++++++++++++ .../Dialect/XeGPU/CRI/lit.local.cfg | 2 + 4 files changed, 291 insertions(+) create mode 100644 test/Integration/Dialect/XeGPU/CRI/SIMT/lit.local.cfg create mode 100644 test/Integration/Dialect/XeGPU/CRI/SIMT/xegpu_dpas_mx_prepacked_bf8.mlir create mode 100644 test/Integration/Dialect/XeGPU/CRI/SIMT/xegpu_dpas_mx_prepacked_e2m1.mlir create mode 100644 test/Integration/Dialect/XeGPU/CRI/lit.local.cfg diff --git a/test/Integration/Dialect/XeGPU/CRI/SIMT/lit.local.cfg b/test/Integration/Dialect/XeGPU/CRI/SIMT/lit.local.cfg new file mode 100644 index 000000000..b96177568 --- /dev/null +++ b/test/Integration/Dialect/XeGPU/CRI/SIMT/lit.local.cfg @@ -0,0 +1,6 @@ +depends_cri = [ + 'xegpu_dpas_mx_prepacked_bf8.mlir', + 'xegpu_dpas_mx_prepacked_e2m1.mlir', +] + +config.excludes.update(depends_cri) diff --git a/test/Integration/Dialect/XeGPU/CRI/SIMT/xegpu_dpas_mx_prepacked_bf8.mlir b/test/Integration/Dialect/XeGPU/CRI/SIMT/xegpu_dpas_mx_prepacked_bf8.mlir new file mode 100644 index 000000000..95a0ad2b7 --- /dev/null +++ b/test/Integration/Dialect/XeGPU/CRI/SIMT/xegpu_dpas_mx_prepacked_bf8.mlir @@ -0,0 +1,130 @@ +// RUN: mlir-opt %s --gpu-lower-to-xevm-pipeline="xegpu-op-level=lane zebin-chip=cri" \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @dpas_mx_bf8(%a: memref<8x32xf8E5M2>, %b: memref<32x16xf8E5M2>, %c: memref<8x16xf32>, %scale_a: memref<8x1xf8E8M0FNU>, %scale_b: memref<1x16xf8E8M0FNU>) kernel { + + %tdesc_a = xegpu.create_nd_tdesc %a : memref<8x32xf8E5M2> -> !xegpu.tensor_desc<8x32xf8E5M2> + %a_trunc = xegpu.load_nd %tdesc_a[0, 0] : !xegpu.tensor_desc<8x32xf8E5M2> -> vector<16xf8E5M2> + + %tdesc_b = xegpu.create_nd_tdesc %b : memref<32x16xf8E5M2> -> !xegpu.tensor_desc<32x16xf8E5M2> + %b_trunc = xegpu.load_nd %tdesc_b[0, 0] <{packed}> : !xegpu.tensor_desc<32x16xf8E5M2> -> vector<32xf8E5M2> + + %tdesc_c = xegpu.create_nd_tdesc %c : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %c_loaded = xegpu.load_nd %tdesc_c[0, 0] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> + + %id_x = gpu.thread_id x + %c8 = arith.constant 8 : index + %idx_x = arith.remsi %id_x, %c8 : index + %c0 = arith.constant 0 : index + %scale = memref.load %scale_a[%idx_x, %c0] : memref<8x1xf8E8M0FNU> + %scale_a_undef = arith.constant dense<1.0> : vector<1xf8E8M0FNU> + %scale_a_loaded = vector.insert %scale, %scale_a_undef[%c0] : f8E8M0FNU into vector<1xf8E8M0FNU> + + %scale_b_undef = arith.constant dense<1.0> : vector<1xf8E8M0FNU> + %scale_b_elem = memref.load %scale_b[%c0, %id_x] : memref<1x16xf8E8M0FNU> + %scale_b_loaded = vector.insert %scale_b_elem, %scale_b_undef[%c0] : f8E8M0FNU into vector<1xf8E8M0FNU> + + %c_result = xegpu.dpas_mx %a_trunc, %b_trunc, %c_loaded scale_a = %scale_a_loaded scale_b = %scale_b_loaded : vector<16xf8E5M2>, vector<32xf8E5M2>, vector<8xf32>, vector<1xf8E8M0FNU>, vector<1xf8E8M0FNU> -> vector<8xf32> + + xegpu.store_nd %c_result, %tdesc_c[0, 0] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return + } + } + + func.func @test(%a : memref<8x32xf8E5M2>, %b : memref<32x16xf8E5M2>, %c : memref<8x16xf32>, %scale_a : memref<8x1xf8E8M0FNU>, %scale_b : memref<1x16xf8E8M0FNU>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + + %memref_a = gpu.alloc() : memref<8x32xf8E5M2> + gpu.memcpy %memref_a, %a : memref<8x32xf8E5M2>, memref<8x32xf8E5M2> + + %memref_b = gpu.alloc() : memref<32x16xf8E5M2> + gpu.memcpy %memref_b, %b : memref<32x16xf8E5M2>, memref<32x16xf8E5M2> + + %memref_c = gpu.alloc() : memref<8x16xf32> + gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32> + + %memref_scale_a = gpu.alloc() : memref<8x1xf8E8M0FNU> + gpu.memcpy %memref_scale_a, %scale_a : memref<8x1xf8E8M0FNU>, memref<8x1xf8E8M0FNU> + + %memref_scale_b = gpu.alloc() : memref<1x16xf8E8M0FNU> + gpu.memcpy %memref_scale_b, %scale_b : memref<1x16xf8E8M0FNU>, memref<1x16xf8E8M0FNU> + + gpu.launch_func @kernel::@dpas_mx_bf8 blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) + args(%memref_a : memref<8x32xf8E5M2>, %memref_b : memref<32x16xf8E5M2>, %memref_c : memref<8x16xf32>, %memref_scale_a : memref<8x1xf8E8M0FNU>, %memref_scale_b : memref<1x16xf8E8M0FNU>) + gpu.dealloc %memref_a : memref<8x32xf8E5M2> + gpu.dealloc %memref_b : memref<32x16xf8E5M2> + gpu.dealloc %memref_scale_a : memref<8x1xf8E8M0FNU> + gpu.dealloc %memref_scale_b : memref<1x16xf8E8M0FNU> + %res = memref.alloc() : memref<8x16xf32> + gpu.memcpy %res, %memref_c : memref<8x16xf32>, memref<8x16xf32> + gpu.dealloc %memref_c : memref<8x16xf32> + return %res : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c1bf8 = arith.constant 1.0 : f8E5M2 + %c0f32 = arith.constant 0.0 : f32 + %c1f8E8M0FNU = arith.constant 1.0 : f8E8M0FNU + + %A = memref.alloc() : memref<8x32xf8E5M2> + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c32 step %c1 { + memref.store %c1bf8, %A[%i, %j] : memref<8x32xf8E5M2> + } + } + + %B = memref.alloc() : memref<32x16xf8E5M2> + scf.for %i = %c0 to %c32 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c1bf8, %B[%i, %j] : memref<32x16xf8E5M2> + } + } + + %C = memref.alloc() : memref<8x16xf32> + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c0f32, %C[%i, %j] : memref<8x16xf32> + } + } + + %scale_A = memref.alloc() : memref<8x1xf8E8M0FNU> + scf.for %i = %c0 to %c8 step %c1 { + memref.store %c1f8E8M0FNU, %scale_A[%i, %c0] : memref<8x1xf8E8M0FNU> + } + + %scale_B = memref.alloc() : memref<1x16xf8E8M0FNU> + scf.for %i = %c0 to %c16 step %c1 { + memref.store %c1f8E8M0FNU, %scale_B[%c0, %i] : memref<1x16xf8E8M0FNU> + } + + %C_res = call @test(%A, %B, %C, %scale_A, %scale_B) : (memref<8x32xf8E5M2>, memref<32x16xf8E5M2>, memref<8x16xf32>, memref<8x1xf8E8M0FNU>, memref<1x16xf8E8M0FNU>) -> memref<8x16xf32> + %C_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%C_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-COUNT-8: [32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32, 32] + memref.dealloc %A : memref<8x32xf8E5M2> + memref.dealloc %B : memref<32x16xf8E5M2> + memref.dealloc %C : memref<8x16xf32> + memref.dealloc %scale_A : memref<8x1xf8E8M0FNU> + memref.dealloc %scale_B : memref<1x16xf8E8M0FNU> + memref.dealloc %C_res : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface } + +} diff --git a/test/Integration/Dialect/XeGPU/CRI/SIMT/xegpu_dpas_mx_prepacked_e2m1.mlir b/test/Integration/Dialect/XeGPU/CRI/SIMT/xegpu_dpas_mx_prepacked_e2m1.mlir new file mode 100644 index 000000000..5b0ddf0a9 --- /dev/null +++ b/test/Integration/Dialect/XeGPU/CRI/SIMT/xegpu_dpas_mx_prepacked_e2m1.mlir @@ -0,0 +1,153 @@ +// RUN: mlir-opt %s --gpu-lower-to-xevm-pipeline="xegpu-op-level=lane zebin-chip=cri" \ +// RUN: | mlir-runner \ +// RUN: --shared-libs=%mlir_levelzero_runtime \ +// RUN: --shared-libs=%mlir_runner_utils \ +// RUN: --shared-libs=%mlir_c_runner_utils \ +// RUN: --entry-point-result=void \ +// RUN: | FileCheck %s + +module @gemm attributes {gpu.container_module} { + gpu.module @kernel { + gpu.func @dpas_mx_e2m1(%a: memref<8x64xf4E2M1FN>, %b: memref<32x16xi8>, %c: memref<8x16xf32>, %scale_a: memref<8x2xf8E8M0FNU>, %scale_b: memref<2x16xf8E8M0FNU>) kernel { + + %tdesc_a = xegpu.create_nd_tdesc %a : memref<8x64xf4E2M1FN> -> !xegpu.tensor_desc<8x64xf4E2M1FN> + %a_trunc = xegpu.load_nd %tdesc_a[0, 0] : !xegpu.tensor_desc<8x64xf4E2M1FN> -> vector<32xf4E2M1FN> + + %tdesc_b = xegpu.create_nd_tdesc %b : memref<32x16xi8> -> !xegpu.tensor_desc<32x16xi8> + %b_loaded = xegpu.load_nd %tdesc_b[0, 0] <{packed}> : !xegpu.tensor_desc<32x16xi8> -> vector<32xi8> + %b_trunc = vector.bitcast %b_loaded : vector<32xi8> to vector<64xf4E2M1FN> + + %tdesc_c = xegpu.create_nd_tdesc %c : memref<8x16xf32> -> !xegpu.tensor_desc<8x16xf32> + %c_loaded = xegpu.load_nd %tdesc_c[0, 0] : !xegpu.tensor_desc<8x16xf32> -> vector<8xf32> + + // How to represent loaded scale_a in lane level? + // Not enough columns for scale_a to lane distribute. + // Option 1: Use load_nd + // load_nd of tile size MxK where, K is too small and element type is f8E8M0FNU, + // has a special sematics to load K element vector per lane for the first M lanes, + // and fill 0 for the rest lanes. This is to support the common usage of scale_a in DPAS MX + // which has M scale factor vectors for M rows of A. + //%tdesc_scale_a = xegpu.create_nd_tdesc %scale_a : memref<8x2xf8E8M0FNU> -> !xegpu.tensor_desc<8x2xf8E8M0FNU> + //%scale_a_loaded = xegpu.load_nd %tdesc_scale_a[0, 0] : !xegpu.tensor_desc<8x2xf8E8M0FNU> -> vector<2xf8E8M0FNU> + // Option 2: Use load + insert + %id_x = gpu.thread_id x + %c8 = arith.constant 8 : index + %idx_x = arith.remsi %id_x, %c8 : index + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %first = memref.load %scale_a[%idx_x, %c0] : memref<8x2xf8E8M0FNU> + %second = memref.load %scale_a[%idx_x, %c1] : memref<8x2xf8E8M0FNU> + %scale_a_undef = arith.constant dense<1.0> : vector<2xf8E8M0FNU> + %scale_a_tmp = vector.insert %first, %scale_a_undef[%c0] : f8E8M0FNU into vector<2xf8E8M0FNU> + %scale_a_loaded = vector.insert %second, %scale_a_tmp[%c1] : f8E8M0FNU into vector<2xf8E8M0FNU> + + // scale_b cannot use 2d block load. 8bit load with 16 columns is not supported + %scale_b_undef = arith.constant dense<1.0> : vector<2xf8E8M0FNU> + %first_b = memref.load %scale_b[%c0, %id_x] : memref<2x16xf8E8M0FNU> + %second_b = memref.load %scale_b[%c1, %id_x] : memref<2x16xf8E8M0FNU> + %scale_b_tmp = vector.insert %first_b, %scale_b_undef[%c0] : f8E8M0FNU into vector<2xf8E8M0FNU> + %scale_b_loaded = vector.insert %second_b, %scale_b_tmp[%c1] : f8E8M0FNU into vector<2xf8E8M0FNU> + + %c_result = xegpu.dpas_mx %a_trunc, %b_trunc, %c_loaded scale_a = %scale_a_loaded scale_b = %scale_b_loaded : vector<32xf4E2M1FN>, vector<64xf4E2M1FN>, vector<8xf32>, vector<2xf8E8M0FNU>, vector<2xf8E8M0FNU> -> vector<8xf32> + + xegpu.store_nd %c_result, %tdesc_c[0, 0] : vector<8xf32>, !xegpu.tensor_desc<8x16xf32> + gpu.return + } + } + + func.func @test(%a : memref<8x64xf4E2M1FN>, %b : memref<32x16xi8>, %c : memref<8x16xf32>, %scale_a : memref<8x2xf8E8M0FNU>, %scale_b : memref<2x16xf8E8M0FNU>) -> memref<8x16xf32> attributes {llvm.emit_c_interface} { + %c1 = arith.constant 1 : index + %c16 = arith.constant 16 : index + + %memref_a = gpu.alloc() : memref<8x64xf4E2M1FN> + gpu.memcpy %memref_a, %a : memref<8x64xf4E2M1FN>, memref<8x64xf4E2M1FN> + + %memref_b = gpu.alloc() : memref<32x16xi8> + gpu.memcpy %memref_b, %b : memref<32x16xi8>, memref<32x16xi8> + + %memref_c = gpu.alloc() : memref<8x16xf32> + gpu.memcpy %memref_c, %c : memref<8x16xf32>, memref<8x16xf32> + + %memref_scale_a = gpu.alloc() : memref<8x2xf8E8M0FNU> + gpu.memcpy %memref_scale_a, %scale_a : memref<8x2xf8E8M0FNU>, memref<8x2xf8E8M0FNU> + + %memref_scale_b = gpu.alloc() : memref<2x16xf8E8M0FNU> + gpu.memcpy %memref_scale_b, %scale_b : memref<2x16xf8E8M0FNU>, memref<2x16xf8E8M0FNU> + + gpu.launch_func @kernel::@dpas_mx_e2m1 blocks in (%c1, %c1, %c1) threads in (%c16, %c1, %c1) + args(%memref_a : memref<8x64xf4E2M1FN>, %memref_b : memref<32x16xi8>, %memref_c : memref<8x16xf32>, %memref_scale_a : memref<8x2xf8E8M0FNU>, %memref_scale_b : memref<2x16xf8E8M0FNU>) + gpu.dealloc %memref_a : memref<8x64xf4E2M1FN> + gpu.dealloc %memref_b : memref<32x16xi8> + gpu.dealloc %memref_scale_a : memref<8x2xf8E8M0FNU> + gpu.dealloc %memref_scale_b : memref<2x16xf8E8M0FNU> + %res = memref.alloc() : memref<8x16xf32> + gpu.memcpy %res, %memref_c : memref<8x16xf32>, memref<8x16xf32> + gpu.dealloc %memref_c : memref<8x16xf32> + return %res : memref<8x16xf32> + } + + func.func @main() attributes {llvm.emit_c_interface} { + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %c8 = arith.constant 8 : index + %c16 = arith.constant 16 : index + %c32 = arith.constant 32 : index + %c256 = arith.constant 256 : index + %c1e2m1 = arith.constant 1.0 : f4E2M1FN + %c1packed_e2m1 = arith.constant 0x22 : i8 + %c0f32 = arith.constant 0.0 : f32 + %c1f8E8M0FNU = arith.constant 1.0 : f8E8M0FNU + + %A_flatbytes = memref.alloc() : memref<256xi8> + %A = memref.view %A_flatbytes[%c0][] : memref<256xi8> to memref<8x64xf4E2M1FN> + scf.for %i = %c0 to %c256 step %c1 { + memref.store %c1packed_e2m1, %A_flatbytes[%i] : memref<256xi8> + } + + %B = memref.alloc() : memref<32x16xi8> + scf.for %i = %c0 to %c32 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c1packed_e2m1, %B[%i, %j] : memref<32x16xi8> + } + } + + %C = memref.alloc() : memref<8x16xf32> + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c0f32, %C[%i, %j] : memref<8x16xf32> + } + } + + %scale_A = memref.alloc() : memref<8x2xf8E8M0FNU> + scf.for %i = %c0 to %c8 step %c1 { + scf.for %j = %c0 to %c2 step %c1 { + memref.store %c1f8E8M0FNU, %scale_A[%i, %j] : memref<8x2xf8E8M0FNU> + } + } + + %scale_B = memref.alloc() : memref<2x16xf8E8M0FNU> + scf.for %i = %c0 to %c2 step %c1 { + scf.for %j = %c0 to %c16 step %c1 { + memref.store %c1f8E8M0FNU, %scale_B[%i, %j] : memref<2x16xf8E8M0FNU> + } + } + + %C_res = call @test(%A, %B, %C, %scale_A, %scale_B) : (memref<8x64xf4E2M1FN>, memref<32x16xi8>, memref<8x16xf32>, memref<8x2xf8E8M0FNU>, memref<2x16xf8E8M0FNU>) -> memref<8x16xf32> + %C_cast = memref.cast %C_res : memref<8x16xf32> to memref<*xf32> + call @printMemrefF32(%C_cast) : (memref<*xf32>) -> () + + // CHECK: Unranked Memref base@ = 0x{{[0-9a-f]+}} + // CHECK-COUNT-8: [64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64] + memref.dealloc %A_flatbytes : memref<256xi8> + memref.dealloc %B : memref<32x16xi8> + memref.dealloc %C : memref<8x16xf32> + memref.dealloc %scale_A : memref<8x2xf8E8M0FNU> + memref.dealloc %scale_B : memref<2x16xf8E8M0FNU> + memref.dealloc %C_res : memref<8x16xf32> + return + } + func.func private @printMemrefF32(%ptr : memref<*xf32>) attributes { llvm.emit_c_interface } + +} diff --git a/test/Integration/Dialect/XeGPU/CRI/lit.local.cfg b/test/Integration/Dialect/XeGPU/CRI/lit.local.cfg new file mode 100644 index 000000000..8c6b0441a --- /dev/null +++ b/test/Integration/Dialect/XeGPU/CRI/lit.local.cfg @@ -0,0 +1,2 @@ +if(not config.mlir_enable_levelzero_runtime or not config.mlir_enable_spirv_backend): + config.unsupported = True