Ggml dequant#2
Conversation
There was a problem hiding this comment.
Code Review
This pull request implements XPU-based GGML dequantization for Q4_0, Q5_0, and Q8_0 formats, including SYCL kernels and Torch bindings. Feedback focuses on improving the robustness of divisibility checks by using type-specific block sizes, optimizing kernel performance by reducing redundant calculations and improving memory coalescing, simplifying bit extraction logic in the Q5_0 kernel, and removing a large block of commented-out code in the test suite.
| numel % ggml::QK4_0 == 0, | ||
| ggml::ggml_type_name(type), | ||
| " dequantize expects m * n to be divisible by ", | ||
| ggml::QK4_0, | ||
| ", got ", | ||
| numel); | ||
|
|
There was a problem hiding this comment.
The divisibility check currently hardcodes ggml::QK4_0 (32) for all quantization types. While all currently supported types have a block size of 32, it is better to use the type-specific block size constant for correctness and future-proofing.
const int64_t qk = (type == ggml::GGML_TYPE_Q8_0) ? ggml::QK8_0 :
(type == ggml::GGML_TYPE_Q5_0) ? ggml::QK5_0 : ggml::QK4_0;
TORCH_CHECK(
numel % qk == 0,
ggml::ggml_type_name(type),
" dequantize expects m * n to be divisible by ",
qk,
", got ",
numel);| const int64_t i = index[0]; | ||
| if (i >= numel_) { | ||
| return; | ||
| } | ||
|
|
||
| const int64_t block_index = i / QK4_0; | ||
| const int64_t block_offset = i % QK4_0; | ||
| const block_q4_0& block = blocks_[block_index]; | ||
| const bool is_high_half = block_offset >= (QK4_0 / 2); | ||
| const int64_t quant_index = | ||
| is_high_half ? (block_offset - QK4_0 / 2) : block_offset; | ||
| const uint8_t packed = block.qs[quant_index]; | ||
| const int quant = is_high_half ? (packed >> 4) : (packed & 0x0F); | ||
| const float value = | ||
| (static_cast<float>(quant) - 8.0f) * static_cast<float>(block.d); | ||
| out_[i] = static_cast<sycl_t>(value); | ||
| } |
There was a problem hiding this comment.
Launching one work item per element and performing integer division/modulo (i / QK4_0, i % QK4_0) inside the kernel is suboptimal for GPU performance. Additionally, this approach leads to redundant loads of the block scale (block.d) and packed weights. Consider processing one block per subgroup or work-group to improve memory coalescing and reduce redundant calculations.
| const int xh = is_high_half ? ((qh >> (quant_index + 12)) & 0x10) | ||
| : (((qh >> quant_index) << 4) & 0x10); |
There was a problem hiding this comment.
| """ | ||
| @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) | ||
| @pytest.mark.parametrize("dtype", DTYPES) | ||
| @pytest.mark.parametrize("quant_type", QUANT_TYPES) | ||
| @torch.inference_mode() | ||
| def test_mmvq(hidden_size: int, dtype: torch.dtype, | ||
| quant_type: GGMLQuantizationType): | ||
| set_random_seed(0) | ||
|
|
||
| tensors = get_gguf_sample_tensors(hidden_size, quant_type) | ||
| x = torch.rand((1, hidden_size), dtype=dtype, device="cuda") | ||
| for tensor in tensors: | ||
| weight = torch.tensor(dequantize(tensor.data, quant_type), | ||
| device="cuda").to( | ||
| dtype | ||
| ) | ||
| ref_output = x @ weight.T | ||
|
|
||
| qweight = torch.tensor(tensor.data, device="cuda") | ||
| output = ops.ggml_mul_mat_vec_a8(qweight, x, quant_type, | ||
| qweight.shape[0]).to( | ||
| dtype | ||
| ) | ||
|
|
||
| torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1) | ||
|
|
||
| @pytest.mark.parametrize("num_tokens", NUM_TOKENS) | ||
| @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES) | ||
| @pytest.mark.parametrize("dtype", DTYPES) | ||
| @pytest.mark.parametrize( | ||
| "quant_type", | ||
| [ | ||
| # k-quants | ||
| GGMLQuantizationType.Q2_K, | ||
| GGMLQuantizationType.Q3_K, | ||
| GGMLQuantizationType.Q4_K, | ||
| GGMLQuantizationType.Q5_K, | ||
| GGMLQuantizationType.Q6_K, | ||
| # standard quants | ||
| GGMLQuantizationType.Q4_0, | ||
| GGMLQuantizationType.Q5_0, | ||
| GGMLQuantizationType.Q8_0, | ||
| ], | ||
| ) | ||
| @torch.inference_mode() | ||
| def test_mmq( | ||
| num_tokens: int, | ||
| hidden_size: int, | ||
| dtype: torch.dtype, | ||
| quant_type: GGMLQuantizationType, | ||
| ): | ||
| set_random_seed(0) | ||
|
|
||
| tensors = get_gguf_sample_tensors(hidden_size, quant_type) | ||
| x = torch.rand((num_tokens, hidden_size), dtype=dtype, device="cuda") | ||
| for tensor in tensors: | ||
| weight = torch.tensor(dequantize(tensor.data, quant_type), | ||
| device="cuda").to( | ||
| dtype | ||
| ) | ||
| ref_output = x @ weight.T | ||
|
|
||
| qweight = torch.tensor(tensor.data, device="cuda") | ||
| output = ops.ggml_mul_mat_a8(qweight, x, quant_type, qweight.shape[0]) | ||
| atols = {torch.half: 1, torch.bfloat16: 1.5, torch.float: 1.2} | ||
| # test matrix has inputs centered around 0 and lower precision from | ||
| # bfloat16 tends to accumulate and can greatly inflate rtol | ||
| # since outputs are also very close to 0 | ||
| rtols = {torch.half: 1e-1, torch.bfloat16: 1e4, torch.float: 2e1} | ||
| torch.testing.assert_close( | ||
| output, ref_output, atol=atols[dtype], rtol=rtols[dtype] | ||
| ) | ||
|
|
||
| @pytest.mark.parametrize("num_tokens", NUM_TOKENS) | ||
| @pytest.mark.parametrize("hidden_size", [512]) | ||
| @pytest.mark.parametrize("top_k", [4, 8]) | ||
| @pytest.mark.parametrize("dtype", DTYPES) | ||
| @pytest.mark.parametrize("quant_type", QUANT_TYPES) | ||
| @torch.inference_mode() | ||
| def test_moe( | ||
| num_tokens: int, | ||
| hidden_size: int, | ||
| dtype: torch.dtype, | ||
| quant_type: GGMLQuantizationType, | ||
| top_k: int, | ||
| ): | ||
| set_random_seed(0) | ||
| H, E = 1024, 256 | ||
|
|
||
| x = torch.rand((num_tokens, H), dtype=dtype, device="cuda") | ||
|
|
||
| topk_weights = torch.rand(num_tokens, top_k, device="cuda", dtype=dtype) | ||
| topk_ids = torch.randint( | ||
| 0, E, (num_tokens, top_k), device="cuda", dtype=torch.int32 | ||
| ) | ||
|
|
||
| tensors = get_gguf_MoE_tensors(hidden_size, quant_type) | ||
|
|
||
| w13 = tensors[0] | ||
| w2 = tensors[1] | ||
|
|
||
| w13_dequant = torch.tensor(dequantize(w13.data, quant_type), | ||
| device="cuda").to( | ||
| dtype | ||
| ) | ||
|
|
||
| w2_dequant = torch.tensor(dequantize(w2.data, quant_type), | ||
| device="cuda").to(dtype) | ||
|
|
||
| output = _fused_moe_gguf( | ||
| x, | ||
| torch.tensor(w13.data, device="cuda"), | ||
| torch.tensor(w2.data, device="cuda"), | ||
| topk_weights, | ||
| topk_ids, | ||
| quant_type, | ||
| quant_type, | ||
| "silu", | ||
| ) | ||
|
|
||
| ref_output = fused_experts( | ||
| x, w13_dequant, w2_dequant, topk_weights, topk_ids | ||
| ).reshape(output.shape) | ||
| torch.testing.assert_close(output, ref_output, atol=1, rtol=1e-1) | ||
| """ |
…ect#165) * Support arbitrary KV cache strides in paged_decode for MLA - Remove CHECK_CONTIGUOUS for k/v in flash_api.cpp (stride(-1)==1 still enforced) - Add KV cache stride fields to paged_decode_args_t - Extract actual tensor strides in paged_decode_xe2.cpp - Use actual strides in DecodeKernelLauncher::initialize() instead of packed strides - Replace make_ordered_layout with make_layout using passed strides for K/V in kernel - Add test_decode_with_paged_kv_mla unit test with non-contiguous KV cache slices Signed-off-by: baodii <di.bao@intel.com> * make format happy Signed-off-by: baodii <di.bao@intel.com> * make format happy Signed-off-by: baodii <di.bao@intel.com> * Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> Signed-off-by: baodi <di.bao@intel.com> * add TORCH_CHECK to aviod potential bugs Signed-off-by: baodii <di.bao@intel.com> * fix kv stride bug Signed-off-by: baodii <di.bao@intel.com> * fix UT file Signed-off-by: baodii <di.bao@intel.com> --------- Signed-off-by: baodii <di.bao@intel.com> Signed-off-by: baodi <di.bao@intel.com> Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: zhenwei-intel <zhenwei.liu@intel.com>
6ef1e4f to
74657ec
Compare
Essential Elements of an Effective PR Description Checklist
supported_models.mdandexamplesfor a new model.PLEASE FILL IN THE PR DESCRIPTION HERE ENSURING ALL CHECKLIST ITEMS ABOVE HAVE BEEN CONSIDERED.
Purpose
Test Plan
Test Result
(Optional) Documentation Update
BEFORE SUBMITTING, PLEASE READ https://docs.vllm.ai/en/latest/contributing (anything written below this line will be removed by GitHub Actions)