From a81fc2046ee598ac82e69b70d536972769d0b7b6 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Apr 2026 03:58:36 +0000 Subject: [PATCH 1/3] feat: restrict quantization to GPTQ-only via QUANTIZE_BLACK_LIST Agent-Logs-Url: https://github.com/groxaxo/GPTQ-Pro/sessions/09c93bdb-a0ca-4654-a385-c74bbab8ce98 Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com> --- gptqmodel/models/base.py | 4 +++- gptqmodel/quantization/config.py | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 3 deletions(-) diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py index 53a9d4133..a2de6f9c5 100644 --- a/gptqmodel/models/base.py +++ b/gptqmodel/models/base.py @@ -702,7 +702,9 @@ def quantize( if self.quantize_config.method in QUANTIZE_BLACK_LIST: raise ValueError( - f"Unsupported quantization operation for quant method: {self.quantize_config.method}" + f"Quantization method `{self.quantize_config.method}` is not supported in this repository. " + f"Only GPTQ quantization (METHOD.GPTQ) is allowed. " + f"Please use `GPTQConfig` or `QuantizeConfig` with `method=METHOD.GPTQ`." ) if not self.support_batch_quantize: diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py index 0a2091fea..2c2902086 100644 --- a/gptqmodel/quantization/config.py +++ b/gptqmodel/quantization/config.py @@ -1465,8 +1465,20 @@ def to_dict(self) -> Dict[str, Any]: FORMAT.QQQ: METHOD.QQQ, } -# inference only methods should go here -QUANTIZE_BLACK_LIST = {} +# Non-GPTQ methods are blocked from the quantize() flow. +# GPTQ (and its format variants GPTQ_V2, MARLIN, BITBLAS) is the only +# supported quantization algorithm in this repository. Attempting to +# quantize with any other method will raise a clear error at the start +# of quantize() in BaseQModel. 
+QUANTIZE_BLACK_LIST = { + METHOD.GGUF, + METHOD.FP8, + METHOD.BITSANDBYTES, + METHOD.QQQ, + METHOD.AWQ, + METHOD.EXL3, + METHOD.PARO, +} # compat QUANT_CONFIG_ARG_SYNONYMS = { From 39e5aaebe43b688bdfc78a65eaf38457983b66cf Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Tue, 28 Apr 2026 06:06:15 +0000 Subject: [PATCH 2/3] feat: align GPTQ-Pro kernel with Ampere GPUs (priority + sm_80/86/87 build flags) Agent-Logs-Url: https://github.com/groxaxo/GPTQ-Pro/sessions/e4d78097-be29-48bb-8f75-d71ac6547cc9 Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com> --- gptqmodel/nn_modules/qlinear/gptq_pro.py | 6 +++++- gptqmodel/utils/gptq_pro.py | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/gptqmodel/nn_modules/qlinear/gptq_pro.py b/gptqmodel/nn_modules/qlinear/gptq_pro.py index bbea5a7f7..14f4f70af 100644 --- a/gptqmodel/nn_modules/qlinear/gptq_pro.py +++ b/gptqmodel/nn_modules/qlinear/gptq_pro.py @@ -24,7 +24,11 @@ class GptqProQuantLinear(PackableQuantLinear): SUPPORTS_BACKENDS = [BACKEND.GPTQ_PRO] SUPPORTS_METHODS = [METHOD.GPTQ] - SUPPORTS_FORMATS = {FORMAT.GPTQ: 0, FORMAT.GPTQ_V2: 0} + # Priority 95 (above Marlin=90) so GPTQ-Pro is the first kernel tried on + # Ampere for symmetric 4-bit FP16 GPTQ without desc_act. On pre-Ampere + # GPUs validate_device() will fail the sm_80 check and the selector falls + # through to Marlin automatically. 
+ SUPPORTS_FORMATS = {FORMAT.GPTQ: 95, FORMAT.GPTQ_V2: 95} SUPPORTS_BITS = [4] SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128, 256, 512, 1024] SUPPORTS_DESC_ACT = [False] diff --git a/gptqmodel/utils/gptq_pro.py b/gptqmodel/utils/gptq_pro.py index 7debeb3c4..317260651 100644 --- a/gptqmodel/utils/gptq_pro.py +++ b/gptqmodel/utils/gptq_pro.py @@ -72,6 +72,15 @@ def _build_gptq_pro_extension(verbose: bool): "-lineinfo", "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF_CONVERSIONS__", + # Target all Ampere SM variants: + # sm_80 — A100, A30, GA100 (data-centre Ampere) + # sm_86 — RTX 3090/3080/A6000, GA102/GA104/GA106 (consumer + pro Ampere) + # sm_87 — Jetson Orin / embedded Ampere + # Using -gencode for each ensures native SASS for each sub-arch + # instead of relying on JIT re-compilation of generic PTX at load time. + "-gencode arch=compute_80,code=sm_80", + "-gencode arch=compute_86,code=sm_86", + "-gencode arch=compute_87,code=sm_87", ], build_directory=build_directory, verbose=verbose, From 0457356b1d436aa6e983be1bd52f32d12f04117a Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 29 Apr 2026 04:22:41 +0000 Subject: [PATCH 3/3] fix: add PTX fallback gencode for post-Ampere forward compatibility Agent-Logs-Url: https://github.com/groxaxo/GPTQ-Pro/sessions/647ddaba-a7c7-4078-a93e-f5d23bda896e Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com> --- gptqmodel/utils/gptq_pro.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/gptqmodel/utils/gptq_pro.py b/gptqmodel/utils/gptq_pro.py index 317260651..c97293601 100644 --- a/gptqmodel/utils/gptq_pro.py +++ b/gptqmodel/utils/gptq_pro.py @@ -72,15 +72,18 @@ def _build_gptq_pro_extension(verbose: bool): "-lineinfo", "-U__CUDA_NO_HALF_OPERATORS__", "-U__CUDA_NO_HALF_CONVERSIONS__", - # Target all Ampere SM variants: + # Target all Ampere SM variants with native SASS cubins: # sm_80 — A100, A30, GA100 (data-centre Ampere) # 
sm_86 — RTX 3090/3080/A6000, GA102/GA104/GA106 (consumer + pro Ampere) # sm_87 — Jetson Orin / embedded Ampere - # Using -gencode for each ensures native SASS for each sub-arch - # instead of relying on JIT re-compilation of generic PTX at load time. "-gencode arch=compute_80,code=sm_80", "-gencode arch=compute_86,code=sm_86", "-gencode arch=compute_87,code=sm_87", + # Embed sm_87 PTX as a forward-compatible fallback so the kernel can + # also be loaded on later-major devices (Hopper sm_90, …) that pass + # the major >= 8 check; the CUDA driver JIT-compiles the PTX on first + # use there. Ada (sm_89) already runs the sm_86 SASS without JIT. + "-gencode arch=compute_87,code=compute_87", ], build_directory=build_directory, verbose=verbose,