From a81fc2046ee598ac82e69b70d536972769d0b7b6 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 03:58:36 +0000
Subject: [PATCH 1/3] feat: restrict quantization to GPTQ-only via
 QUANTIZE_BLACK_LIST

Agent-Logs-Url: https://github.com/groxaxo/GPTQ-Pro/sessions/09c93bdb-a0ca-4654-a385-c74bbab8ce98

Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
 gptqmodel/models/base.py         |  4 +++-
 gptqmodel/quantization/config.py | 16 ++++++++++++++--
 2 files changed, 17 insertions(+), 3 deletions(-)

diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
index 53a9d4133..a2de6f9c5 100644
--- a/gptqmodel/models/base.py
+++ b/gptqmodel/models/base.py
@@ -702,7 +702,9 @@ def quantize(
 
         if self.quantize_config.method in QUANTIZE_BLACK_LIST:
             raise ValueError(
-                f"Unsupported quantization operation for quant method: {self.quantize_config.method}"
+                f"Quantization method `{self.quantize_config.method}` is not supported in this repository. "
+                f"Only GPTQ quantization (METHOD.GPTQ) is allowed. "
+                f"Please use `GPTQConfig` or `QuantizeConfig` with `method=METHOD.GPTQ`."
             )
 
         if not self.support_batch_quantize:
diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py
index 0a2091fea..2c2902086 100644
--- a/gptqmodel/quantization/config.py
+++ b/gptqmodel/quantization/config.py
@@ -1465,8 +1465,20 @@ def to_dict(self) -> Dict[str, Any]:
     FORMAT.QQQ: METHOD.QQQ,
 }
 
-# inference only methods should go here
-QUANTIZE_BLACK_LIST = {}
+# Methods listed here are rejected by the quantize() flow.
+# GPTQ (and its format variants GPTQ_V2, MARLIN, BITBLAS) is the only
+# supported quantization algorithm in this repository. Attempting to
+# quantize with any listed method raises a ValueError from quantize()
+# in gptqmodel/models/base.py.
+QUANTIZE_BLACK_LIST = {
+    METHOD.GGUF,
+    METHOD.FP8,
+    METHOD.BITSANDBYTES,
+    METHOD.QQQ,
+    METHOD.AWQ,
+    METHOD.EXL3,
+    METHOD.PARO,
+}
 
 # compat
 QUANT_CONFIG_ARG_SYNONYMS = {

From 39e5aaebe43b688bdfc78a65eaf38457983b66cf Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Tue, 28 Apr 2026 06:06:15 +0000
Subject: [PATCH 2/3] feat: align GPTQ-Pro kernel with Ampere GPUs (priority +
 sm_80/86/87 build flags)

Agent-Logs-Url: https://github.com/groxaxo/GPTQ-Pro/sessions/e4d78097-be29-48bb-8f75-d71ac6547cc9

Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
 gptqmodel/nn_modules/qlinear/gptq_pro.py | 6 +++++-
 gptqmodel/utils/gptq_pro.py              | 9 +++++++++
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/gptqmodel/nn_modules/qlinear/gptq_pro.py b/gptqmodel/nn_modules/qlinear/gptq_pro.py
index bbea5a7f7..14f4f70af 100644
--- a/gptqmodel/nn_modules/qlinear/gptq_pro.py
+++ b/gptqmodel/nn_modules/qlinear/gptq_pro.py
@@ -24,7 +24,11 @@
 class GptqProQuantLinear(PackableQuantLinear):
     SUPPORTS_BACKENDS = [BACKEND.GPTQ_PRO]
     SUPPORTS_METHODS = [METHOD.GPTQ]
-    SUPPORTS_FORMATS = {FORMAT.GPTQ: 0, FORMAT.GPTQ_V2: 0}
+    # Priority 95 (above Marlin=90) so GPTQ-Pro is the first kernel tried on
+    # sm_80+ GPUs for symmetric 4-bit FP16 GPTQ without desc_act.  On older
+    # GPUs validate_device() fails the sm_80 check and the selector falls
+    # through to the next-highest-priority kernel that accepts the device.
+    SUPPORTS_FORMATS = {FORMAT.GPTQ: 95, FORMAT.GPTQ_V2: 95}
     SUPPORTS_BITS = [4]
     SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128, 256, 512, 1024]
     SUPPORTS_DESC_ACT = [False]
diff --git a/gptqmodel/utils/gptq_pro.py b/gptqmodel/utils/gptq_pro.py
index 7debeb3c4..317260651 100644
--- a/gptqmodel/utils/gptq_pro.py
+++ b/gptqmodel/utils/gptq_pro.py
@@ -72,6 +72,15 @@ def _build_gptq_pro_extension(verbose: bool):
             "-lineinfo",
             "-U__CUDA_NO_HALF_OPERATORS__",
             "-U__CUDA_NO_HALF_CONVERSIONS__",
+            # Target all Ampere SM variants:
+            #   sm_80 — A100, A30, GA100 (data-centre Ampere)
+            #   sm_86 — RTX 3090/3080/A6000, GA102/GA104/GA106 (consumer + pro Ampere)
+            #   sm_87 — Jetson Orin / embedded Ampere
+            # Using -gencode for each ensures native SASS for each sub-arch
+            # instead of relying on JIT re-compilation of generic PTX at load time.
+            "-gencode arch=compute_80,code=sm_80",
+            "-gencode arch=compute_86,code=sm_86",
+            "-gencode arch=compute_87,code=sm_87",
         ],
         build_directory=build_directory,
         verbose=verbose,

From 0457356b1d436aa6e983be1bd52f32d12f04117a Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 29 Apr 2026 04:22:41 +0000
Subject: [PATCH 3/3] fix: add PTX fallback gencode for post-Ampere forward
 compatibility

Agent-Logs-Url: https://github.com/groxaxo/GPTQ-Pro/sessions/647ddaba-a7c7-4078-a93e-f5d23bda896e

Co-authored-by: groxaxo <76023196+groxaxo@users.noreply.github.com>
---
 gptqmodel/utils/gptq_pro.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/gptqmodel/utils/gptq_pro.py b/gptqmodel/utils/gptq_pro.py
index 317260651..c97293601 100644
--- a/gptqmodel/utils/gptq_pro.py
+++ b/gptqmodel/utils/gptq_pro.py
@@ -72,15 +72,18 @@ def _build_gptq_pro_extension(verbose: bool):
             "-lineinfo",
             "-U__CUDA_NO_HALF_OPERATORS__",
             "-U__CUDA_NO_HALF_CONVERSIONS__",
-            # Target all Ampere SM variants:
+            # Target all Ampere SM variants with native SASS cubins:
             #   sm_80 — A100, A30, GA100 (data-centre Ampere)
             #   sm_86 — RTX 3090/3080/A6000, GA102/GA104/GA106 (consumer + pro Ampere)
             #   sm_87 — Jetson Orin / embedded Ampere
-            # Using -gencode for each ensures native SASS for each sub-arch
-            # instead of relying on JIT re-compilation of generic PTX at load time.
             "-gencode arch=compute_80,code=sm_80",
             "-gencode arch=compute_86,code=sm_86",
             "-gencode arch=compute_87,code=sm_87",
+            # Embed compute_87 PTX as a forward-compatible fallback for devices of
+            # a newer major architecture (Hopper sm_90, …) that pass the major >= 8
+            # capability check; the CUDA driver JIT-compiles the PTX on first use.
+            # Ada (sm_89) needs no JIT: it runs the sm_8x cubins above natively.
+            "-gencode arch=compute_87,code=compute_87",
         ],
         build_directory=build_directory,
         verbose=verbose,