groxaxo · groxaxo · Apr 29, 2026 · Apr 28, 2026 · Apr 28, 2026 · Apr 29, 2026
diff --git a/gptqmodel/models/base.py b/gptqmodel/models/base.py
@@ -702,7 +702,9 @@ def quantize(
 
         if self.quantize_config.method in QUANTIZE_BLACK_LIST:
             raise ValueError(
-                f"Unsupported quantization operation for quant method: {self.quantize_config.method}"
+                f"Quantization method `{self.quantize_config.method}` is not supported in this repository. "
+                f"Only GPTQ quantization (METHOD.GPTQ) is allowed. "
+                f"Please use `GPTQConfig` or `QuantizeConfig` with `method=METHOD.GPTQ`."
             )
 
         if not self.support_batch_quantize:

diff --git a/gptqmodel/nn_modules/qlinear/gptq_pro.py b/gptqmodel/nn_modules/qlinear/gptq_pro.py
@@ -24,7 +24,11 @@
 class GptqProQuantLinear(PackableQuantLinear):
     SUPPORTS_BACKENDS = [BACKEND.GPTQ_PRO]
     SUPPORTS_METHODS = [METHOD.GPTQ]
-    SUPPORTS_FORMATS = {FORMAT.GPTQ: 0, FORMAT.GPTQ_V2: 0}
+    # Priority 95 (above Marlin=90) so GPTQ-Pro is the first kernel tried on
+    # Ampere-or-newer GPUs (compute capability >= 8.0) for symmetric 4-bit FP16
+    # GPTQ without desc_act.  On pre-Ampere GPUs, validate_device() fails the
+    # sm_80 check and the selector falls through to Marlin automatically.
+    SUPPORTS_FORMATS = {FORMAT.GPTQ: 95, FORMAT.GPTQ_V2: 95}
     SUPPORTS_BITS = [4]
     SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128, 256, 512, 1024]
     SUPPORTS_DESC_ACT = [False]

diff --git a/gptqmodel/quantization/config.py b/gptqmodel/quantization/config.py
@@ -1465,8 +1465,20 @@ def to_dict(self) -> Dict[str, Any]:
     FORMAT.QQQ: METHOD.QQQ,
 }
 
-# inference only methods should go here
-QUANTIZE_BLACK_LIST = {}
+# Non-GPTQ methods are blocked from the quantize() flow.
+# GPTQ (and its format variants GPTQ_V2, MARLIN, BITBLAS) is the only
+# supported quantization algorithm in this repository.  Attempting to
+# quantize with any other method will raise a clear error at the start
+# of quantize() in BaseQModel.
+QUANTIZE_BLACK_LIST = {
+    METHOD.GGUF,
+    METHOD.FP8,
+    METHOD.BITSANDBYTES,
+    METHOD.QQQ,
+    METHOD.AWQ,
+    METHOD.EXL3,
+    METHOD.PARO,
+}
 
 # compat
 QUANT_CONFIG_ARG_SYNONYMS = {

diff --git a/gptqmodel/utils/gptq_pro.py b/gptqmodel/utils/gptq_pro.py
@@ -72,6 +72,18 @@ def _build_gptq_pro_extension(verbose: bool):
             "-lineinfo",
             "-U__CUDA_NO_HALF_OPERATORS__",
             "-U__CUDA_NO_HALF_CONVERSIONS__",
+            # Target all Ampere SM variants with native SASS cubins:
+            #   sm_80 — A100, A30, GA100 (data-centre Ampere)
+            #   sm_86 — RTX 3090/3080/A6000, GA102/GA104/GA106 (consumer + pro Ampere)
+            #   sm_87 — Jetson Orin / embedded Ampere
+            "-gencode arch=compute_80,code=sm_80",
+            "-gencode arch=compute_86,code=sm_86",
+            "-gencode arch=compute_87,code=sm_87",
+            # Embed compute_87 PTX as a forward-compatible fallback so the kernel
+            # can also be loaded on later architectures (Hopper sm_90, …) that
+            # pass the major >= 8 capability check; the CUDA driver JIT-compiles
+            # the PTX on first use.  Ada (sm_89) can load the sm_86 cubin directly.
+            "-gencode arch=compute_87,code=compute_87",
         ],
         build_directory=build_directory,
         verbose=verbose,