Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion gptqmodel/models/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -702,7 +702,9 @@ def quantize(

if self.quantize_config.method in QUANTIZE_BLACK_LIST:
raise ValueError(
f"Unsupported quantization operation for quant method: {self.quantize_config.method}"
f"Quantization method `{self.quantize_config.method}` is not supported in this repository. "
f"Only GPTQ quantization (METHOD.GPTQ) is allowed. "
f"Please use `GPTQConfig` or `QuantizeConfig` with `method=METHOD.GPTQ`."
)

if not self.support_batch_quantize:
Expand Down
6 changes: 5 additions & 1 deletion gptqmodel/nn_modules/qlinear/gptq_pro.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,11 @@
class GptqProQuantLinear(PackableQuantLinear):
SUPPORTS_BACKENDS = [BACKEND.GPTQ_PRO]
SUPPORTS_METHODS = [METHOD.GPTQ]
SUPPORTS_FORMATS = {FORMAT.GPTQ: 0, FORMAT.GPTQ_V2: 0}
# Priority 95 (above Marlin=90) so GPTQ-Pro is the first kernel tried on
# Ampere-or-newer GPUs (sm_80+) for symmetric 4-bit FP16 GPTQ without
# desc_act. On pre-Ampere GPUs validate_device() will fail the sm_80 check
# and the selector automatically falls through to the next kernel in
# priority order, starting with Marlin.
SUPPORTS_FORMATS = {FORMAT.GPTQ: 95, FORMAT.GPTQ_V2: 95}
SUPPORTS_BITS = [4]
SUPPORTS_GROUP_SIZE = [-1, 16, 32, 64, 128, 256, 512, 1024]
SUPPORTS_DESC_ACT = [False]
Expand Down
16 changes: 14 additions & 2 deletions gptqmodel/quantization/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -1465,8 +1465,20 @@ def to_dict(self) -> Dict[str, Any]:
FORMAT.QQQ: METHOD.QQQ,
}

# inference only methods should go here
QUANTIZE_BLACK_LIST = {}
# Non-GPTQ methods are blocked from the quantize() flow.
# GPTQ (and its format variants GPTQ_V2, MARLIN, BITBLAS) is the only
# supported quantization algorithm in this repository. Attempting to
# quantize with any other method will raise a clear error at the start
# of quantize() in BaseQModel.
QUANTIZE_BLACK_LIST = {
METHOD.GGUF,
METHOD.FP8,
METHOD.BITSANDBYTES,
METHOD.QQQ,
METHOD.AWQ,
METHOD.EXL3,
METHOD.PARO,
}

# compat
QUANT_CONFIG_ARG_SYNONYMS = {
Expand Down
12 changes: 12 additions & 0 deletions gptqmodel/utils/gptq_pro.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,18 @@ def _build_gptq_pro_extension(verbose: bool):
"-lineinfo",
"-U__CUDA_NO_HALF_OPERATORS__",
"-U__CUDA_NO_HALF_CONVERSIONS__",
# Target all Ampere SM variants with native SASS cubins:
# sm_80 — A100, A30, GA100 (data-centre Ampere)
# sm_86 — RTX 3090/3080/A6000, GA102/GA104/GA106 (consumer + pro Ampere)
# sm_87 — Jetson Orin / embedded Ampere
"-gencode arch=compute_80,code=sm_80",
"-gencode arch=compute_86,code=sm_86",
"-gencode arch=compute_87,code=sm_87",
# Embed compute_87 PTX as a forward-compatible fallback so the kernel can
# also be loaded on post-Ampere devices (Ada sm_89, Hopper sm_90, …)
# that pass the major >= 8 capability check. The CUDA driver will
# JIT-compile the PTX to native code on first use for those GPUs.
"-gencode arch=compute_87,code=compute_87",
],
build_directory=build_directory,
verbose=verbose,
Expand Down