diff --git a/.gitmodules b/.gitmodules index 025fa65..45661ca 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,3 +1,12 @@ [submodule "contribs/quark"] path = contribs/quark url = https://github.com/amd/Quark.git +[submodule "contribs/llm-compressor"] + path = contribs/llm-compressor + url = https://github.com/vllm-project/llm-compressor.git +[submodule "contribs/transformers"] + path = contribs/transformers + url = https://github.com/huggingface/transformers.git +[submodule "contribs/vllm"] + path = contribs/vllm + url = https://github.com/vllm-project/vllm.git diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 0000000..0a0f52e --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,113 @@ +# CLAUDE.md + +This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository. + +## What is Quanto + +Quanto is an LLM quantization toolkit built on AMD Quark. It quantizes HuggingFace models to INT4/INT8/FP8/MXFP4/MXFP6 precisions with multiple memory strategies for different GPU constraints. Source code lives in `src/quanto/`. + +## Commands + +```bash +# Install +pip install -e ".[dev]" # dev (pytest, ruff) +pip install -e ".[nvidia]" # with NVIDIA extras +pip install -e ".[rocm]" # with ROCm extras + +# Tests (requires Quark — run on remote server with amd-quark installed) +pytest tests/ -v # all tests +pytest tests/test_unified_quantizer.py -v # single file +pytest tests/test_unified_quantizer.py::TestUnifiedConfig::test_default_config -v # single test + +# Lint & format +ruff check src/ # lint +ruff check src/ --fix # lint with autofix +ruff format src/ # format + +# Quantize a model (CLI) +python -m quanto \ + --model_path model/path \ + --output_dir ./output \ + --precision mxfp4 \ + --sensitivity_analysis \ + --sensitivity_threshold 0.12 + +# Quantize with explicit exclude list (e.g., attn-excl strategy) +python -m quanto \ + --model_path model/path \ + --output_dir ./output \ + --precision mxfp4 \ + --exclude_layers_file exclude.json + +# Quantize (Python API) +from quanto import UnifiedQuantizer, UnifiedConfig +config = UnifiedConfig( + model_path='model/path', output_dir='./output', + precision='mxfp4', sensitivity_analysis=True, + sensitivity_threshold=0.12, +) +UnifiedQuantizer(config).run() + +# Dequantize +python -m quanto --dequantize --model_path ./quantized --output_dir ./dequantized + +# Docker-based integration tests +./scripts/run_e2e_tests.sh rocm # all ROCm tests +./scripts/run_e2e_tests.sh cuda 1,2 # specific CUDA tests +``` + +## Architecture + +### Pipeline flow +`UnifiedConfig` (dataclass validation) -> `UnifiedQuantizer.run()` -> strategy dispatch -> `QuantizationResult` + +### Quantization paths + +**MXFP4/MXFP6** — Uses Quark's `quantize_model_per_safetensor` (file2file). Processes each safetensors shard independently without loading the full model. Produces packed uint8 weights + E8M0 scales compatible with vLLM's Quark loader. + +**INT4/INT8/FP8** — Uses in-memory quantization via `ModelQuantizer` + `export_safetensors`. Three memory strategies: +- `full` — entire model on GPU +- `layerwise_cpu` — model on CPU, layers quantized one-by-one on GPU +- `lazy` — weights loaded on-demand from safetensors + +### Core modules (`src/quanto/core/`) +- **`config.py`** — `UnifiedConfig` dataclass. Key fields: `precision`, `memory_strategy`, `algorithm` (rtn/awq/gptq), `sensitivity_analysis`, `sensitivity_threshold`, `exclude_layers`. +- **`unified_quantizer.py`** — Main quantizer. 
`run()` dispatches to `_run_file2file_quantization()` for MXFP or `_run_full_gpu_quantization()` / `_run_lazy_quantization()` for INT4/INT8. Contains `_determine_exclude_layers()` with sensitivity analysis and `_align_exclude_groups()` for vLLM fused layer compatibility. +- **`sensitivity/sequential_analyzer.py`** — Iterative sensitivity analysis. Scores each layer using the actual target precision (MXFP4 uses `OCP_MXFP4Spec`, not INT4 proxy). `_build_quant_config_for_scoring()` maps precision to the correct Quark spec class. + +### Supporting modules +- **`constants.py`** — `PRECISION_TO_SCHEME` mapping, `MODEL_TYPE_MAPPINGS` (includes `solar_open` -> `qwen3_moe`, `kimi_k2` -> `kimi_k25`), `SUPPORTED_ALGORITHMS`. +- **`auto_quantize.py`** — CLI `main()` entry point. Parses args and creates `UnifiedConfig`. Supports `--exclude_layers_file` for JSON exclude lists. +- **`utils/model_utils.py`** — `detect_model_type()` and `get_template()` for Quark `LLMTemplate` lookup. +- **`utils/calibration.py`** — `CalibrationDataManager` loads from HuggingFace datasets or local files. +- **`utils/int4_pack.py`** — INT4 <-> INT32 packing/unpacking. + +### External dependency +AMD Quark is vendored as a git submodule in `contribs/quark/`. Key Quark APIs used: +- `LLMTemplate.get_config(scheme, algorithm, exclude_layers)` — generates per-architecture quantization configs +- `quantize_model_per_safetensor()` — file-to-file quantization (MXFP4 path) +- `ModelQuantizer` / `export_safetensors()` — in-memory quantization (INT4/INT8 path) +- `OCP_MXFP4Spec`, `Int4PerGroupSpec` — precision-specific quantization specs + +## Code style + +- Ruff configured: 100-char line length, Python 3.10 target +- Lint rules: E, W, F, I (isort), B (bugbear), C4, UP, ARG, SIM +- Double quotes, space indentation +- `contribs/` directory is excluded from linting + +## Key patterns + +- **vLLM fused layer alignment**: `_align_exclude_groups()` ensures q/k/v projections and gate/up projections are excluded together (vLLM fuses these into `qkv_proj` and `gate_up_proj`) +- **AWQ/GPTQ algorithm support**: Enabled via config validation matrix in `constants.ALGORITHM_PRECISION_SUPPORT`. Valid combinations: + - RTN: all precisions (int4, int4_64, int4_32, int8, fp8, mxfp4, mxfp6, uint4) + - AWQ: INT4 only (int4, int4_64, int4_32) — activation-aware, Quark `AwqProcessor` + - GPTQ: INT4 only (int4) — Hessian-based, Quark `GptqProcessor` + - Invalid combos (e.g., AWQ+MXFP4, GPTQ+INT8) raise `ValueError` in `UnifiedConfig.validate()` +- **Sensitivity analysis algorithm-awareness**: `SequentialSensitivityAnalyzer._build_quant_config_for_scoring()` passes actual algorithm (not RTN proxy) to `LLMTemplate.get_config()` for correct Quark spec (critical for AWQ/GPTQ accuracy) +- **Backward compat aliases**: `QuantizationConfig = UnifiedConfig`, `AutoQuantizer = UnifiedQuantizer` +- **HF hub resolution**: File2file path auto-resolves HF hub IDs to local cache via `snapshot_download` + +## Testing environment + +Remote server mi355-gpu-16 (aac14 cluster) with MI355 GPUs. Use podman containers with `rocm/vllm-dev:nightly` image which includes PyTorch, Quark, and all dependencies. See `memory/reference_mi355_server.md` for access details. diff --git a/README.md b/README.md index a464c66..a0f3808 100644 --- a/README.md +++ b/README.md @@ -82,53 +82,6 @@ docker build -f docker/Dockerfile.rocm.dev -t quanto:rocm-dev . 
docker run --device=/dev/kfd --device=/dev/dri --group-add video -v $(pwd):/workspace -w /workspace quanto:rocm-dev bash ``` -## Project Structure - -``` -quanto/ -├── pyproject.toml # Package configuration -├── README.md # This file -├── requirements.txt # Base requirements -├── requirements-nvidia.txt # NVIDIA-specific deps -├── requirements-rocm.txt # ROCm-specific deps -├── contribs/ -│ └── quark/ # AMD Quark (submodule) -├── docker/ -│ ├── Dockerfile.cuda # Pre-built for CUDA -│ ├── Dockerfile.cuda.dev # Development for CUDA -│ ├── Dockerfile.rocm # Pre-built for ROCm -│ └── Dockerfile.rocm.dev # Development for ROCm -├── docs/ -│ └── examples.md # Experiment results -├── examples/ # Example scripts -├── scripts/ -│ └── repack.py # Weight packing utilities -├── src/quanto/ # Main package -│ ├── __init__.py -│ ├── __main__.py # CLI entry point -│ ├── constants.py # Shared constants -│ ├── core/ # Quantization engines -│ │ ├── base_quantizer.py -│ │ ├── auto_quantize.py -│ │ ├── layerwise_quant.py -│ │ ├── lazy_layerwise_quant.py -│ │ ├── iterative_quantizer.py -│ │ └── dequantize.py -│ ├── analysis/ # Layer analysis -│ │ ├── layer_analyzer.py -│ │ └── sensitivity_analyzer.py -│ ├── export/ # Export utilities -│ │ ├── hf_export.py -│ │ └── model_assembler.py -│ └── utils/ # Shared utilities -│ ├── calibration.py -│ ├── int4_pack.py -│ ├── logging.py -│ ├── memory.py -│ └── model_utils.py -└── tests/ # Test suite -``` - ## Usage ### Basic Usage diff --git a/references/llm-compressor b/contribs/llm-compressor similarity index 100% rename from references/llm-compressor rename to contribs/llm-compressor diff --git a/references/transformers b/contribs/transformers similarity index 100% rename from references/transformers rename to contribs/transformers diff --git a/references/vllm b/contribs/vllm similarity index 100% rename from references/vllm rename to contribs/vllm diff --git a/src/quanto/__main__.py b/src/quanto/__main__.py index b81d8dc..fa797c0 100644 --- a/src/quanto/__main__.py +++ b/src/quanto/__main__.py @@ -14,39 +14,31 @@ def main() -> int: """Main entry point that dispatches to quantize or dequantize.""" - parser = argparse.ArgumentParser( - description="Quanto: LLM Quantization Tool", - add_help=False, - ) - - # Add --dequantize flag to detect mode - parser.add_argument("--dequantize", action="store_true", help="Run dequantization mode") - parser.add_argument("--help", "-h", action="store_true", help="Show help") + # Check if --dequantize is in args + if "--dequantize" in sys.argv: + from quanto.core.dequantize import main as dequant_main - # Parse known args to detect mode - args, remaining = parser.parse_known_args() + return dequant_main() - if args.help: - parser.print_help() - print("\nModes:") + # Show top-level help only when no args or just --help with no other flags + if len(sys.argv) <= 1 or (len(sys.argv) == 2 and sys.argv[1] in ("--help", "-h")): + print("usage: python -m quanto [--dequantize] [options]") + print() + print("Quanto: LLM Quantization Tool") + print() + print("Modes:") print( - " Quantization: python -m quanto --model_path ... --output_dir ... --precision int4" + " Quantization: python -m quanto --model_path ... --output_dir ... --precision mxfp4" ) print(" Dequantization: python -m quanto --dequantize --model_path ... 
--output_dir ...") + print() + print("Run 'python -m quanto --model_path x --output_dir y --help' for full quantization options.") return 0 - if args.dequantize: - # Run dequantization - from quanto.core.dequantize import main as dequant_main - - # Add back --dequantize flag since dequantize module expects it - sys.argv = [sys.argv[0], "--dequantize"] + remaining - return dequant_main() - else: - # Run quantization - from quanto.core.auto_quantize import main as quant_main + # Default: quantization mode + from quanto.core.auto_quantize import main as quant_main - return quant_main() + return quant_main() if __name__ == "__main__": diff --git a/src/quanto/constants.py b/src/quanto/constants.py index 40aa38b..f5c1715 100644 --- a/src/quanto/constants.py +++ b/src/quanto/constants.py @@ -46,6 +46,13 @@ "phi": "phi", "phi3": "phi3", "phi4": "phi3", + "solar_open": "qwen3_moe", + "exaone": "llama", + "exaone4_5": "llama", + "exaone4_5_text": "llama", + "exaone_moe": "qwen3_moe", + "kimi_k2": "kimi_k25", + "kimi_k25": "kimi_k25", } # Default layers to exclude from quantization @@ -78,7 +85,15 @@ # Supported quantization algorithms SUPPORTED_ALGORITHMS: list[str] = [ + "rtn", "awq", "gptq", - "smoothquant", ] + +# Algorithm-Precision support matrix +# Defines which precisions are supported for each quantization algorithm +ALGORITHM_PRECISION_SUPPORT: dict[str, list[str]] = { + "rtn": ["int4", "int4_64", "int4_32", "int8", "fp8", "mxfp4", "mxfp6", "uint4"], + "awq": ["int4", "int4_64", "int4_32"], # AWQ is INT4-only (activation-aware) + "gptq": ["int4"], # GPTQ is INT4-only (Hessian-based) +} diff --git a/src/quanto/core/auto_quantize.py b/src/quanto/core/auto_quantize.py index 79dc6ea..092c67a 100644 --- a/src/quanto/core/auto_quantize.py +++ b/src/quanto/core/auto_quantize.py @@ -44,4 +44,97 @@ "QuantizationConfig", "UnifiedQuantizer", "UnifiedConfig", + "main", ] + + +def main() -> int: + """CLI entry point for quantization.""" + import argparse + import json + import sys + + parser = argparse.ArgumentParser( + description="Quanto: Quantize a model", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + # Required + parser.add_argument("--model_path", required=True, help="HuggingFace model ID or local path") + parser.add_argument("--output_dir", required=True, help="Output directory for quantized model") + + # Quantization settings + parser.add_argument( + "--precision", + default="mxfp4", + choices=["int4", "int4_64", "int4_32", "int8", "fp8", "mxfp4", "mxfp6", "uint4"], + help="Target precision", + ) + parser.add_argument("--algorithm", default="rtn", choices=["rtn", "awq", "gptq"], help="Quantization algorithm") + parser.add_argument("--memory_strategy", default="auto", choices=["full", "layerwise_cpu", "lazy", "auto"]) + + # Sensitivity analysis + parser.add_argument("--sensitivity_analysis", action="store_true", help="Enable iterative sensitivity analysis") + parser.add_argument("--sensitivity_threshold", type=float, default=0.0, help="Sensitivity threshold for layer exclusion") + parser.add_argument( + "--sensitivity_metric", + type=str, + default="relative", + choices=["relative", "mse", "mae", "cosine", "kl"], + help="Metric used to rank sensitive layers", + ) + parser.add_argument("--max_iterations", type=int, default=10, help="Max iterations for sensitivity analysis") + + # Layer exclusion + parser.add_argument("--exclude_layers", nargs="*", help="Layer name patterns to exclude from quantization") + parser.add_argument("--exclude_layers_file", help="JSON file containing 
exclude layer list") + + # Calibration data + parser.add_argument("--calibration_data", default="pileval", help="Calibration dataset name or path") + parser.add_argument("--num_calib_samples", type=int, default=128, help="Number of calibration samples") + parser.add_argument("--seq_len", type=int, default=512, help="Sequence length for calibration") + + # Other + parser.add_argument("--device", default="cuda", help="Device (cuda, cuda:0, cpu)") + parser.add_argument("--trust_remote_code", action="store_true", default=True) + parser.add_argument("--no_trust_remote_code", action="store_true", help="Disable trust_remote_code") + parser.add_argument("--skip_evaluation", action="store_true", help="Skip perplexity evaluation") + parser.add_argument("--sensitivity_cache_on_gpu", action="store_true", default=True) + + args = parser.parse_args() + + # Handle exclude_layers from file + exclude_layers = args.exclude_layers + if args.exclude_layers_file: + with open(args.exclude_layers_file) as f: + exclude_layers = json.load(f) + + config = UnifiedConfig( + model_path=args.model_path, + output_dir=args.output_dir, + precision=args.precision, + algorithm=args.algorithm, + memory_strategy=args.memory_strategy, + sensitivity_analysis=args.sensitivity_analysis, + sensitivity_threshold=args.sensitivity_threshold, + sensitivity_metric=args.sensitivity_metric, + max_iterations=args.max_iterations, + exclude_layers=exclude_layers, + calibration_data=args.calibration_data, + num_calib_samples=args.num_calib_samples, + seq_len=args.seq_len, + device=args.device, + trust_remote_code=not args.no_trust_remote_code, + skip_evaluation=args.skip_evaluation, + sensitivity_cache_on_gpu=args.sensitivity_cache_on_gpu, + ) + + quantizer = UnifiedQuantizer(config) + result = quantizer.run() + + if result.success: + print(json.dumps(result.to_dict(), indent=2)) + return 0 + else: + print(f"FAILED: {result.error_message}", file=sys.stderr) + return 1 diff --git a/src/quanto/core/config.py b/src/quanto/core/config.py index 5f44c3a..2cb5919 100644 --- a/src/quanto/core/config.py +++ b/src/quanto/core/config.py @@ -7,7 +7,7 @@ from __future__ import annotations -from dataclasses import dataclass, field +from dataclasses import dataclass from typing import Any, Literal @@ -42,6 +42,7 @@ class UnifiedConfig: aggressive_exclusion: Use aggressive layer exclusion rules sensitivity_analysis: Enable sequential sensitivity analysis for layer exclusion sensitivity_threshold: Threshold for excluding sensitive layers + sensitivity_metric: Metric used to rank sensitive layers sensitivity_cache_on_gpu: Cache activations on GPU (faster, more memory) skip_evaluation: Skip perplexity evaluation trust_remote_code: Trust remote code when loading models @@ -81,6 +82,7 @@ class UnifiedConfig: # Sensitivity-based exclusion sensitivity_analysis: bool = False # Enable sequential sensitivity analysis sensitivity_threshold: float = 0.0 # Threshold for excluding sensitive layers (0 = disabled, typical values: 0.12-0.15 for INT4) + sensitivity_metric: str = "relative" # One of: relative, mse, mae, cosine, kl sensitivity_cache_on_gpu: bool = True # Cache activations on GPU (faster, uses more memory) max_iterations: int = 10 # Maximum iterations for iterative sensitivity analysis (1 = single-pass) @@ -94,6 +96,9 @@ class UnifiedConfig: # Layer batch size for lazy mode (number of layers to process in parallel) layer_batch_size: int = 4 + # Quantization algorithm: "rtn" (round-to-nearest, default), "awq", "gptq" + algorithm: str = "rtn" + def 
__post_init__(self) -> None: """Validate configuration after initialization.""" self.validate() @@ -147,6 +152,31 @@ def validate(self) -> None: if self.max_iterations < 1: raise ValueError(f"max_iterations must be >= 1, got {self.max_iterations}") + # Validate sensitivity metric + valid_sensitivity_metrics = ["relative", "mse", "mae", "cosine", "kl"] + if self.sensitivity_metric not in valid_sensitivity_metrics: + raise ValueError( + f"Invalid sensitivity_metric '{self.sensitivity_metric}'. " + f"Must be one of: {valid_sensitivity_metrics}" + ) + + # Validate algorithm and precision combination + from ..constants import ALGORITHM_PRECISION_SUPPORT + + valid_algorithms = list(ALGORITHM_PRECISION_SUPPORT.keys()) + if self.algorithm not in valid_algorithms: + raise ValueError( + f"Invalid algorithm '{self.algorithm}'. Must be one of: {valid_algorithms}" + ) + + # Check if precision is supported for this algorithm + supported_precisions = ALGORITHM_PRECISION_SUPPORT[self.algorithm] + if self.precision not in supported_precisions: + raise ValueError( + f"Precision '{self.precision}' not supported for algorithm '{self.algorithm}'. " + f"Supported precisions: {supported_precisions}" + ) + def to_dict(self) -> dict[str, Any]: """Convert configuration to dictionary.""" return { @@ -165,12 +195,14 @@ def to_dict(self) -> dict[str, Any]: "aggressive_exclusion": self.aggressive_exclusion, "sensitivity_analysis": self.sensitivity_analysis, "sensitivity_threshold": self.sensitivity_threshold, + "sensitivity_metric": self.sensitivity_metric, "sensitivity_cache_on_gpu": self.sensitivity_cache_on_gpu, "max_iterations": self.max_iterations, "skip_evaluation": self.skip_evaluation, "trust_remote_code": self.trust_remote_code, "debug_dir": self.debug_dir, "layer_batch_size": self.layer_batch_size, + "algorithm": self.algorithm, } @classmethod diff --git a/src/quanto/core/sensitivity/sequential_analyzer.py b/src/quanto/core/sensitivity/sequential_analyzer.py index 7c1fac9..9a9da09 100644 --- a/src/quanto/core/sensitivity/sequential_analyzer.py +++ b/src/quanto/core/sensitivity/sequential_analyzer.py @@ -52,6 +52,7 @@ def __init__( metric: SensitivityMetric = SensitivityMetric.RELATIVE_NORM, cache_on_gpu: bool = True, initial_exclude_layers: list[str] | None = None, + template: object | None = None, ): """ Initialize the analyzer. @@ -61,11 +62,13 @@ def __init__( metric: Sensitivity metric to use cache_on_gpu: Store activations on GPU by default initial_exclude_layers: Layers to skip during analysis (already excluded) + template: LLMTemplate instance for precision-aware quantization config """ self.config = config self.metric = metric self.cache_on_gpu = cache_on_gpu self.initial_exclude_layers = initial_exclude_layers or [] + self.template = template # Components self.cache = ActivationCache( @@ -324,6 +327,10 @@ def _quantize_layer(self, layer: nn.Module, layer_name: str) -> nn.Module: """ Quantize a single layer for sensitivity testing. + Uses the actual target precision (MXFP4, INT4, FP8, etc.) rather than + a hardcoded INT4 proxy, so sensitivity scores accurately reflect the + quantization scheme being applied. 
+ Args: layer: The layer module to quantize layer_name: Name of the layer @@ -332,15 +339,10 @@ def _quantize_layer(self, layer: nn.Module, layer_name: str) -> nn.Module: Quantized layer """ from quark.torch import ModelQuantizer - from quark.torch.quantization.config.config import Int4PerGroupSpec, QConfig, QLayerConfig - - # Create quantization config - # ch_axis=0 for per-row quantization (output channel dimension) - quant_config = QConfig( - global_quant_config=QLayerConfig( - weight=Int4PerGroupSpec(ch_axis=0, group_size=128).to_quantization_spec() - ), - ) + + from ...constants import PRECISION_TO_SCHEME + + quant_config = self._build_quant_config_for_scoring() # Quantize quantizer = ModelQuantizer(quant_config) @@ -355,6 +357,59 @@ def _quantize_layer(self, layer: nn.Module, layer_name: str) -> nn.Module: return quantized_layer + def _build_quant_config_for_scoring(self): + """ + Build quantization config matching the target precision and algorithm. + + Uses the LLMTemplate if available (produces architecture-specific configs), + otherwise falls back to building a config from the precision's Quark Spec class. + """ + from quark.torch.quantization.config.config import QConfig, QLayerConfig + + from ...constants import PRECISION_TO_SCHEME + + precision = self.config.precision + algorithm = (self.config.algorithm or "rtn").lower() + scheme = PRECISION_TO_SCHEME.get(precision, precision) + + # Prefer template-based config (architecture-specific) + if self.template: + return self.template.get_config( + scheme=scheme, + algorithm=algorithm if algorithm != "rtn" else None, + exclude_layers=[], + ) + + # Fallback: build config from precision spec + if precision.startswith("mxfp4"): + from quark.torch.quantization.config.config import OCP_MXFP4Spec + + spec = OCP_MXFP4Spec(ch_axis=0).to_quantization_spec() + elif precision.startswith("mxfp6"): + from quark.torch.quantization.config.config import OCP_MXFP6E3M2Spec + + spec = OCP_MXFP6E3M2Spec(ch_axis=0).to_quantization_spec() + elif precision.startswith("int4") or precision.startswith("uint4"): + from quark.torch.quantization.config.config import Int4PerGroupSpec + + group_size = 128 + if "64" in precision: + group_size = 64 + elif "32" in precision: + group_size = 32 + spec = Int4PerGroupSpec(ch_axis=0, group_size=group_size).to_quantization_spec() + else: + # Default fallback to INT4 for unknown precisions + from quark.torch.quantization.config.config import Int4PerGroupSpec + + spec = Int4PerGroupSpec(ch_axis=0, group_size=128).to_quantization_spec() + + # Note: algorithm parameter (if needed by Quark) would be passed here + # Currently Quark's fallback configs don't take algorithm parameter + return QConfig( + global_quant_config=QLayerConfig(weight=spec), + ) + def analyze(self) -> AnalysisResult: """ Run sequential sensitivity analysis. 
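Illustrative sketch (not part of the patch): how the precision-aware scoring path above is meant to be exercised. With no `LLMTemplate` supplied, the analyzer falls back to a Quark spec matching the configured precision instead of the old hardcoded INT4 proxy. The model path is a placeholder, and constructing the analyzer without a loaded model is an assumption made for brevity.

```python
from quanto import UnifiedConfig
from quanto.core.sensitivity.sequential_analyzer import SequentialSensitivityAnalyzer
from quanto.core.sensitivity.scorer import SensitivityMetric

config = UnifiedConfig(
    model_path="/models/example-llm",  # placeholder path
    output_dir="/tmp/quanto-out",
    precision="mxfp4",
    sensitivity_analysis=True,
    sensitivity_threshold=0.12,
)

analyzer = SequentialSensitivityAnalyzer(
    config=config,
    metric=SensitivityMetric.RELATIVE_NORM,
    cache_on_gpu=True,
    template=None,  # no template: fall back to the precision-specific Quark spec
)

# Builds a QConfig whose weight spec matches config.precision
# (OCP_MXFP4Spec here; Int4PerGroupSpec with the matching group size for int4_64/int4_32).
scoring_config = analyzer._build_quant_config_for_scoring()
```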
diff --git a/src/quanto/core/unified_quantizer.py b/src/quanto/core/unified_quantizer.py index 8d78cf2..b607518 100644 --- a/src/quanto/core/unified_quantizer.py +++ b/src/quanto/core/unified_quantizer.py @@ -43,6 +43,7 @@ from .base_quantizer import QuantizationResult from .config import UnifiedConfig from .sensitivity import SequentialSensitivityAnalyzer +from .sensitivity.scorer import SensitivityMetric class UnifiedQuantizer: @@ -78,6 +79,8 @@ def __init__(self, config: UnifiedConfig): self.safetensors_files = [] self.weight_index = {} # Maps weight name to file path self.timing = {} + self._resolved_algorithm: str | None = None + self._calibration_loader_cache = None def _log(self, message: str) -> None: """Print log message with timestamp.""" @@ -111,24 +114,89 @@ def _get_template(self) -> LLMTemplate | None: self._log(f"Warning: No template found for model type '{self.model_type}'") return self.template + def _resolve_algorithm(self) -> str | None: + """Resolve the requested quantization algorithm to pass to Quark. + + Returns: + None for RTN (Quark default), algorithm name for AWQ/GPTQ + """ + if self._resolved_algorithm is not None: + return self._resolved_algorithm + + algorithm = (self.config.algorithm or "rtn").lower() + if algorithm == "rtn": + self._resolved_algorithm = None + return None + + # AWQ and GPTQ are now supported via Quark's LLMTemplate + if algorithm in {"awq", "gptq"}: + self._resolved_algorithm = algorithm + return algorithm + + raise ValueError(f"Unsupported quantization algorithm: '{self.config.algorithm}'") + + def _resolve_sensitivity_metric(self) -> SensitivityMetric: + """Resolve configured sensitivity metric string to enum value.""" + metric_name = (self.config.sensitivity_metric or "relative").lower() + mapping = { + "relative": SensitivityMetric.RELATIVE_NORM, + "mse": SensitivityMetric.MSE, + "mae": SensitivityMetric.MAE, + "cosine": SensitivityMetric.COSINE, + "kl": SensitivityMetric.KL_DIVERGENCE, + } + try: + return mapping[metric_name] + except KeyError as exc: + valid = ", ".join(mapping.keys()) + raise ValueError( + f"Invalid sensitivity_metric '{self.config.sensitivity_metric}'. 
" + f"Must be one of: {valid}" + ) from exc + + def _get_calibration_dataloader(self): + """Load and cache the calibration dataloader.""" + if self._calibration_loader_cache is None: + if self.tokenizer is None: + raise RuntimeError("Tokenizer must be initialized before loading calibration data") + self._calibration_loader_cache = get_calib_dataloader( + dataset_name_or_path=self.config.calibration_data, + tokenizer=self.tokenizer, + batch_size=self.config.batch_size, + num_calib_data=self.config.num_calib_samples, + seqlen=self.config.seq_len, + device=self.config.device, + ) + return self._calibration_loader_cache + def _setup(self) -> None: """Load config, tokenizer, and build weight index.""" start_time = time.time() self._log("Setting up quantization...") # Load HuggingFace config (no weights) - self.hf_config = AutoConfig.from_pretrained( - self.config.model_path, trust_remote_code=self.config.trust_remote_code - ) + try: + self.hf_config = AutoConfig.from_pretrained( + self.config.model_path, trust_remote_code=self.config.trust_remote_code + ) + except (ValueError, KeyError) as e: + # Fallback for models not yet supported by transformers (e.g., exaone4_5) + self._log(f"AutoConfig failed ({e.__class__.__name__}), using JSON fallback") + self.hf_config = self._load_config_from_json() + self._detect_model_type() self._get_template() # Load tokenizer - self.tokenizer = AutoTokenizer.from_pretrained( - self.config.model_path, trust_remote_code=self.config.trust_remote_code - ) - if self.tokenizer.pad_token is None: - self.tokenizer.pad_token = self.tokenizer.eos_token + try: + self.tokenizer = AutoTokenizer.from_pretrained( + self.config.model_path, trust_remote_code=self.config.trust_remote_code + ) + if self.tokenizer.pad_token is None: + self.tokenizer.pad_token = self.tokenizer.eos_token + except (ValueError, KeyError, OSError) as e: + self._log(f"AutoTokenizer failed ({e.__class__.__name__}), skipping tokenizer") + self.tokenizer = None # Create output directory os.makedirs(self.config.output_dir, exist_ok=True) @@ -137,6 +205,29 @@ def _setup(self) -> None: self.timing["setup"] = time.time() - start_time self._log(f"Setup completed in {self.timing['setup']:.2f}s") + def _load_config_from_json(self): + """Fallback config loading when AutoConfig fails (unsupported model types).""" + from pathlib import Path + from types import SimpleNamespace + + model_path = Path(self.config.model_path) + config_file = model_path / "config.json" + + # If model_path is a HF hub ID, resolve to local cache + if not config_file.exists(): + from huggingface_hub import hf_hub_download + + config_file = Path(hf_hub_download(self.config.model_path, "config.json")) + + with open(config_file) as f: + config_dict = json.load(f) + + # For multimodal models, text_config holds the LLM settings + text_config = config_dict.get("text_config", {}) + merged = {**config_dict, **text_config} + + return SimpleNamespace(**merged) + def _get_layer_info(self) -> dict[str, Any]: """Get layer information from config.""" info = { @@ -201,6 +292,9 @@ def _determine_exclude_layers(self) -> list[str]: # Add standard patterns exclude.extend(["*embed*", "*norm*"]) + # Exclude MoE router gates (not gate_proj FFN layers) + exclude.append("*.gate") + if self.config.aggressive_exclusion: exclude.extend(["*gate*"]) @@ -216,6 +310,55 @@ def _determine_exclude_layers(self) -> list[str]: exclude.extend(sensitive_layers) # Remove duplicates + exclude = list(set(exclude)) + + # Align exclusions for vLLM fused layer compatibility + exclude = 
self._align_exclude_groups(exclude) + + return exclude + + def _align_exclude_groups(self, exclude: list[str]) -> list[str]: + """ + Ensure fused projection groups are excluded together for vLLM compatibility. + + vLLM fuses certain projections into single linear layers: + - qkv_proj: q_proj + k_proj + v_proj (must all share same scheme) + - gate_up_proj: gate_proj + up_proj (must all share same scheme) + + If any projection in a group is excluded, exclude the entire group. + """ + # Define fused groups: suffixes that must be excluded together + fused_groups = [ + ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"], + ["mlp.gate_proj", "mlp.up_proj"], + ["mlp.shared_experts.gate_proj", "mlp.shared_experts.up_proj"], + ] + + # Find layer prefixes from exclude list (e.g., "model.layers.0") + added = set() + for layer_name in list(exclude): + # Skip glob patterns + if "*" in layer_name: + continue + + for group in fused_groups: + # Check if this excluded layer belongs to a fused group + for suffix in group: + if layer_name.endswith(suffix): + # Extract the prefix (e.g., "model.layers.0") + prefix = layer_name[: -len(suffix)] + # Add all members of this group + for member_suffix in group: + member = prefix + member_suffix + if member not in exclude and member not in added: + added.add(member) + self._log(f" + {member} (aligned with {layer_name})") + break + + if added: + self._log(f"Aligned {len(added)} additional layers for vLLM fused layer compatibility") + + exclude.extend(added) return list(set(exclude)) def _analyze_sensitive_layers(self) -> list[str]: @@ -261,7 +404,9 @@ def _run_sequential_sensitivity_analysis(self) -> list[str]: analyzer = SequentialSensitivityAnalyzer( config=self.config, + metric=self._resolve_sensitivity_metric(), cache_on_gpu=self.config.sensitivity_cache_on_gpu, + template=self.template, ) result = analyzer.analyze() @@ -330,8 +475,10 @@ def _run_iterative_sensitivity_analysis(self) -> list[str]: # Create analyzer with current exclusion list analyzer = SequentialSensitivityAnalyzer( config=self.config, + metric=self._resolve_sensitivity_metric(), cache_on_gpu=cache_on_gpu, initial_exclude_layers=all_excluded, + template=self.template, ) # Run analysis @@ -447,16 +594,24 @@ def _create_quant_config(self, exclude_layers: list[str]) -> QConfig: quant_scheme = self._get_quant_scheme() self._log(f"Using quantization scheme: {quant_scheme}") + # Determine algorithm (None for RTN, raise for unsupported) + algorithm = self._resolve_algorithm() + if algorithm: + self._log(f"Using quantization algorithm: {algorithm}") + else: + self._log("Using quantization algorithm: rtn") + # Create base quant config if self.template: quant_config = self.template.get_config( scheme=quant_scheme, + algorithm=algorithm, exclude_layers=exclude_layers, ) else: quant_config = QConfig( global_quant_config=QLayerConfig( - weight=Int4PerGroupSpec(group_size=128).to_quantization_spec() + weight=Int4PerGroupSpec(group_size=128, ch_axis=0).to_quantization_spec() ), exclude=exclude_layers, ) @@ -853,6 +1008,7 @@ def _run_lazy_quantization(self) -> QuantizationResult: self._log("\n=== Assembling HuggingFace format ===") self._assemble_hf_format() + self.timing["total"] = time.time() - total_start result.success = True @@ -1127,14 +1283,7 @@ def _run_full_gpu_quantization(self) -> QuantizationResult: # Get calibration data self._log("Loading calibration data...") - calib_loader = get_calib_dataloader( - dataset_name_or_path=self.config.calibration_data, - tokenizer=self.tokenizer, - 
batch_size=self.config.batch_size, - num_calib_data=self.config.num_calib_samples, - seqlen=self.config.seq_len, - device=self.config.device, - ) + calib_loader = self._get_calibration_dataloader() # Quantize self._log("Quantizing model...") @@ -1160,6 +1309,88 @@ def _run_full_gpu_quantization(self) -> QuantizationResult: self.tokenizer.save_pretrained(self.config.output_dir) + + self.timing["total"] = time.time() - total_start + + result.success = True + result.output_dir = self.config.output_dir + result.model_type = self.model_type + result.quant_scheme = self._get_quant_scheme() + result.precision = self.config.precision + result.timing = self.timing + + self._print_summary(result) + + except Exception as e: + result.success = False + result.error_message = str(e) + self._log(f"Error during quantization: {e}") + import traceback + traceback.print_exc() + + return result + + def _run_file2file_quantization(self) -> QuantizationResult: + """ + Run file-to-file quantization using Quark's quantize_model_per_safetensor. + + Processes each safetensors shard independently without loading the full model + into memory. Produces properly packed uint8 weights with E8M0 scales that + vLLM can load natively as a Quark-quantized checkpoint. + + This is the recommended path for MXFP4/MXFP6 quantization, matching how AMD + publishes official MXFP4 models (e.g., Kimi-K2.5-MXFP4). + """ + from quark.torch.quantization.file2file_quantization import quantize_model_per_safetensor + + total_start = time.time() + result = QuantizationResult(success=False) + + try: + # Setup (load config, detect model type, get template) + self._setup() + + # Determine exclusions (including sensitivity analysis if enabled) + exclude_layers = self._determine_exclude_layers() + result.exclude_layers_used = exclude_layers + self._log(f"Exclude layers: {exclude_layers}") + + # Create quantization config + quant_config = self._create_quant_config(exclude_layers) + + self._log(f"\n{'=' * 60}") + self._log("FILE-TO-FILE QUANTIZATION") + self._log(f"{'=' * 60}") + self._log(f"Model: {self.config.model_path}") + self._log(f"Output: {self.config.output_dir}") + self._log(f"Precision: {self.config.precision}") + self._log(f"Device: {self.config.device}") + self._log(f"{'=' * 60}") + + # Resolve model path to local directory + # file2file requires a local path with safetensors files, not a HF hub ID + model_path = self.config.model_path + if not os.path.isdir(model_path): + from huggingface_hub import snapshot_download + + self._log(f"Downloading model from HuggingFace: {model_path}") + model_path = snapshot_download(model_path) + self._log(f"Model downloaded to: {model_path}") + + # Run file-to-file quantization + self._log("Running file-to-file quantization...") + quant_start = time.time() + + quantize_model_per_safetensor( + pretrained_model_path=model_path, + quant_config=quant_config, + save_path=self.config.output_dir, + device=self.config.device, + ) + + self.timing["quantization"] = time.time() - quant_start + self._log(f"File-to-file quantization completed in {self.timing['quantization']:.2f}s") + self.timing["total"] = time.time() - total_start result.success = True @@ -1209,18 +1440,27 @@ def run(self) -> QuantizationResult: Returns: QuantizationResult with details of the quantization """ - # Determine strategy + # Resolve algorithm early to provide immediate feedback + self._resolve_algorithm() + + # Use file-to-file for MXFP precisions (produces vLLM-compatible packed uint8) + # This path skips auto-strategy detection since 
it doesn't need the model in memory + if self.config.precision.startswith("mxfp"): + return self._run_file2file_quantization() + + # Determine memory strategy for non-MXFP precisions if self.config.memory_strategy == "auto": - # Need to load config first for auto-detection - self.hf_config = AutoConfig.from_pretrained( - self.config.model_path, trust_remote_code=self.config.trust_remote_code - ) + try: + self.hf_config = AutoConfig.from_pretrained( + self.config.model_path, trust_remote_code=self.config.trust_remote_code + ) + except (ValueError, KeyError): + self.hf_config = self._load_config_from_json() strategy = self._auto_detect_strategy() self._log(f"Auto-detected memory strategy: {strategy}") else: strategy = self.config.memory_strategy - # Dispatch to appropriate strategy if strategy == "lazy": return self._run_lazy_quantization() elif strategy == "layerwise_cpu": diff --git a/src/quanto/utils/model_utils.py b/src/quanto/utils/model_utils.py index d979624..a224af6 100644 --- a/src/quanto/utils/model_utils.py +++ b/src/quanto/utils/model_utils.py @@ -35,9 +35,17 @@ def detect_model_type(model_path: str, trust_remote_code: bool = True) -> str: config = json.load(f) model_type = config.get("model_type", config.get("architectures", ["unknown"])[0]) else: - # Load config from model using transformers - config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code) - model_type = getattr(config, "model_type", getattr(config, "architectures", ["unknown"])[0]) + # Try AutoConfig first, fall back to JSON download for unsupported model types + try: + config = AutoConfig.from_pretrained(model_path, trust_remote_code=trust_remote_code) + model_type = getattr(config, "model_type", getattr(config, "architectures", ["unknown"])[0]) + except (ValueError, KeyError): + from huggingface_hub import hf_hub_download + + config_file = hf_hub_download(model_path, "config.json") + with open(config_file) as f: + config = json.load(f) + model_type = config.get("model_type", config.get("architectures", ["unknown"])[0]) return model_type diff --git a/tests/test_unified_quantizer.py b/tests/test_unified_quantizer.py index e2b93ac..33f450b 100644 --- a/tests/test_unified_quantizer.py +++ b/tests/test_unified_quantizer.py @@ -21,6 +21,7 @@ sys.path.insert(0, str(Path(__file__).parent.parent / "src")) from quanto import UnifiedConfig, UnifiedQuantizer +from quanto.core.sensitivity.scorer import SensitivityMetric class TestUnifiedConfig: @@ -39,6 +40,7 @@ def test_default_config(self): assert config.calibration_data == "pileval" assert config.num_calib_samples == 128 assert config.device == "cuda" + assert config.sensitivity_metric == "relative" def test_custom_config(self): """Test custom configuration values.""" @@ -99,6 +101,27 @@ def test_to_dict(self): assert d["output_dir"] == "/tmp/output" assert d["precision"] == "int4" assert d["memory_strategy"] == "auto" + assert d["sensitivity_metric"] == "relative" + + def test_invalid_sensitivity_metric(self): + """Test that invalid sensitivity metric raises error.""" + with pytest.raises(ValueError, match="Invalid sensitivity_metric"): + UnifiedConfig( + model_path="/tmp/model", + output_dir="/tmp/output", + sensitivity_metric="invalid", + ) + + def test_sensitivity_metric_mapping(self): + """Test sensitivity metric mapping to enum values.""" + config = UnifiedConfig( + model_path="/tmp/model", + output_dir="/tmp/output", + sensitivity_metric="mse", + ) + quantizer = UnifiedQuantizer(config) + + assert quantizer._resolve_sensitivity_metric() == 
SensitivityMetric.MSE def test_from_dict(self): """Test config deserialization.""" @@ -112,6 +135,58 @@ def test_from_dict(self): assert config.model_path == "/tmp/model" assert config.precision == "fp8" + def test_algorithm_precision_validation(self): + """Test that algorithm-precision combinations are validated.""" + # AWQ only supports INT4 precisions + with pytest.raises(ValueError, match="Precision.*not supported.*algorithm"): + UnifiedConfig( + model_path="/tmp/model", + output_dir="/tmp/output", + algorithm="awq", + precision="mxfp4", # MXFP4 not supported for AWQ + ) + + # GPTQ only supports INT4 + with pytest.raises(ValueError, match="Precision.*not supported.*algorithm"): + UnifiedConfig( + model_path="/tmp/model", + output_dir="/tmp/output", + algorithm="gptq", + precision="int8", # INT8 not supported for GPTQ + ) + + def test_algorithm_precision_supported(self): + """Test that valid algorithm-precision combinations are accepted.""" + # AWQ + INT4 should be valid + config = UnifiedConfig( + model_path="/tmp/model", + output_dir="/tmp/output", + algorithm="awq", + precision="int4", + ) + assert config.algorithm == "awq" + assert config.precision == "int4" + + # GPTQ + INT4 should be valid + config = UnifiedConfig( + model_path="/tmp/model", + output_dir="/tmp/output", + algorithm="gptq", + precision="int4", + ) + assert config.algorithm == "gptq" + assert config.precision == "int4" + + # RTN + MXFP4 should be valid + config = UnifiedConfig( + model_path="/tmp/model", + output_dir="/tmp/output", + algorithm="rtn", + precision="mxfp4", + ) + assert config.algorithm == "rtn" + assert config.precision == "mxfp4" + class TestAutoDetectStrategy: """Tests for auto-detect strategy logic."""
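End-to-end usage sketch for the new algorithm support (model path and output directories are placeholders; actually running the quantizer still requires a GPU environment with Quark installed):

```python
from quanto import UnifiedConfig, UnifiedQuantizer

# Valid: AWQ is INT4-only, so int4 / int4_64 / int4_32 are accepted.
config = UnifiedConfig(
    model_path="/models/example-llm",  # placeholder
    output_dir="./example-awq-int4",
    precision="int4_64",
    algorithm="awq",
    calibration_data="pileval",
    num_calib_samples=128,
)
result = UnifiedQuantizer(config).run()

# Invalid combinations fail fast at config construction time.
try:
    UnifiedConfig(
        model_path="/models/example-llm",
        output_dir="./out",
        precision="mxfp4",
        algorithm="awq",  # AWQ + MXFP4 is rejected by UnifiedConfig.validate()
    )
except ValueError as err:
    print(err)
```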