-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathexercise_1_hello_world.py
More file actions
214 lines (172 loc) · 7.67 KB
/
exercise_1_hello_world.py
File metadata and controls
214 lines (172 loc) · 7.67 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
"""
================================================================================
Exercise 1: Hello World of Compression
================================================================================
Learning Objectives:
1. Understand the basic oneshot() API
2. Understand QuantizationModifier configuration
3. Understand FP8 Dynamic quantization scheme
Key Concepts:
- FP8 Dynamic: Weights are quantized to FP8, activations are dynamically
quantized per-token during inference
- No calibration data needed for FP8 Dynamic
- This is the simplest starting point for model compression
Expected Output:
- A quantized model saved to disk
- Sample generation to verify the model works
================================================================================
"""
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from llmcompressor import oneshot
from llmcompressor.modifiers.quantization import QuantizationModifier
def _load_model_and_tokenizer(model_id):
    """Load the causal LM and tokenizer for *model_id*.

    Returns a ``(model, tokenizer)`` tuple, or ``(None, None)`` if loading
    fails (e.g. no network / no disk space).
    """
    print(f"\n[Step 1] Loading model: {model_id}")
    print("This may take a moment if downloading for the first time...")
    try:
        # device_map="auto" distributes the model across available devices;
        # torch_dtype="auto" uses the checkpoint's native precision
        # (usually float16).
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype="auto",
        )
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        # Ensure pad_token is set (required for batched inference).
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        print("Model loaded successfully!")
        print(f"Model dtype: {model.dtype}")
        print(f"Model device: {model.device}")
        return model, tokenizer
    except Exception as e:
        print(f"Error loading model: {e}")
        print("Tip: Make sure you have enough disk space and internet connection.")
        return None, None


def _build_fp8_recipe():
    """Build and return the FP8-Dynamic post-training quantization recipe.

    A recipe is a declarative description of which optimizations to apply;
    QuantizationModifier is the simplest modifier — it just applies PTQ.
    """
    print("\n[Step 2] Configuring Quantization Recipe")
    recipe = QuantizationModifier(
        # targets: which layers to quantize. "Linear" matches every
        # torch.nn.Linear module; regex patterns such as
        # "model.layers.\\d+.mlp.*" also work.
        targets="Linear",
        # scheme: "FP8_DYNAMIC" means
        #   - Weights: FP8 (8-bit floating point), static per-channel
        #   - Activations: FP8, dynamic per-token (computed at runtime)
        # This scheme does NOT require calibration data.
        scheme="FP8_DYNAMIC",
        # ignore: layers to skip. lm_head maps hidden states to the
        # vocabulary and is very sensitive to quantization, so it is
        # usually kept in FP16.
        ignore=["lm_head"],
    )
    print("Recipe configured:")
    print(" - Targets: Linear layers")
    print(" - Scheme: FP8_DYNAMIC")
    print(" - Ignored: lm_head")
    return recipe


def _verify_generation(model, tokenizer):
    """Generate a short sample to confirm the quantized model still works.

    Returns True on success, False if generation raised.
    """
    print("\n[Step 4] Verifying Model Output")
    try:
        prompt = "Hello, my name is"
        inputs = tokenizer(prompt, return_tensors="pt")
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        # No gradients needed for inference-only sampling.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=50,
                do_sample=True,
                temperature=0.7,
                top_p=0.9,
            )
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        print(f"\nPrompt: {prompt}")
        print(f"Generated: {generated_text}")
        return True
    except Exception as e:
        print(f"Error during generation: {e}")
        return False


def _save_and_report(model, tokenizer, output_dir):
    """Save model + tokenizer to *output_dir* and print per-file sizes.

    The checkpoint is written in safetensors format with quantization
    metadata and can be loaded directly by vLLM. Returns True on success.
    """
    print("\n[Step 5] Saving Quantized Model")
    try:
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        print(f"Model saved to: {output_dir}")
        # Print file sizes so learners can compare against the FP16 original.
        import os
        total_size = 0
        for f in os.listdir(output_dir):
            fpath = os.path.join(output_dir, f)
            if os.path.isfile(fpath):
                size = os.path.getsize(fpath)
                total_size += size
                print(f" {f}: {size / 1024 / 1024:.2f} MB")
        print(f"Total size: {total_size / 1024 / 1024:.2f} MB")
        return True
    except Exception as e:
        print(f"Error saving model: {e}")
        return False


def run_hello_world_quantization():
    """Run the complete FP8-Dynamic quantization walkthrough.

    Workflow: load TinyLlama, build a QuantizationModifier recipe, apply it
    with oneshot() (no calibration data needed), sanity-check generation,
    and save the compressed checkpoint to ``./tinyllama-fp8-dynamic``.
    Each step prints progress and aborts the run on failure.
    """
    print("=" * 70)
    print("Exercise 1: Hello World of Compression")
    print("=" * 70)

    # ---- Step 1: load model and tokenizer -------------------------------
    # TinyLlama is small (~1.1B parameters) and downloads quickly.
    # In production you would use a larger model like Llama-3-8B.
    model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    model, tokenizer = _load_model_and_tokenizer(model_id)
    if model is None:
        return

    # ---- Step 2: define the quantization recipe -------------------------
    recipe = _build_fp8_recipe()

    # ---- Step 3: apply quantization --------------------------------------
    # oneshot() is the main entry point for compression. For FP8_DYNAMIC no
    # calibration data is needed, so no dataset is passed and oneshot
    # automatically selects the data-free pipeline.
    print("\n[Step 3] Applying Quantization")
    print("This may take a few minutes...")
    try:
        oneshot(
            model=model,
            recipe=recipe,
            # No dataset needed for FP8_DYNAMIC!
        )
        print("Quantization completed successfully!")
    except Exception as e:
        print(f"Error during quantization: {e}")
        return

    # ---- Step 4: verify the quantized model produces sensible output ----
    if not _verify_generation(model, tokenizer):
        return

    # ---- Step 5: save the quantized model --------------------------------
    if not _save_and_report(model, tokenizer, "./tinyllama-fp8-dynamic"):
        return

    # ---- Summary ----------------------------------------------------------
    print("\n" + "=" * 70)
    print("Exercise 1 Complete!")
    print("=" * 70)
    print("""
Key Takeaways:
1. QuantizationModifier is the simplest way to quantize a model
2. FP8_DYNAMIC doesn't need calibration data
3. Always ignore lm_head for better accuracy
4. Always verify output after quantization
5. Models are saved in safetensors format for vLLM compatibility
Next Steps:
- Try Exercise 2 to learn about GPTQ with calibration data
- Try Exercise 3 to inspect the quantized weights
""")
# Script entry point: run the exercise only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    run_hello_world_quantization()