From 1fb9697668b9c972771b1d5c648e9ee61fc45bba Mon Sep 17 00:00:00 2001 From: Ziming Date: Thu, 2 Apr 2026 13:59:43 -0400 Subject: [PATCH 1/4] [Examples] Add Gemma 4 E4B NVFP4A16 quantization example Add NVFP4A16 weight-only quantization example for google/gemma-4-E4B-it. Includes a Dockerfile since Gemma 4 requires transformers from git main which is newer than the version currently pinned by llmcompressor. The ignore list skips vision_tower, audio_tower, embed_vision, and embed_audio modules which are specific to Gemma 4's multimodal architecture. Uses AutoModelForImageTextToText and AutoProcessor as required by the Gemma 4 model class. Tested end-to-end: quantization, sample generation, and model saving all complete successfully. Signed-off-by: Ziming --- .../nvfp4/Dockerfile.gemma4 | 24 ++++++++++ .../nvfp4/gemma4_example.py | 45 +++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 examples/quantization_w4a16_fp4/nvfp4/Dockerfile.gemma4 create mode 100644 examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py diff --git a/examples/quantization_w4a16_fp4/nvfp4/Dockerfile.gemma4 b/examples/quantization_w4a16_fp4/nvfp4/Dockerfile.gemma4 new file mode 100644 index 0000000000..2e7d02377c --- /dev/null +++ b/examples/quantization_w4a16_fp4/nvfp4/Dockerfile.gemma4 @@ -0,0 +1,24 @@ +FROM nvcr.io/nvidia/pytorch:25.04-py3 + +WORKDIR /workspace + +# Install llmcompressor and upgrade transformers for Gemma 4 support. +# Gemma 4 (model_type: gemma4) requires transformers from git main which is newer +# than the version currently pinned by llmcompressor. +# +# Step 1: Install llmcompressor (keeps NVIDIA constraint file so torch/cuda stay). +# Step 2: Force-upgrade transformers, huggingface_hub, regex (bypass constraints). 
+RUN pip install --no-deps git+https://github.com/vllm-project/llm-compressor.git \ + "compressed-tensors>=0.14.1a2" loguru "datasets>=4.0.0" accelerate \ + "auto-round>=0.10.2" nvidia-ml-py && \ + PIP_CONSTRAINT="" pip install multiprocess dill xxhash fsspec && \ + PIP_CONSTRAINT="" pip install --force-reinstall \ + git+https://github.com/huggingface/transformers.git \ + "huggingface_hub>=1.5.0" \ + "regex>=2025.10.22" \ + tokenizers safetensors && \ + pip install "numpy<2" + +COPY gemma4_example.py . + +ENTRYPOINT ["python", "gemma4_example.py"] diff --git a/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py b/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py new file mode 100644 index 0000000000..f1d19ed1e3 --- /dev/null +++ b/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py @@ -0,0 +1,45 @@ +from compressed_tensors.offload import dispatch_model +from transformers import AutoModelForImageTextToText, AutoProcessor + +from llmcompressor import oneshot +from llmcompressor.modifiers.quantization import QuantizationModifier + +# Load model. +MODEL_ID = "google/gemma-4-E4B-it" +model = AutoModelForImageTextToText.from_pretrained(MODEL_ID, dtype="auto") +processor = AutoProcessor.from_pretrained(MODEL_ID) + +# Configure the quantization algorithm and scheme. +# In this case, we: +# * quantize the weights to fp4 with per group 16 via ptq +# * skip the vision encoder, audio encoder, embedding projections, and lm_head +recipe = QuantizationModifier( + targets="Linear", + scheme="NVFP4A16", + ignore=[ + "lm_head", + "re:.*vision_tower.*", + "re:.*audio_tower.*", + "re:.*embed_vision.*", + "re:.*embed_audio.*", + ], +) + +# Apply quantization. 
+oneshot(model=model, recipe=recipe) + +print("\n\n========== SAMPLE GENERATION ==============") +dispatch_model(model) +messages = [ + {"role": "user", "content": "Hello my name is"}, +] +text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) +inputs = processor(text=text, return_tensors="pt").to(model.device) +output = model.generate(**inputs, max_new_tokens=100) +print(processor.decode(output[0], skip_special_tokens=True)) +print("==========================================\n\n") + +# Save to disk in compressed-tensors format. +SAVE_DIR = MODEL_ID.rstrip("/").split("/")[-1] + "-NVFP4A16" +model.save_pretrained(SAVE_DIR, save_compressed=True) +processor.save_pretrained(SAVE_DIR) From eeb751b20c64b74001652cd0df18985900b03ad8 Mon Sep 17 00:00:00 2001 From: Ziming Date: Fri, 3 Apr 2026 02:07:40 -0400 Subject: [PATCH 2/4] Move Dockerfile install instructions into gemma4_example.py Per review feedback, remove the standalone Dockerfile and add install instructions as comments in the example script. Signed-off-by: Ziming --- .../nvfp4/Dockerfile.gemma4 | 24 ------------------- .../nvfp4/gemma4_example.py | 5 ++++ 2 files changed, 5 insertions(+), 24 deletions(-) delete mode 100644 examples/quantization_w4a16_fp4/nvfp4/Dockerfile.gemma4 diff --git a/examples/quantization_w4a16_fp4/nvfp4/Dockerfile.gemma4 b/examples/quantization_w4a16_fp4/nvfp4/Dockerfile.gemma4 deleted file mode 100644 index 2e7d02377c..0000000000 --- a/examples/quantization_w4a16_fp4/nvfp4/Dockerfile.gemma4 +++ /dev/null @@ -1,24 +0,0 @@ -FROM nvcr.io/nvidia/pytorch:25.04-py3 - -WORKDIR /workspace - -# Install llmcompressor and upgrade transformers for Gemma 4 support. -# Gemma 4 (model_type: gemma4) requires transformers from git main which is newer -# than the version currently pinned by llmcompressor. -# -# Step 1: Install llmcompressor (keeps NVIDIA constraint file so torch/cuda stay). 
-# Step 2: Force-upgrade transformers, huggingface_hub, regex (bypass constraints). -RUN pip install --no-deps git+https://github.com/vllm-project/llm-compressor.git \ - "compressed-tensors>=0.14.1a2" loguru "datasets>=4.0.0" accelerate \ - "auto-round>=0.10.2" nvidia-ml-py && \ - PIP_CONSTRAINT="" pip install multiprocess dill xxhash fsspec && \ - PIP_CONSTRAINT="" pip install --force-reinstall \ - git+https://github.com/huggingface/transformers.git \ - "huggingface_hub>=1.5.0" \ - "regex>=2025.10.22" \ - tokenizers safetensors && \ - pip install "numpy<2" - -COPY gemma4_example.py . - -ENTRYPOINT ["python", "gemma4_example.py"] diff --git a/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py b/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py index f1d19ed1e3..52218b4f8b 100644 --- a/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py +++ b/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py @@ -1,3 +1,8 @@ +# Gemma 4 requires transformers >= 5.5.0 (model_type: gemma4). 
+# If your llmcompressor pins an older version, install with: +# pip install --no-deps llmcompressor +# pip install git+https://github.com/huggingface/transformers.git + from compressed_tensors.offload import dispatch_model from transformers import AutoModelForImageTextToText, AutoProcessor From a2b2a3e37a65274ecb1697849fadb2db58ff21ab Mon Sep 17 00:00:00 2001 From: Ziming Date: Fri, 3 Apr 2026 13:41:35 -0400 Subject: [PATCH 3/4] Apply review suggestions: use pip install llmcompressor and transformers>=5.5 Signed-off-by: Ziming --- examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py b/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py index 52218b4f8b..c17c691503 100644 --- a/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py +++ b/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py @@ -1,7 +1,7 @@ # Gemma 4 requires transformers >= 5.5.0 (model_type: gemma4). 
# If your llmcompressor pins an older version, install with: -# pip install --no-deps llmcompressor -# pip install git+https://github.com/huggingface/transformers.git +# pip install llmcompressor +# pip install "transformers>=5.5" from compressed_tensors.offload import dispatch_model from transformers import AutoModelForImageTextToText, AutoProcessor From b7903ce1362fef9b3d0366434a22950e0d237edf Mon Sep 17 00:00:00 2001 From: Ziming Date: Fri, 3 Apr 2026 16:27:01 -0400 Subject: [PATCH 4/4] Fix line length lint error in gemma4_example.py Signed-off-by: Ziming --- examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py b/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py index c17c691503..cce291d8bc 100644 --- a/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py +++ b/examples/quantization_w4a16_fp4/nvfp4/gemma4_example.py @@ -38,7 +38,9 @@ messages = [ {"role": "user", "content": "Hello my name is"}, ] -text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) +text = processor.apply_chat_template( + messages, add_generation_prompt=True, tokenize=False +) inputs = processor(text=text, return_tensors="pt").to(model.device) output = model.generate(**inputs, max_new_tokens=100) print(processor.decode(output[0], skip_special_tokens=True))