Merged
README.ko.md (23 changes: 15 additions & 8 deletions)
@@ -260,18 +260,23 @@ Models with QK-norm normalize key vectors to the unit sphere,
 
 ## Advanced Usage
 
+> **Note:** `MODEL` below is a placeholder for the path to **your** GGUF file. You can use the `models/SmolLM2-135M-Instruct-Q8_0.gguf` downloaded in the Quick Start above as-is, or substitute any other GGUF file path. No file literally named `model.gguf` exists.
+
 ```bash
+# Point to a GGUF file on disk (example: the file downloaded in Quick Start):
+MODEL=models/SmolLM2-135M-Instruct-Q8_0.gguf
+
 # Delta compression (maximum context, 8.5x)
-./build/quant model.gguf --chat -p "hello" -k uniform_3b -v q4 --delta
+./build/quant $MODEL --chat -p "hello" -k uniform_3b -v q4 --delta
 
 # Perplexity benchmark
-./build/quant model.gguf --ppl input.txt -k uniform_4b -v q4
+./build/quant $MODEL --ppl input.txt -k uniform_4b -v q4
 
 # Model info
-./build/quant model.gguf --info
+./build/quant $MODEL --info
 
 # Performance profiling
-./build/quant model.gguf --chat -p "hello" -n 50 --profile
+./build/quant $MODEL --chat -p "hello" -n 50 --profile
 ```

---
@@ -285,7 +290,7 @@ Models with QK-norm normalize key vectors to the unit sphere,
 #include "quant.h"
 
 int main() {
-    quant_model* m = quant_load("model.gguf");
+    quant_model* m = quant_load("path/to/your.gguf");  // path to your own GGUF file
     quant_ctx* c = quant_new(m, NULL);
 
     // Streaming
@@ -337,13 +342,15 @@ python3 -m http.server 8080 # start a local server
 **Docker** (zero-dependency, ~10MB image):
 ```bash
 docker build -t quant.cpp .
-docker run -v ./models:/models quant.cpp /models/model.gguf -p "hello" -k uniform_4b -v q4
+docker run -v ./models:/models quant.cpp /models/SmolLM2-135M-Instruct-Q8_0.gguf -p "hello" -k uniform_4b -v q4
+# Replace with the name of the GGUF file you placed in ./models
 ```

 **OpenAI-compatible server** (`/v1/chat/completions`):
 ```bash
 cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build
-./build/quant-server model.gguf -p 8080 -k uniform_4b
+./build/quant-server models/SmolLM2-135M-Instruct-Q8_0.gguf -p 8080 -k uniform_4b
+# Substitute your own GGUF file path as needed
 
 # Compatible with the OpenAI Python SDK
 curl http://localhost:8080/v1/chat/completions \
@@ -363,7 +370,7 @@ cd bindings/python && pip install .
 ```python
 from quantcpp import Model
 
-with Model("model.gguf", kv_compress=1) as m:
+with Model("models/SmolLM2-135M-Instruct-Q8_0.gguf", kv_compress=1) as m:  # change to your own GGUF path
     print(m.ask("프랑스의 수도는?"))
 
 # Streaming
README.md (25 changes: 16 additions & 9 deletions)
@@ -34,7 +34,7 @@ pip install quantcpp
 ```python
 from quantcpp import Model
 
-m = Model("model.gguf")
+m = Model("path/to/your.gguf")  # any GGUF file you have on disk
 print(m.ask("What is 2+2?"))
 
 # Streaming
@@ -310,18 +310,23 @@ Models with QK-norm normalize keys to the unit sphere, creating extremely sparse
 
 ## Advanced Usage
 
+> **Note:** `MODEL` below is a placeholder for **your** GGUF file path. The Quick Start above downloads `models/SmolLM2-135M-Instruct-Q8_0.gguf` — you can paste that path directly, or substitute any other GGUF you have. There is no file literally named `model.gguf`.
+
 ```bash
+# Pick any GGUF you have on disk (this is the one from Quick Start):
+MODEL=models/SmolLM2-135M-Instruct-Q8_0.gguf
+
 # Delta compression (maximum context, 8.5x)
-./build/quant model.gguf --chat -p "hello" -k uniform_3b -v q4 --delta
+./build/quant $MODEL --chat -p "hello" -k uniform_3b -v q4 --delta
 
 # Perplexity benchmark
-./build/quant model.gguf --ppl input.txt -k uniform_4b -v q4
+./build/quant $MODEL --ppl input.txt -k uniform_4b -v q4
 
 # Model info
-./build/quant model.gguf --info
+./build/quant $MODEL --info
 
 # Performance profiling
-./build/quant model.gguf --chat -p "hello" -n 50 --profile
+./build/quant $MODEL --chat -p "hello" -n 50 --profile
 ```
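Since every command in this hunk now reads the path from `MODEL`, a typo fails only when the loader runs. A small pre-flight guard surfaces the mistake immediately; this is a sketch, not part of quant.cpp, and the `require_model` helper name is hypothetical (assumes a POSIX shell):

```shell
#!/bin/sh
# Hypothetical helper: check that a GGUF path exists before handing it to
# ./build/quant, so a bad MODEL value fails fast with a clear message
# instead of a confusing loader error.
require_model() {
    [ -f "$1" ] || { echo "error: GGUF not found at $1" >&2; return 1; }
}

MODEL=${MODEL:-models/SmolLM2-135M-Instruct-Q8_0.gguf}
if require_model "$MODEL"; then
    echo "using $MODEL"
else
    echo "fix MODEL before running ./build/quant"
fi
```

Dropping this at the top of any script built from the examples above keeps the error local to the script rather than inside the loader.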

---
@@ -335,7 +340,7 @@ Models with QK-norm normalize keys to the unit sphere, creating extremely sparse
 #include "quant.h"
 
 int main() {
-    quant_model* m = quant_load("model.gguf");
+    quant_model* m = quant_load("path/to/your.gguf");  // any GGUF file
     quant_ctx* c = quant_new(m, NULL);
 
     // Streaming
@@ -387,13 +392,15 @@ Everything runs client-side. Nothing is uploaded. KV compression active by default.
 **Docker** (zero-dependency, ~10MB image):
 ```bash
 docker build -t quant.cpp .
-docker run -v ./models:/models quant.cpp /models/model.gguf -p "hello" -k uniform_4b -v q4
+docker run -v ./models:/models quant.cpp /models/SmolLM2-135M-Instruct-Q8_0.gguf -p "hello" -k uniform_4b -v q4
+# Replace SmolLM2-135M-Instruct-Q8_0.gguf with whatever GGUF you placed in ./models
 ```
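One portability note on the bind mount above: some Docker versions reject relative host paths like `./models` in `-v`, so expanding to an absolute path first is safer. A sketch under that assumption (POSIX shell; the command is printed for inspection rather than executed):

```shell
#!/bin/sh
# Resolve ./models to an absolute path before bind-mounting; falls back to
# $PWD/models if the directory does not exist yet.
MODELS_DIR="$(cd ./models 2>/dev/null && pwd || echo "$PWD/models")"

# Print the docker invocation with the absolute mount path substituted in.
echo docker run -v "$MODELS_DIR:/models" quant.cpp \
    /models/SmolLM2-135M-Instruct-Q8_0.gguf -p "hello" -k uniform_4b -v q4
```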

 **OpenAI-compatible server** (`/v1/chat/completions`):
 ```bash
 cmake -B build -DTQ_BUILD_SERVER=ON && cmake --build build
-./build/quant-server model.gguf -p 8080 -k uniform_4b
+./build/quant-server models/SmolLM2-135M-Instruct-Q8_0.gguf -p 8080 -k uniform_4b
+# Substitute your own GGUF path as needed
 
 # Works with the OpenAI Python SDK
 curl http://localhost:8080/v1/chat/completions \
@@ -413,7 +420,7 @@ cd bindings/python && pip install .
 ```python
 from quantcpp import Model
 
-with Model("model.gguf", kv_compress=1) as m:
+with Model("models/SmolLM2-135M-Instruct-Q8_0.gguf", kv_compress=1) as m:  # use your own GGUF path
     print(m.ask("What is the capital of France?"))
 
 # Streaming