
Commit 155642f

feat: introduce GPU Direct Storage and DiskKVCache support for improved performance
- Added `use_gds` parameter to enable GPU Direct Storage with `kvikio-cu12`, allowing layers to load directly from disk to GPU.
- Introduced `kv_cache_dir` option for offloading KV cache to SSD, supporting long contexts (50k+ tokens).
- Updated README and CHANGELOG to reflect new features and usage instructions.
- Modified Makefile and pyproject.toml to include new dependencies and installation options.
1 parent d04a5a2 commit 155642f

14 files changed

Lines changed: 987 additions & 10 deletions

.gitignore

Lines changed: 3 additions & 2 deletions
```diff
@@ -49,5 +49,6 @@ coverage.xml
 *.swo
 
 # Local model cache (downloads and split layers) — do not commit
-/models/
-/.models/
+/.models/
+
+examples
```

CHANGELOG.md

Lines changed: 14 additions & 0 deletions
```diff
@@ -2,6 +2,20 @@
 
 All notable changes to RabbitLLM are documented here.
 
+## [1.1.0]
+
+### Added
+- **GPU Direct Storage (kvikio)**: When `kvikio-cu12` is installed, layers load directly from disk to GPU,
+  bypassing CPU and pin_memory. Install with `pip install rabbitllm[gds]`.
+- **DiskKVCache**: `kv_cache_dir` option to offload KV cache to SSD for 50k+ token contexts.
+- **example.py** in project root for quick onboarding.
+- **samples/** directory with sample text for long-context testing.
+- `use_gds` parameter (default `True`) to enable/disable kvikio when available.
+
+### Changed
+- `load_layer_to_cpu` now tries kvikio (GDS) first when available and compression is not used.
+- README documents `use_gds`, `kv_cache_dir`, and the optional `[gds]` extra.
+
 ## [1.0.1] — 2026-02-22
 
 ### Fixed
```
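The Changed entry above describes a "kvikio first, CPU fallback" load path. As a rough illustration of that pattern, here is a minimal sketch — only the behavior of `load_layer_to_cpu`, the `use_gds` flag, and the `kvikio-cu12` dependency come from this commit; the function below, its signature, and the raw-bytes handling are assumptions:

```python
import os

import torch


def load_layer_bytes(path: str, device: str, compression: str | None = None,
                     use_gds: bool = True) -> torch.Tensor:
    """Sketch of a GDS-first load: disk -> GPU via kvikio, CPU staging otherwise."""
    if use_gds and compression is None and device.startswith("cuda"):
        try:
            import kvikio  # provided by the optional [gds] extra (kvikio-cu12)

            buf = torch.empty(os.path.getsize(path), dtype=torch.uint8, device=device)
            with kvikio.CuFile(path, "r") as f:
                f.read(buf)  # direct disk -> GPU transfer; no CPU copy, no pin_memory
            return buf  # caller deserializes the raw bytes into layer tensors
        except ImportError:
            pass  # kvikio not installed: fall through to the CPU path
    with open(path, "rb") as f:
        data = bytearray(f.read())
    return torch.frombuffer(data, dtype=torch.uint8)  # classic CPU staging fallback
```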

Makefile

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,7 +1,7 @@
 .PHONY: install dev lint format test test-cov typecheck clean bash
 
 install:
-	uv sync
+	uv sync --extra gds
 
 dev: install
 
```

README.md

Lines changed: 37 additions & 0 deletions
````diff
@@ -130,6 +130,8 @@ model = AutoModel.from_pretrained(
     max_seq_len=512,           # maximum sequence length
     prefetching=True,          # overlap layer loading with compute
     prefetch_pin_memory=True,  # faster CPU→GPU for small/medium models
+    use_gds=True,              # GPU Direct Storage (kvikio) when available
+    kv_cache_dir=None,         # path to offload KV cache for long context (50k+ tokens)
     token="hf_...",            # HuggingFace token for gated repos
     layer_shards_saving_path="/path/to/cache",  # custom split cache directory
     profiling_mode=False,      # print per-layer timing
@@ -150,6 +152,41 @@ model = AutoModel.from_pretrained("Qwen/Qwen2.5-0.5B-Instruct", compression="4bi
 
 Requires `bitsandbytes`: `pip install bitsandbytes`.
 
+### GPU Direct Storage (optional)
+
+For CUDA without compression, install `kvikio-cu12` to load layers directly from disk to GPU,
+bypassing CPU and pin_memory (can significantly speed up 70B+ models):
+
+```bash
+pip install rabbitllm[gds]
+# or: pip install kvikio-cu12
+```
+
+Set `use_gds=False` to disable.
+
+### Long context (KV cache on disk)
+
+For 50k+ token contexts, pass `kv_cache_dir` to offload KV cache to SSD:
+
+```python
+model = AutoModel.from_pretrained("Qwen/Qwen2.5-72B-Instruct", kv_cache_dir="./kv_cache")
+```
+
+### Benchmarking improvements
+
+To measure GDS and DiskKVCache improvements:
+
+```bash
+# Local: make install pulls in kvikio (--extra gds)
+make install
+uv run python scripts/benchmark_improvements.py --mode gds
+uv run python scripts/benchmark_improvements.py --mode long_context
+
+# Docker (make bash): install with GDS first
+pip install -e ".[gds]"
+python scripts/benchmark_improvements.py --mode gds
+```
+
 ### Gated models
 
 Pass a HuggingFace token for repos that require access approval:
````
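The long-context section above leaves DiskKVCache itself opaque. A minimal sketch of what a per-layer disk-backed KV cache can look like — the name `DiskKVCache` and the `kv_cache_dir` option come from this commit, while the class below, its methods, and the file layout are illustrative assumptions:

```python
import os

import torch


class DiskKVCacheSketch:
    """Illustrative KV cache that parks per-layer key/value tensors on SSD so a
    50k+ token context never has to stay resident in GPU or CPU memory at once."""

    def __init__(self, cache_dir: str):
        self.cache_dir = cache_dir
        os.makedirs(cache_dir, exist_ok=True)

    def offload(self, layer_idx: int, key: torch.Tensor, value: torch.Tensor) -> None:
        # After a layer's forward pass, push its KV tensors to disk.
        torch.save((key.cpu(), value.cpu()), self._path(layer_idx))

    def fetch(self, layer_idx: int, device: str) -> tuple[torch.Tensor, torch.Tensor]:
        # Just before the layer runs again, pull its KV tensors back.
        return torch.load(self._path(layer_idx), map_location=device)

    def _path(self, layer_idx: int) -> str:
        return os.path.join(self.cache_dir, f"layer_{layer_idx:03d}.pt")
```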

example.py

Lines changed: 56 additions & 0 deletions
New file:

```python
#!/usr/bin/env python3
"""
RabbitLLM example — minimal inference script.

Run: python example.py
Or: uv run python example.py

Uses a small model (Qwen2.5-0.5B) for fast testing. For larger models or long
context, see scripts/quickstart.py and the Configuration section in README.
"""

import warnings

import torch
from rabbitllm import AutoModel

with warnings.catch_warnings():
    warnings.filterwarnings("ignore", message=".*CUDA.*unknown error.*", category=UserWarning)
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

model = AutoModel.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    device=device,
    compression="4bit",
)

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2? Answer briefly."},
]

input_text = model.tokenizer.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
tokens = model.tokenizer(
    [input_text], return_tensors="pt", truncation=True, max_length=512
)
input_ids = tokens["input_ids"].to(device)
attention_mask = tokens.get("attention_mask")
if attention_mask is None:
    attention_mask = torch.ones_like(input_ids, dtype=torch.long, device=device)
else:
    attention_mask = attention_mask.to(device)

output = model.generate(
    input_ids,
    attention_mask=attention_mask,
    max_new_tokens=64,
    use_cache=True,
    do_sample=True,
    temperature=0.6,
    return_dict_in_generate=True,
)

input_len = tokens["input_ids"].shape[1]
print(model.tokenizer.decode(output.sequences[0][input_len:], skip_special_tokens=True))
```
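To exercise this commit's new options from the same script, the loader call can be adapted as below — a sketch with illustrative values; note the CHANGELOG states the GDS path applies only when compression is not used, so `compression="4bit"` is dropped here:

```python
# Variant of the from_pretrained call above using the new parameters (illustrative).
model = AutoModel.from_pretrained(
    "Qwen/Qwen2.5-0.5B-Instruct",
    device=device,
    use_gds=True,               # try kvikio GPU Direct Storage when installed
    kv_cache_dir="./kv_cache",  # offload KV cache to SSD for long contexts
)
```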

pyproject.toml

Lines changed: 1 addition & 0 deletions
```diff
@@ -36,6 +36,7 @@ classifiers = [
 [project.optional-dependencies]
 compression = ["bitsandbytes"]
 flash = ["flash-attn>=2.5"]
+gds = ["kvikio-cu12"]
 server = []
 
 [tool.hatch.build.targets.wheel]
```
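Because `gds` is only an optional extra, the library has to feature-detect it at runtime rather than import it unconditionally. A common pattern — an assumption about the implementation, not code from this commit:

```python
# Probe for the optional GDS dependency at import time (sketch, not RabbitLLM's code).
try:
    import kvikio  # noqa: F401  # present only when installed via rabbitllm[gds]

    HAS_GDS = True
except ImportError:
    HAS_GDS = False  # fall back to the CPU / pin_memory load path
```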
