Skip to content

Commit b2c8f90

Browse files
unamedkr authored and claude committed
wasm: streaming tokens + IndexedDB model cache
Two UX improvements: 1. **Streaming**: switched from quant_ask (returns full string) to quant_generate with on_token callback. Each token is sent to JS via js_on_token as it's generated, so the user sees text appear in real time instead of waiting for the full response. 2. **Model caching**: downloaded GGUF is stored in IndexedDB after first download. On page refresh, the "Try Demo" button shows "Load cached SmolLM2-135M (instant)" and loads from local storage instead of re-downloading 135 MB from HuggingFace. Flow: first visit → download → cache → load; on refresh → check cache → instant load. NOTE: streaming requires WASM rebuild (emcc) to take effect — the current quant.wasm binary still uses the old quant_ask path. The JS side is ready; the C change will apply on next `bash wasm/build.sh`. The IndexedDB caching is pure JS and works immediately. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6089212 commit b2c8f90

File tree

2 files changed

+72
-16
lines changed

2 files changed

+72
-16
lines changed

wasm/index.html

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,40 @@ <h2>LLM in Your Browser — 189 KB</h2>
135135
let modelLoaded = false;
136136
let generating = false;
137137

138+
// ---- IndexedDB model cache ----
139+
const DB_NAME = 'quantcpp_cache';
140+
const DB_STORE = 'models';
141+
const DEMO_KEY = 'smollm2-135m';
142+
143+
// Open (or lazily create) the IndexedDB database used to cache models.
// Resolves with the open IDBDatabase handle; rejects with the open error.
function openDB() {
  return new Promise((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, 1);
    // First open (or version bump) — create the object store for model bytes.
    request.onupgradeneeded = () => {
      request.result.createObjectStore(DB_STORE);
    };
    request.onsuccess = () => resolve(request.result);
    request.onerror = () => reject(request.error);
  });
}
151+
152+
// Persist downloaded model bytes under `key` so later visits can skip the
// network. Resolves once the readwrite transaction commits; rejects on error.
async function cacheModel(key, bytes) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction(DB_STORE, 'readwrite');
    const store = transaction.objectStore(DB_STORE);
    store.put(bytes, key);
    transaction.oncomplete = () => resolve();
    transaction.onerror = () => reject(transaction.error);
  });
}
161+
162+
// Look up previously cached model bytes by `key`.
// Resolves with the stored value, or null when nothing is cached under `key`;
// rejects if the read request itself fails.
async function getCachedModel(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const store = db.transaction(DB_STORE, 'readonly').objectStore(DB_STORE);
    const request = store.get(key);
    request.onsuccess = () => resolve(request.result || null);
    request.onerror = () => reject(request.error);
  });
}
171+
138172
// File handling
139173
const dropzone = document.getElementById('dropzone');
140174
const fileInput = document.getElementById('fileInput');
@@ -164,15 +198,27 @@ <h2>LLM in Your Browser — 189 KB</h2>
164198
document.getElementById('loading').classList.remove('active');
165199
}
166200

167-
// Demo model auto-download from HuggingFace
201+
// Demo model — cache-first, download only if not in IndexedDB
168202
async function loadDemoModel() {
169203
const url = 'https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/smollm2-135m-instruct-q8_0.gguf';
170204
const btn = document.getElementById('demoBtn');
171205
btn.disabled = true;
172-
btn.textContent = 'Downloading...';
173-
showLoading('Downloading SmolLM2-135M (~135 MB)...');
174206

175207
try {
208+
// 1. Try cache first
209+
showLoading('Checking local cache...');
210+
const cached = await getCachedModel(DEMO_KEY);
211+
if (cached) {
212+
btn.textContent = 'Loading from cache...';
213+
showLoading('Loading cached model...');
214+
loadModelFromBytes(new Uint8Array(cached), 'smollm2-135m (cached)');
215+
return;
216+
}
217+
218+
// 2. Download from HuggingFace
219+
btn.textContent = 'Downloading...';
220+
showLoading('Downloading SmolLM2-135M (~135 MB)...');
221+
176222
const response = await fetch(url);
177223
if (!response.ok) throw new Error(`HTTP ${response.status}`);
178224

@@ -191,16 +237,19 @@ <h2>LLM in Your Browser — 189 KB</h2>
191237
const mb = (received / 1048576).toFixed(0);
192238
const totalMb = (total / 1048576).toFixed(0);
193239
document.getElementById('loadingText').textContent =
194-
`Downloading SmolLM2-135M... ${pct}% (${mb}/${totalMb} MB)`;
240+
`Downloading... ${pct}% (${mb}/${totalMb} MB)`;
195241
}
196242
}
197243

198-
// Combine chunks into a single ArrayBuffer
199244
const blob = new Blob(chunks);
200245
const arrayBuffer = await blob.arrayBuffer();
201246
const data = new Uint8Array(arrayBuffer);
202247

203-
document.getElementById('loadingText').textContent = 'Loading model into WASM...';
248+
// 3. Cache for next time
249+
showLoading('Caching model for instant reload...');
250+
await cacheModel(DEMO_KEY, arrayBuffer).catch(() => {});
251+
252+
showLoading('Loading model into WASM...');
204253
loadModelFromBytes(data, 'smollm2-135m-instruct-q8_0.gguf');
205254
} catch (err) {
206255
hideLoading();
@@ -210,6 +259,18 @@ <h2>LLM in Your Browser — 189 KB</h2>
210259
}
211260
}
212261

262+
// On page load, probe the IndexedDB cache; if the demo model is already
// stored, restyle the demo button to advertise an instant (no-download) load.
window.addEventListener('load', async () => {
  try {
    const cached = await getCachedModel(DEMO_KEY);
    if (!cached) return;
    const btn = document.getElementById('demoBtn');
    if (!btn) return; // button not in the DOM — nothing to restyle
    btn.textContent = '▶ Load cached SmolLM2-135M (instant)';
    btn.style.background = '#047857';
  } catch (err) {
    // Cache probing is best-effort: on failure the default "download" label
    // simply remains. Log at debug level instead of swallowing silently.
    console.debug('model cache check failed:', err);
  }
});
273+
213274
function addMessage(role, text) {
214275
const chat = document.getElementById('chat');
215276
const div = document.createElement('div');

wasm/quant_wasm.c

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -113,22 +113,17 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
113113

114114
double t0 = emscripten_get_now();
115115

116-
/* Use the streaming API via callback */
117-
/* For now, use quant_ask and stream the result */
118-
char* result = quant_ask(g_ctx, prompt);
116+
/* Streaming generation via per-token callback */
117+
int n_tokens = quant_generate(g_ctx, prompt, on_token, NULL);
119118

120119
double elapsed = emscripten_get_now() - t0;
121120

122-
if (result && result[0] != '\0') {
123-
/* Send full result (quant_ask doesn't use callback) */
124-
js_on_token(result);
125-
int n_tokens = (int)strlen(result) / 4; /* rough estimate */
121+
if (n_tokens > 0) {
126122
js_on_done(n_tokens, elapsed);
127-
quant_free_string(result);
128123
} else {
129-
if (result) quant_free_string(result);
130124
js_on_done(0, elapsed);
131-
js_on_status("No output — try a different prompt");
125+
if (g_output_pos == 0)
126+
js_on_status("No output — try a different prompt");
132127
}
133128

134129
g_generating = 0;

0 commit comments

Comments
 (0)