Skip to content

Commit b2c8f90

Browse files
unamedkr authored and claude committed
wasm: streaming tokens + IndexedDB model cache
Two UX improvements: 1. **Streaming**: switched from quant_ask (returns full string) to quant_generate with on_token callback. Each token is sent to JS via js_on_token as it's generated, so the user sees text appear in real time instead of waiting for the full response. 2. **Model caching**: downloaded GGUF is stored in IndexedDB after first download. On page refresh, the "Try Demo" button shows "Load cached SmolLM2-135M (instant)" and loads from local storage instead of re-downloading 135 MB from HuggingFace. Flow: first visit → download → cache → load; on refresh → check cache → instant load. NOTE: streaming requires WASM rebuild (emcc) to take effect — the current quant.wasm binary still uses the old quant_ask path. The JS side is ready; the C change will apply on next `bash wasm/build.sh`. The IndexedDB caching is pure JS and works immediately. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 6089212 commit b2c8f90

File tree

2 files changed

+72
-16
lines changed

2 files changed

+72
-16
lines changed

wasm/index.html

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -135,6 +135,40 @@ <h2>LLM in Your Browser — 189 KB</h2>
135135
let modelLoaded = false;
136136
let generating = false;
137137

138+
// ---- IndexedDB model cache ----
139+
const DB_NAME = 'quantcpp_cache';
140+
const DB_STORE = 'models';
141+
const DEMO_KEY = 'smollm2-135m';
142+
143+
// Open (or lazily create) the IndexedDB database used to cache models.
// Resolves with the open IDBDatabase handle; rejects with the open error.
function openDB() {
  return new Promise((resolve, reject) => {
    const request = indexedDB.open(DB_NAME, 1);
    // First open (or version bump) — create the object store for model bytes.
    request.onupgradeneeded = () => {
      request.result.createObjectStore(DB_STORE);
    };
    request.onsuccess = () => resolve(request.result);
    request.onerror = () => reject(request.error);
  });
}
151+
152+
// Persist downloaded model bytes under `key` so later visits can skip the
// network. Resolves once the readwrite transaction commits; rejects on error.
async function cacheModel(key, bytes) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const transaction = db.transaction(DB_STORE, 'readwrite');
    const store = transaction.objectStore(DB_STORE);
    store.put(bytes, key);
    transaction.oncomplete = () => resolve();
    transaction.onerror = () => reject(transaction.error);
  });
}
161+
162+
// Look up previously cached model bytes by `key`.
// Resolves with the stored value, or null when nothing is cached under `key`;
// rejects if the read request itself fails.
async function getCachedModel(key) {
  const db = await openDB();
  return new Promise((resolve, reject) => {
    const store = db.transaction(DB_STORE, 'readonly').objectStore(DB_STORE);
    const request = store.get(key);
    request.onsuccess = () => resolve(request.result || null);
    request.onerror = () => reject(request.error);
  });
}
171+
138172
// File handling
139173
const dropzone = document.getElementById('dropzone');
140174
const fileInput = document.getElementById('fileInput');
@@ -164,15 +198,27 @@ <h2>LLM in Your Browser — 189 KB</h2>
164198
document.getElementById('loading').classList.remove('active');
165199
}
166200

167-
// Demo model auto-download from HuggingFace
201+
// Demo model — cache-first, download only if not in IndexedDB
168202
async function loadDemoModel() {
169203
const url = 'https://huggingface.co/Felladrin/gguf-Q8_0-SmolLM2-135M-Instruct/resolve/main/smollm2-135m-instruct-q8_0.gguf';
170204
const btn = document.getElementById('demoBtn');
171205
btn.disabled = true;
172-
btn.textContent = 'Downloading...';
173-
showLoading('Downloading SmolLM2-135M (~135 MB)...');
174206

175207
try {
208+
// 1. Try cache first
209+
showLoading('Checking local cache...');
210+
const cached = await getCachedModel(DEMO_KEY);
211+
if (cached) {
212+
btn.textContent = 'Loading from cache...';
213+
showLoading('Loading cached model...');
214+
loadModelFromBytes(new Uint8Array(cached), 'smollm2-135m (cached)');
215+
return;
216+
}
217+
218+
// 2. Download from HuggingFace
219+
btn.textContent = 'Downloading...';
220+
showLoading('Downloading SmolLM2-135M (~135 MB)...');
221+
176222
const response = await fetch(url);
177223
if (!response.ok) throw new Error(`HTTP ${response.status}`);
178224

@@ -191,16 +237,19 @@ <h2>LLM in Your Browser — 189 KB</h2>
191237
const mb = (received / 1048576).toFixed(0);
192238
const totalMb = (total / 1048576).toFixed(0);
193239
document.getElementById('loadingText').textContent =
194-
`Downloading SmolLM2-135M... ${pct}% (${mb}/${totalMb} MB)`;
240+
`Downloading... ${pct}% (${mb}/${totalMb} MB)`;
195241
}
196242
}
197243

198-
// Combine chunks into a single ArrayBuffer
199244
const blob = new Blob(chunks);
200245
const arrayBuffer = await blob.arrayBuffer();
201246
const data = new Uint8Array(arrayBuffer);
202247

203-
document.getElementById('loadingText').textContent = 'Loading model into WASM...';
248+
// 3. Cache for next time
249+
showLoading('Caching model for instant reload...');
250+
await cacheModel(DEMO_KEY, arrayBuffer).catch(() => {});
251+
252+
showLoading('Loading model into WASM...');
204253
loadModelFromBytes(data, 'smollm2-135m-instruct-q8_0.gguf');
205254
} catch (err) {
206255
hideLoading();
@@ -210,6 +259,18 @@ <h2>LLM in Your Browser — 189 KB</h2>
210259
}
211260
}
212261

262+
// On page load, probe the IndexedDB cache; if the demo model is already
// stored, restyle the demo button to advertise an instant (no-download) load.
window.addEventListener('load', async () => {
  try {
    const cached = await getCachedModel(DEMO_KEY);
    if (!cached) return;
    const btn = document.getElementById('demoBtn');
    if (!btn) return; // button not in the DOM — nothing to restyle
    btn.textContent = '▶ Load cached SmolLM2-135M (instant)';
    btn.style.background = '#047857';
  } catch (err) {
    // Cache probing is best-effort: on failure the default "download" label
    // simply remains. Log at debug level instead of swallowing silently.
    console.debug('model cache check failed:', err);
  }
});
273+
213274
function addMessage(role, text) {
214275
const chat = document.getElementById('chat');
215276
const div = document.createElement('div');

wasm/quant_wasm.c

Lines changed: 5 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -113,22 +113,17 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {
113113

114114
double t0 = emscripten_get_now();
115115

116-
/* Use the streaming API via callback */
117-
/* For now, use quant_ask and stream the result */
118-
char* result = quant_ask(g_ctx, prompt);
116+
/* Streaming generation via per-token callback */
117+
int n_tokens = quant_generate(g_ctx, prompt, on_token, NULL);
119118

120119
double elapsed = emscripten_get_now() - t0;
121120

122-
if (result && result[0] != '\0') {
123-
/* Send full result (quant_ask doesn't use callback) */
124-
js_on_token(result);
125-
int n_tokens = (int)strlen(result) / 4; /* rough estimate */
121+
if (n_tokens > 0) {
126122
js_on_done(n_tokens, elapsed);
127-
quant_free_string(result);
128123
} else {
129-
if (result) quant_free_string(result);
130124
js_on_done(0, elapsed);
131-
js_on_status("No output — try a different prompt");
125+
if (g_output_pos == 0)
126+
js_on_status("No output — try a different prompt");
132127
}
133128

134129
g_generating = 0;

0 commit comments

Comments
 (0)