Commit 6089212

unamedkr and claude committed
wasm: fix generation hang — add ChatML template + async generate
Two bugs caused the WASM demo to hang after the model loads:

1. **Missing chat template**: the user prompt was sent raw ("hello?") without ChatML wrapping. SmolLM2-Instruct generates 0 tokens without the <|im_start|>user/assistant template — the same bug we fixed in the Python bindings (v0.8.3). Fix: the JS wraps the prompt in ChatML before calling wasm_generate.

2. **UI freeze**: wasm_generate is synchronous and blocks the main thread, so the browser cannot update the UI while inference runs. Fix: wrap the WASM call in setTimeout(50 ms) to yield one frame for the spinner.

Also fixed: free(result) → quant_free_string(result), for consistency with the cross-heap safety pattern, plus better empty-result handling.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 618845b commit 6089212

File tree

2 files changed: +19 −12 lines changed


wasm/index.html

Lines changed: 14 additions & 9 deletions
@@ -300,16 +300,21 @@ <h2>LLM in Your Browser — 189 KB</h2>
         addMessage('system', msg);
       };

-      // Call generation
-      const promptPtr = Module.allocateUTF8(text);
-      Module._wasm_generate(promptPtr, 0.7, 256);
-      Module._free(promptPtr);
+      // Wrap with ChatML template (instruct models need this to generate)
+      const chatPrompt = `<|im_start|>user\n${text}<|im_end|>\n<|im_start|>assistant\n`;

-      if (!output) {
-        assistantDiv.innerHTML = '<em style="color:#666">No output generated</em>';
-      }
-      generating = false;
-      document.getElementById('sendBtn').disabled = false;
+      // Run generation asynchronously so the UI doesn't freeze
+      setTimeout(() => {
+        const promptPtr = Module.allocateUTF8(chatPrompt);
+        Module._wasm_generate(promptPtr, 0.7, 256);
+        Module._free(promptPtr);
+
+        if (!output) {
+          assistantDiv.innerHTML = '<em style="color:#666">No output generated. Try a longer prompt.</em>';
+        }
+        generating = false;
+        document.getElementById('sendBtn').disabled = false;
+      }, 50); // yield to browser for one frame to show the spinner
     }
   </script>

wasm/quant_wasm.c

Lines changed: 5 additions & 3 deletions
@@ -119,14 +119,16 @@ int wasm_generate(const char* prompt, float temperature, int max_tokens) {

     double elapsed = emscripten_get_now() - t0;

-    if (result) {
+    if (result && result[0] != '\0') {
         /* Send full result (quant_ask doesn't use callback) */
         js_on_token(result);
         int n_tokens = (int)strlen(result) / 4; /* rough estimate */
         js_on_done(n_tokens, elapsed);
-        free(result);
+        quant_free_string(result);
     } else {
-        js_on_status("Generation failed");
+        if (result) quant_free_string(result);
+        js_on_done(0, elapsed);
+        js_on_status("No output — try a different prompt");
     }

     g_generating = 0;
