Pushkinist · Pushkinist · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/crates/rmlx-server/src/engine/image.rs b/crates/rmlx-server/src/engine/image.rs
@@ -15,6 +15,61 @@ use rmlx_core::Error;
 pub(crate) use rmlx_models::gemma4::BOI_TOKEN_ID as GEMMA4_BOI_TOKEN_ID;
 pub(crate) use rmlx_models::gemma4::EOI_TOKEN_ID as GEMMA4_EOI_TOKEN_ID;
 
+/// Gemma4 user-turn opener `<|turn>user\n` (token ids `<|turn>`=105, `user`=2364,
+/// `\n`=107). The image block splices in right after this so the image lands
+/// *inside* the user turn it belongs to — matching the HF/mlx-vlm processor,
+/// which substitutes the `<|image|>` placeholder in place within the user
+/// message rather than before the turn.
+pub(crate) const GEMMA4_USER_TURN_OPENER: [u32; 3] = [105, 2364, 107];
+
+/// Splice per-image token blocks (`<boi>` + N×image-token + `<eoi>`) into the
+/// prompt so the image conditions the turn it belongs to.
+///
+/// The image content item is part of the user message, so the block must land
+/// *inside* the user turn — right after `turn_opener` (the per-arch user-turn
+/// opener token sequence, e.g. [`GEMMA4_USER_TURN_OPENER`]). This mirrors the
+/// HF / mlx-vlm processor, which substitutes the `<|image|>` placeholder in
+/// place. Inserting the block *before* the user turn (after BOS) leaves the
+/// image outside the question and makes the model frequently fail to ground on
+/// it — image-independent, repetitive output.
+///
+/// The **last** opener match is used so a multi-turn prompt splices into the
+/// final user turn (the one the image accompanies), not an earlier one. When
+/// `turn_opener` is empty or not found (non-standard template), fall back to
+/// inserting after the prompt's leading BOS token.
+#[allow(
+    clippy::indexing_slicing,
+    reason = "insert_at is bounded by prompt_tokens.len(): it is either 0/1 (after-BOS) \
+              or a window position plus the matched opener length, all within bounds. \
+              The early-return to after_bos when turn_opener.is_empty() prevents windows(0) \
+              (which would panic); the turn_opener.len() > prompt_tokens.len() guard prevents \
+              the oversized-opener case where windows() would yield no results."
+)]
+pub(crate) fn splice_image_block(
+    prompt_tokens: &[u32],
+    blocks: &[Vec<u32>],
+    turn_opener: &[u32],
+) -> Vec<u32> {
+    let total: usize = blocks.iter().map(Vec::len).sum();
+    let after_bos = usize::from(!prompt_tokens.is_empty());
+    let insert_at = if turn_opener.is_empty() || turn_opener.len() > prompt_tokens.len() {
+        after_bos
+    } else {
+        prompt_tokens
+            .windows(turn_opener.len())
+            .enumerate()
+            .rfind(|(_, w)| *w == turn_opener)
+            .map_or(after_bos, |(pos, _)| pos + turn_opener.len())
+    };
+    let mut aug = Vec::with_capacity(prompt_tokens.len() + total);
+    aug.extend_from_slice(&prompt_tokens[..insert_at]);
+    for b in blocks {
+        aug.extend_from_slice(b);
+    }
+    aug.extend_from_slice(&prompt_tokens[insert_at..]);
+    aug
+}
+
 /// bundle of the multimodal components needed to turn image bytes
 /// into scattered `inputs_embeds`. Loaded once per model. One variant per
 /// vision-capable architecture (Gemma4's custom SigLIP, Gemma3's standard SigLIP).
@@ -70,25 +125,6 @@ pub(crate) fn build_image_prompt(
     mm_cache: Option<&rmlx_models::multimodal_cache::MultimodalCache>,
     model_sig: u64,
 ) -> rmlx_core::Result<(Vec<u32>, rmlx_mlx::Array, rmlx_mlx::Array)> {
-    // Splice per-image token blocks (`<boi>` + N×image-token + `<eoi>`) in after
-    // the prompt's leading BOS token so the image conditions the whole turn
-    // causally. Shared between archs; only the token ids + soft counts differ.
-    #[allow(
-        clippy::indexing_slicing,
-        reason = "bounds established by construction: buffer sized at init, loop indices bounded by slice length, or validated before call"
-    )]
-    fn splice(prompt_tokens: &[u32], blocks: &[Vec<u32>]) -> Vec<u32> {
-        let total: usize = blocks.iter().map(Vec::len).sum();
-        let insert_at = usize::from(!prompt_tokens.is_empty());
-        let mut aug = Vec::with_capacity(prompt_tokens.len() + total);
-        aug.extend_from_slice(&prompt_tokens[..insert_at]);
-        for b in blocks {
-            aug.extend_from_slice(b);
-        }
-        aug.extend_from_slice(&prompt_tokens[insert_at..]);
-        aug
-    }
-
     match vb {
         VisionBundle::Gemma4 {
             vision,
@@ -126,7 +162,7 @@ pub(crate) fn build_image_prompt(
                     b
                 })
                 .collect();
-            let aug_ids = splice(prompt_tokens, &blocks);
+            let aug_ids = splice_image_block(prompt_tokens, &blocks, &GEMMA4_USER_TURN_OPENER);
 
             let total_soft: usize = pixels.iter().map(|p| p.num_soft_tokens).sum();
             let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count();
@@ -185,7 +221,7 @@ pub(crate) fn build_image_prompt(
                     b
                 })
                 .collect();
-            let aug_ids = splice(prompt_tokens, &blocks);
+            let aug_ids = splice_image_block(prompt_tokens, &blocks, &GEMMA4_USER_TURN_OPENER);
 
             let total_soft: usize = pixels.iter().map(|(_, n)| *n).sum();
             let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count();
@@ -240,7 +276,9 @@ pub(crate) fn build_image_prompt(
                     b
                 })
                 .collect();
-            let aug_ids = splice(prompt_tokens, &blocks);
+            // Gemma3's user-turn opener ids are not verified against a snapshot
+            // here, so keep the after-BOS fallback (empty opener) for it.
+            let aug_ids = splice_image_block(prompt_tokens, &blocks, &[]);
 
             let total_soft: usize = pixels.iter().map(|p| p.num_soft_tokens).sum();
             let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count();
@@ -274,6 +312,11 @@ pub(crate) const QWEN3VL_VISION_END_ID: u32 = 151_653;
 /// Qwen3-VL-MoE `<|image_pad|>` token id (the scatter target).
 pub(crate) const QWEN3VL_IMAGE_PAD_ID: u32 = 151_655;
 
+/// Qwen3-VL-MoE user-turn opener `<|im_start|>user\n` (token ids 151644, 872, 198).
+/// The vision block splices in right after this, matching the mlx-vlm prompt format
+/// `<|im_start|>user\n<|vision_start|>...<|vision_end|><text><|im_end|>`.
+pub(crate) const QWEN3VL_USER_TURN_OPENER: [u32; 3] = [151_644, 872, 198];
+
 /// full Qwen3-VL-MoE image request: preprocess images, run the ViT,
 /// splice the per-image vision block (`<|vision_start|>` + N×`<|image_pad|>` +
 /// `<|vision_end|>`) after the prompt's leading token, then decode via the
@@ -451,11 +494,11 @@ pub(crate) fn run_qwen3vl_image(
 
     // Splice the vision block `<|vision_start|>` + N×`<|image_pad|>` +
     // `<|vision_end|>` immediately after the user-turn opener `<|im_start|>user\n`
-    // (token ids 151644, 872, 198) and before the user text — matching the
-    // mlx-vlm Qwen3-VL prompt format
-    // `<|im_start|>user\n<|vision_start|>...<|vision_end|><text><|im_end|>`. If
-    // that opener is not found (non-standard template), fall back to inserting
-    // after the leading token.
+    // — matching the mlx-vlm Qwen3-VL prompt format
+    // `<|im_start|>user\n<|vision_start|>...<|vision_end|><text><|im_end|>`. The
+    // last opener match is used so a multi-turn prompt splices into the final user
+    // turn. If the opener is not found (non-standard template), falls back to
+    // inserting after the leading token.
     let mut block = Vec::with_capacity(pv.num_soft_tokens + 2);
     block.push(QWEN3VL_VISION_START_ID);
     block.extend(std::iter::repeat_n(
@@ -464,17 +507,7 @@ pub(crate) fn run_qwen3vl_image(
     ));
     block.push(QWEN3VL_VISION_END_ID);
 
-    const IM_START: u32 = 151_644;
-    const USER: u32 = 872;
-    const NL: u32 = 198;
-    let insert_at = prompt_tokens
-        .windows(3)
-        .position(|w| w == [IM_START, USER, NL])
-        .map_or(usize::from(!prompt_tokens.is_empty()), |p| p + 3);
-    let mut aug_ids = Vec::with_capacity(prompt_tokens.len() + block.len());
-    aug_ids.extend_from_slice(&prompt_tokens[..insert_at]);
-    aug_ids.extend_from_slice(&block);
-    aug_ids.extend_from_slice(&prompt_tokens[insert_at..]);
+    let aug_ids = splice_image_block(prompt_tokens, &[block], &QWEN3VL_USER_TURN_OPENER);
 
     let in_prompt = aug_ids
         .iter()
@@ -514,3 +547,7 @@ pub(crate) fn rmlx_server_load_image(source: &str) -> rmlx_core::Result<Vec<u8>>
     crate::image_io::load_image(source, std::time::Duration::from_secs(20))
         .map_err(|e| Error::Other(format!("image load failed: {e}")))
 }
+
+#[cfg(test)]
+#[path = "image_tests.rs"]
+mod image_tests;
diff --git a/crates/rmlx-server/src/engine/image_tests.rs b/crates/rmlx-server/src/engine/image_tests.rs
@@ -0,0 +1,174 @@
+//! Image-prompt splice-placement tests.
+//!
+//! These cover [`splice_image_block`] — the token-level placement of the
+//! per-image `<boi> + N×image + <eoi>` block. The end-to-end vision decode is
+//! exercised by the real-model integration path; here we assert the block lands
+//! *inside* the user turn (the bug this logic fixes was inserting it before the
+//! turn, leaving the image outside the question).
+#![allow(
+    clippy::indexing_slicing,
+    reason = "test fixtures: index/slice bounds are fixed by the literal prompt arrays constructed in each test"
+)]
+
+use super::{splice_image_block, GEMMA4_USER_TURN_OPENER, QWEN3VL_USER_TURN_OPENER};
+
+/// A minimal image block placeholder (real blocks are `<boi> + N×img + <eoi>`;
+/// the placement logic is independent of the block contents).
+fn block(marker: u32, n: usize) -> Vec<u32> {
+    let mut b = vec![900];
+    b.extend(std::iter::repeat_n(marker, n));
+    b.push(901);
+    b
+}
+
+#[test]
+fn gemma4_block_inserts_inside_user_turn() {
+    // <bos> <|turn> user \n  "hi"  <turn|> \n
+    let bos = 2u32;
+    let opener = GEMMA4_USER_TURN_OPENER; // [105, 2364, 107]
+    let prompt = [bos, opener[0], opener[1], opener[2], 555, 106, 107];
+    let blk = block(258_880, 3);
+    let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &opener);
+
+    // The block must appear AFTER the opener and BEFORE the user text token 555.
+    let opener_end = 4; // bos + 3 opener tokens
+    assert_eq!(
+        &aug[..opener_end],
+        &prompt[..opener_end],
+        "prefix preserved"
+    );
+    assert_eq!(
+        &aug[opener_end..opener_end + blk.len()],
+        blk.as_slice(),
+        "block spliced right after the user-turn opener"
+    );
+    assert_eq!(
+        aug[opener_end + blk.len()],
+        555,
+        "user text follows the block (image is inside the turn, before the text)"
+    );
+}
+
+#[test]
+fn gemma4_block_uses_last_user_turn_in_multi_turn() {
+    let bos = 2u32;
+    let o = GEMMA4_USER_TURN_OPENER;
+    // two user turns; image accompanies the second (last) one.
+    let prompt = [
+        bos, o[0], o[1], o[2], 10, 106, 107, // turn 1: user "10"
+        o[0], o[1], o[2], 20, 106, 107, // turn 2: user "20"
+    ];
+    let blk = block(258_880, 2);
+    let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &o);
+
+    // Insert position is after the SECOND opener (index 7 + 3 = 10).
+    let second_opener_end = 10;
+    assert_eq!(
+        &aug[second_opener_end..second_opener_end + blk.len()],
+        blk.as_slice(),
+        "block spliced into the final user turn, not the first"
+    );
+    assert_eq!(
+        aug[second_opener_end + blk.len()],
+        20,
+        "second turn text follows"
+    );
+    // The first turn must be untouched (no block before token 10).
+    assert_eq!(&aug[..second_opener_end], &prompt[..second_opener_end]);
+}
+
+#[test]
+fn falls_back_to_after_bos_when_opener_absent() {
+    let bos = 2u32;
+    let prompt = [bos, 500, 501, 502];
+    let blk = block(258_880, 2);
+    // Empty opener forces the after-BOS fallback (e.g. the Gemma3 path).
+    let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &[]);
+    assert_eq!(aug[0], bos, "BOS stays first");
+    assert_eq!(
+        &aug[1..=blk.len()],
+        blk.as_slice(),
+        "block spliced right after BOS"
+    );
+    assert_eq!(
+        &aug[1 + blk.len()..],
+        &prompt[1..],
+        "rest of prompt preserved"
+    );
+
+    // A non-empty opener that is not present also falls back to after-BOS.
+    let aug2 = splice_image_block(
+        &prompt,
+        std::slice::from_ref(&blk),
+        &GEMMA4_USER_TURN_OPENER,
+    );
+    assert_eq!(
+        &aug2[1..=blk.len()],
+        blk.as_slice(),
+        "opener absent → after-BOS"
+    );
+}
+
+#[test]
+fn empty_prompt_inserts_block_at_front() {
+    let blk = block(258_880, 1);
+    let aug = splice_image_block(&[], std::slice::from_ref(&blk), &GEMMA4_USER_TURN_OPENER);
+    assert_eq!(aug, blk, "empty prompt → block is the whole sequence");
+}
+
+#[test]
+fn multi_image_blocks_appear_in_order_and_contiguous() {
+    // Two blocks passed together must land in-order and contiguous at the splice
+    // point — no interleaving, no reversal.
+    let bos = 2u32;
+    let o = GEMMA4_USER_TURN_OPENER;
+    let prompt = [bos, o[0], o[1], o[2], 555, 106, 107];
+    let blk1 = block(111, 2); // first image block
+    let blk2 = block(222, 3); // second image block
+    let aug = splice_image_block(&prompt, &[blk1.clone(), blk2.clone()], &o);
+
+    let opener_end = 4usize; // bos + 3 opener tokens
+                             // Both blocks appear right after the opener, blk1 first, blk2 immediately after.
+    assert_eq!(
+        &aug[opener_end..opener_end + blk1.len()],
+        blk1.as_slice(),
+        "first block lands first"
+    );
+    assert_eq!(
+        &aug[opener_end + blk1.len()..opener_end + blk1.len() + blk2.len()],
+        blk2.as_slice(),
+        "second block immediately follows first"
+    );
+    // User text token 555 follows both blocks without any prompt tokens in between.
+    assert_eq!(
+        aug[opener_end + blk1.len() + blk2.len()],
+        555,
+        "user text follows both blocks"
+    );
+    // Total length is consistent.
+    assert_eq!(
+        aug.len(),
+        prompt.len() + blk1.len() + blk2.len(),
+        "length adds up"
+    );
+}
+
+#[test]
+fn qwen3vl_single_turn_splice_matches_last_opener() {
+    // Verify that routing Qwen3-VL through splice_image_block (last-match)
+    // produces the same insert position as the former first-match code for a
+    // single-turn prompt (first == last when there is only one opener).
+    let bos = 1u32;
+    let o = QWEN3VL_USER_TURN_OPENER; // [151644, 872, 198]
+    let prompt = [bos, o[0], o[1], o[2], 999, 151_645];
+    let blk = block(151_655, 4);
+    let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &o);
+
+    let opener_end = 4usize; // bos + 3 opener tokens
+    assert_eq!(
+        &aug[opener_end..opener_end + blk.len()],
+        blk.as_slice(),
+        "vision block lands right after the user-turn opener"
+    );
+    assert_eq!(aug[opener_end + blk.len()], 999, "user text follows");
+}
diff --git a/docs/MODELS.md b/docs/MODELS.md
@@ -697,6 +697,18 @@ are f32) — adopt / restore the operand dtype, mirroring mlx-lm's weak-typed
 Python floats. A unit-level dtype-lock test guards this against regression; see
 docs/KV_QUANT.md "Gemma4 global `--kv-quant none` KV is bf16".
 
+**Image-prompt placement (inside the user turn).** The per-image block
+(`<boi>` + N × `<image_soft_token>` + `<eoi>`) is spliced **inside** the user
+turn — immediately after the user-turn opener `<|turn>user\n` (token ids
+`[105, 2364, 107]`) — matching the HF/mlx-vlm processor, which substitutes the
+`<|image|>` placeholder in place within the user message. Placing the block
+before the turn (after BOS) leaves the image outside the question and makes the
+model frequently fail to ground on it (image-independent, repetitive output —
+e.g. "you have not provided an image"); the effect is prompt-phrasing-dependent
+and hits low-precision QAT-fp4 vision snapshots hardest. The last user-turn
+opener is used so multi-turn prompts splice into the final user turn. When the
+opener is absent (non-standard template) the splice falls back to after-BOS.
+
 **Conformer audio encoder + native audio input.** Present in e4b and above. The
 SSCP subsampling (two conv stages reducing the time dimension by 4×),
 Macaron-style FFW blocks with `residual_weight=0.5`, chunked local attention, and