diff --git a/crates/rmlx-server/src/engine/image.rs b/crates/rmlx-server/src/engine/image.rs index 0d55549..742b627 100644 --- a/crates/rmlx-server/src/engine/image.rs +++ b/crates/rmlx-server/src/engine/image.rs @@ -15,6 +15,61 @@ use rmlx_core::Error; pub(crate) use rmlx_models::gemma4::BOI_TOKEN_ID as GEMMA4_BOI_TOKEN_ID; pub(crate) use rmlx_models::gemma4::EOI_TOKEN_ID as GEMMA4_EOI_TOKEN_ID; +/// Gemma4 user-turn opener `<|turn>user\n` (token ids `<|turn>`=105, `user`=2364, +/// `\n`=107). The image block splices in right after this so the image lands +/// *inside* the user turn it belongs to — matching the HF/mlx-vlm processor, +/// which substitutes the `<|image|>` placeholder in place within the user +/// message rather than before the turn. +pub(crate) const GEMMA4_USER_TURN_OPENER: [u32; 3] = [105, 2364, 107]; + +/// Splice per-image token blocks (`` + N×image-token + ``) into the +/// prompt so the image conditions the turn it belongs to. +/// +/// The image content item is part of the user message, so the block must land +/// *inside* the user turn — right after `turn_opener` (the per-arch user-turn +/// opener token sequence, e.g. [`GEMMA4_USER_TURN_OPENER`]). This mirrors the +/// HF / mlx-vlm processor, which substitutes the `<|image|>` placeholder in +/// place. Inserting the block *before* the user turn (after BOS) leaves the +/// image outside the question and makes the model frequently fail to ground on +/// it — image-independent, repetitive output. +/// +/// The **last** opener match is used so a multi-turn prompt splices into the +/// final user turn (the one the image accompanies), not an earlier one. When +/// `turn_opener` is empty or not found (non-standard template), fall back to +/// inserting after the prompt's leading BOS token. +#[allow( + clippy::indexing_slicing, + reason = "insert_at is bounded by prompt_tokens.len(): it is either 0/1 (after-BOS) \ + or a window position plus the matched opener length, all within bounds. \ + The early-return to after_bos when turn_opener.is_empty() prevents windows(0) \ + (which would panic); the turn_opener.len() > prompt_tokens.len() guard prevents \ + the oversized-opener case where windows() would yield no results." +)] +pub(crate) fn splice_image_block( + prompt_tokens: &[u32], + blocks: &[Vec], + turn_opener: &[u32], +) -> Vec { + let total: usize = blocks.iter().map(Vec::len).sum(); + let after_bos = usize::from(!prompt_tokens.is_empty()); + let insert_at = if turn_opener.is_empty() || turn_opener.len() > prompt_tokens.len() { + after_bos + } else { + prompt_tokens + .windows(turn_opener.len()) + .enumerate() + .rfind(|(_, w)| *w == turn_opener) + .map_or(after_bos, |(pos, _)| pos + turn_opener.len()) + }; + let mut aug = Vec::with_capacity(prompt_tokens.len() + total); + aug.extend_from_slice(&prompt_tokens[..insert_at]); + for b in blocks { + aug.extend_from_slice(b); + } + aug.extend_from_slice(&prompt_tokens[insert_at..]); + aug +} + /// bundle of the multimodal components needed to turn image bytes /// into scattered `inputs_embeds`. Loaded once per model. One variant per /// vision-capable architecture (Gemma4's custom SigLIP, Gemma3's standard SigLIP). @@ -70,25 +125,6 @@ pub(crate) fn build_image_prompt( mm_cache: Option<&rmlx_models::multimodal_cache::MultimodalCache>, model_sig: u64, ) -> rmlx_core::Result<(Vec, rmlx_mlx::Array, rmlx_mlx::Array)> { - // Splice per-image token blocks (`` + N×image-token + ``) in after - // the prompt's leading BOS token so the image conditions the whole turn - // causally. Shared between archs; only the token ids + soft counts differ. - #[allow( - clippy::indexing_slicing, - reason = "bounds established by construction: buffer sized at init, loop indices bounded by slice length, or validated before call" - )] - fn splice(prompt_tokens: &[u32], blocks: &[Vec]) -> Vec { - let total: usize = blocks.iter().map(Vec::len).sum(); - let insert_at = usize::from(!prompt_tokens.is_empty()); - let mut aug = Vec::with_capacity(prompt_tokens.len() + total); - aug.extend_from_slice(&prompt_tokens[..insert_at]); - for b in blocks { - aug.extend_from_slice(b); - } - aug.extend_from_slice(&prompt_tokens[insert_at..]); - aug - } - match vb { VisionBundle::Gemma4 { vision, @@ -126,7 +162,7 @@ pub(crate) fn build_image_prompt( b }) .collect(); - let aug_ids = splice(prompt_tokens, &blocks); + let aug_ids = splice_image_block(prompt_tokens, &blocks, &GEMMA4_USER_TURN_OPENER); let total_soft: usize = pixels.iter().map(|p| p.num_soft_tokens).sum(); let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count(); @@ -185,7 +221,7 @@ pub(crate) fn build_image_prompt( b }) .collect(); - let aug_ids = splice(prompt_tokens, &blocks); + let aug_ids = splice_image_block(prompt_tokens, &blocks, &GEMMA4_USER_TURN_OPENER); let total_soft: usize = pixels.iter().map(|(_, n)| *n).sum(); let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count(); @@ -240,7 +276,9 @@ pub(crate) fn build_image_prompt( b }) .collect(); - let aug_ids = splice(prompt_tokens, &blocks); + // Gemma3's user-turn opener ids are not verified against a snapshot + // here, so keep the after-BOS fallback (empty opener) for it. + let aug_ids = splice_image_block(prompt_tokens, &blocks, &[]); let total_soft: usize = pixels.iter().map(|p| p.num_soft_tokens).sum(); let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count(); @@ -274,6 +312,11 @@ pub(crate) const QWEN3VL_VISION_END_ID: u32 = 151_653; /// Qwen3-VL-MoE `<|image_pad|>` token id (the scatter target). pub(crate) const QWEN3VL_IMAGE_PAD_ID: u32 = 151_655; +/// Qwen3-VL-MoE user-turn opener `<|im_start|>user\n` (token ids 151644, 872, 198). +/// The vision block splices in right after this, matching the mlx-vlm prompt format +/// `<|im_start|>user\n<|vision_start|>...<|vision_end|><|im_end|>`. +pub(crate) const QWEN3VL_USER_TURN_OPENER: [u32; 3] = [151_644, 872, 198]; + /// full Qwen3-VL-MoE image request: preprocess images, run the ViT, /// splice the per-image vision block (`<|vision_start|>` + N×`<|image_pad|>` + /// `<|vision_end|>`) after the prompt's leading token, then decode via the @@ -451,11 +494,11 @@ pub(crate) fn run_qwen3vl_image( // Splice the vision block `<|vision_start|>` + N×`<|image_pad|>` + // `<|vision_end|>` immediately after the user-turn opener `<|im_start|>user\n` - // (token ids 151644, 872, 198) and before the user text — matching the - // mlx-vlm Qwen3-VL prompt format - // `<|im_start|>user\n<|vision_start|>...<|vision_end|><|im_end|>`. If - // that opener is not found (non-standard template), fall back to inserting - // after the leading token. + // — matching the mlx-vlm Qwen3-VL prompt format + // `<|im_start|>user\n<|vision_start|>...<|vision_end|><|im_end|>`. The + // last opener match is used so a multi-turn prompt splices into the final user + // turn. If the opener is not found (non-standard template), falls back to + // inserting after the leading token. let mut block = Vec::with_capacity(pv.num_soft_tokens + 2); block.push(QWEN3VL_VISION_START_ID); block.extend(std::iter::repeat_n( @@ -464,17 +507,7 @@ pub(crate) fn run_qwen3vl_image( )); block.push(QWEN3VL_VISION_END_ID); - const IM_START: u32 = 151_644; - const USER: u32 = 872; - const NL: u32 = 198; - let insert_at = prompt_tokens - .windows(3) - .position(|w| w == [IM_START, USER, NL]) - .map_or(usize::from(!prompt_tokens.is_empty()), |p| p + 3); - let mut aug_ids = Vec::with_capacity(prompt_tokens.len() + block.len()); - aug_ids.extend_from_slice(&prompt_tokens[..insert_at]); - aug_ids.extend_from_slice(&block); - aug_ids.extend_from_slice(&prompt_tokens[insert_at..]); + let aug_ids = splice_image_block(prompt_tokens, &[block], &QWEN3VL_USER_TURN_OPENER); let in_prompt = aug_ids .iter() @@ -514,3 +547,7 @@ pub(crate) fn rmlx_server_load_image(source: &str) -> rmlx_core::Result> crate::image_io::load_image(source, std::time::Duration::from_secs(20)) .map_err(|e| Error::Other(format!("image load failed: {e}"))) } + +#[cfg(test)] +#[path = "image_tests.rs"] +mod image_tests; diff --git a/crates/rmlx-server/src/engine/image_tests.rs b/crates/rmlx-server/src/engine/image_tests.rs new file mode 100644 index 0000000..f567f99 --- /dev/null +++ b/crates/rmlx-server/src/engine/image_tests.rs @@ -0,0 +1,174 @@ +//! Image-prompt splice-placement tests. +//! +//! These cover [`splice_image_block`] — the token-level placement of the +//! per-image ` + N×image + ` block. The end-to-end vision decode is +//! exercised by the real-model integration path; here we assert the block lands +//! *inside* the user turn (the bug this logic fixes was inserting it before the +//! turn, leaving the image outside the question). +#![allow( + clippy::indexing_slicing, + reason = "test fixtures: index/slice bounds are fixed by the literal prompt arrays constructed in each test" +)] + +use super::{splice_image_block, GEMMA4_USER_TURN_OPENER, QWEN3VL_USER_TURN_OPENER}; + +/// A minimal image block placeholder (real blocks are ` + N×img + `; +/// the placement logic is independent of the block contents). +fn block(marker: u32, n: usize) -> Vec { + let mut b = vec![900]; + b.extend(std::iter::repeat_n(marker, n)); + b.push(901); + b +} + +#[test] +fn gemma4_block_inserts_inside_user_turn() { + // <|turn> user \n "hi" \n + let bos = 2u32; + let opener = GEMMA4_USER_TURN_OPENER; // [105, 2364, 107] + let prompt = [bos, opener[0], opener[1], opener[2], 555, 106, 107]; + let blk = block(258_880, 3); + let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &opener); + + // The block must appear AFTER the opener and BEFORE the user text token 555. + let opener_end = 4; // bos + 3 opener tokens + assert_eq!( + &aug[..opener_end], + &prompt[..opener_end], + "prefix preserved" + ); + assert_eq!( + &aug[opener_end..opener_end + blk.len()], + blk.as_slice(), + "block spliced right after the user-turn opener" + ); + assert_eq!( + aug[opener_end + blk.len()], + 555, + "user text follows the block (image is inside the turn, before the text)" + ); +} + +#[test] +fn gemma4_block_uses_last_user_turn_in_multi_turn() { + let bos = 2u32; + let o = GEMMA4_USER_TURN_OPENER; + // two user turns; image accompanies the second (last) one. + let prompt = [ + bos, o[0], o[1], o[2], 10, 106, 107, // turn 1: user "10" + o[0], o[1], o[2], 20, 106, 107, // turn 2: user "20" + ]; + let blk = block(258_880, 2); + let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &o); + + // Insert position is after the SECOND opener (index 7 + 3 = 10). + let second_opener_end = 10; + assert_eq!( + &aug[second_opener_end..second_opener_end + blk.len()], + blk.as_slice(), + "block spliced into the final user turn, not the first" + ); + assert_eq!( + aug[second_opener_end + blk.len()], + 20, + "second turn text follows" + ); + // The first turn must be untouched (no block before token 10). + assert_eq!(&aug[..second_opener_end], &prompt[..second_opener_end]); +} + +#[test] +fn falls_back_to_after_bos_when_opener_absent() { + let bos = 2u32; + let prompt = [bos, 500, 501, 502]; + let blk = block(258_880, 2); + // Empty opener forces the after-BOS fallback (e.g. the Gemma3 path). + let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &[]); + assert_eq!(aug[0], bos, "BOS stays first"); + assert_eq!( + &aug[1..=blk.len()], + blk.as_slice(), + "block spliced right after BOS" + ); + assert_eq!( + &aug[1 + blk.len()..], + &prompt[1..], + "rest of prompt preserved" + ); + + // A non-empty opener that is not present also falls back to after-BOS. + let aug2 = splice_image_block( + &prompt, + std::slice::from_ref(&blk), + &GEMMA4_USER_TURN_OPENER, + ); + assert_eq!( + &aug2[1..=blk.len()], + blk.as_slice(), + "opener absent → after-BOS" + ); +} + +#[test] +fn empty_prompt_inserts_block_at_front() { + let blk = block(258_880, 1); + let aug = splice_image_block(&[], std::slice::from_ref(&blk), &GEMMA4_USER_TURN_OPENER); + assert_eq!(aug, blk, "empty prompt → block is the whole sequence"); +} + +#[test] +fn multi_image_blocks_appear_in_order_and_contiguous() { + // Two blocks passed together must land in-order and contiguous at the splice + // point — no interleaving, no reversal. + let bos = 2u32; + let o = GEMMA4_USER_TURN_OPENER; + let prompt = [bos, o[0], o[1], o[2], 555, 106, 107]; + let blk1 = block(111, 2); // first image block + let blk2 = block(222, 3); // second image block + let aug = splice_image_block(&prompt, &[blk1.clone(), blk2.clone()], &o); + + let opener_end = 4usize; // bos + 3 opener tokens + // Both blocks appear right after the opener, blk1 first, blk2 immediately after. + assert_eq!( + &aug[opener_end..opener_end + blk1.len()], + blk1.as_slice(), + "first block lands first" + ); + assert_eq!( + &aug[opener_end + blk1.len()..opener_end + blk1.len() + blk2.len()], + blk2.as_slice(), + "second block immediately follows first" + ); + // User text token 555 follows both blocks without any prompt tokens in between. + assert_eq!( + aug[opener_end + blk1.len() + blk2.len()], + 555, + "user text follows both blocks" + ); + // Total length is consistent. + assert_eq!( + aug.len(), + prompt.len() + blk1.len() + blk2.len(), + "length adds up" + ); +} + +#[test] +fn qwen3vl_single_turn_splice_matches_last_opener() { + // Verify that routing Qwen3-VL through splice_image_block (last-match) + // produces the same insert position as the former first-match code for a + // single-turn prompt (first == last when there is only one opener). + let bos = 1u32; + let o = QWEN3VL_USER_TURN_OPENER; // [151644, 872, 198] + let prompt = [bos, o[0], o[1], o[2], 999, 151_645]; + let blk = block(151_655, 4); + let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &o); + + let opener_end = 4usize; // bos + 3 opener tokens + assert_eq!( + &aug[opener_end..opener_end + blk.len()], + blk.as_slice(), + "vision block lands right after the user-turn opener" + ); + assert_eq!(aug[opener_end + blk.len()], 999, "user text follows"); +} diff --git a/docs/MODELS.md b/docs/MODELS.md index ca57906..c601eb2 100644 --- a/docs/MODELS.md +++ b/docs/MODELS.md @@ -697,6 +697,18 @@ are f32) — adopt / restore the operand dtype, mirroring mlx-lm's weak-typed Python floats. A unit-level dtype-lock test guards this against regression; see docs/KV_QUANT.md "Gemma4 global `--kv-quant none` KV is bf16". +**Image-prompt placement (inside the user turn).** The per-image block +(`` + N × `` + ``) is spliced **inside** the user +turn — immediately after the user-turn opener `<|turn>user\n` (token ids +`[105, 2364, 107]`) — matching the HF/mlx-vlm processor, which substitutes the +`<|image|>` placeholder in place within the user message. Placing the block +before the turn (after BOS) leaves the image outside the question and makes the +model frequently fail to ground on it (image-independent, repetitive output — +e.g. "you have not provided an image"); the effect is prompt-phrasing-dependent +and hits low-precision QAT-fp4 vision snapshots hardest. The last user-turn +opener is used so multi-turn prompts splice into the final user turn. When the +opener is absent (non-standard template) the splice falls back to after-BOS. + **Conformer audio encoder + native audio input.** Present in e4b and above. The SSCP subsampling (two conv stages reducing the time dimension by 4×), Macaron-style FFW blocks with `residual_weight=0.5`, chunked local attention, and