Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
113 changes: 75 additions & 38 deletions crates/rmlx-server/src/engine/image.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,61 @@ use rmlx_core::Error;
pub(crate) use rmlx_models::gemma4::BOI_TOKEN_ID as GEMMA4_BOI_TOKEN_ID;
pub(crate) use rmlx_models::gemma4::EOI_TOKEN_ID as GEMMA4_EOI_TOKEN_ID;

/// Gemma4 user-turn opener `<|turn>user\n` (token ids `<|turn>`=105, `user`=2364,
/// `\n`=107). The image block splices in right after this so the image lands
/// *inside* the user turn it belongs to — matching the HF/mlx-vlm processor,
/// which substitutes the `<|image|>` placeholder in place within the user
/// message rather than before the turn.
pub(crate) const GEMMA4_USER_TURN_OPENER: [u32; 3] = [105, 2364, 107];

/// Splice per-image token blocks (`<boi>` + N×image-token + `<eoi>`) into the
/// prompt so the image conditions the turn it belongs to.
///
/// The image content item is part of the user message, so the block must land
/// *inside* the user turn — right after `turn_opener` (the per-arch user-turn
/// opener token sequence, e.g. [`GEMMA4_USER_TURN_OPENER`]). This mirrors the
/// HF / mlx-vlm processor, which substitutes the `<|image|>` placeholder in
/// place. Inserting the block *before* the user turn (after BOS) leaves the
/// image outside the question and makes the model frequently fail to ground on
/// it — image-independent, repetitive output.
///
/// The **last** opener match is used so a multi-turn prompt splices into the
/// final user turn (the one the image accompanies), not an earlier one. When
/// `turn_opener` is empty or not found (non-standard template), fall back to
/// inserting after the prompt's leading BOS token.
#[allow(
clippy::indexing_slicing,
reason = "insert_at is bounded by prompt_tokens.len(): it is either 0/1 (after-BOS) \
or a window position plus the matched opener length, all within bounds. \
The early-return to after_bos when turn_opener.is_empty() prevents windows(0) \
(which would panic); the turn_opener.len() > prompt_tokens.len() guard prevents \
the oversized-opener case where windows() would yield no results."
)]
pub(crate) fn splice_image_block(
prompt_tokens: &[u32],
blocks: &[Vec<u32>],
turn_opener: &[u32],
) -> Vec<u32> {
let total: usize = blocks.iter().map(Vec::len).sum();
let after_bos = usize::from(!prompt_tokens.is_empty());
let insert_at = if turn_opener.is_empty() || turn_opener.len() > prompt_tokens.len() {
after_bos
} else {
prompt_tokens
.windows(turn_opener.len())
.enumerate()
.rfind(|(_, w)| *w == turn_opener)
.map_or(after_bos, |(pos, _)| pos + turn_opener.len())
};
let mut aug = Vec::with_capacity(prompt_tokens.len() + total);
aug.extend_from_slice(&prompt_tokens[..insert_at]);
for b in blocks {
aug.extend_from_slice(b);
}
aug.extend_from_slice(&prompt_tokens[insert_at..]);
aug
}

/// bundle of the multimodal components needed to turn image bytes
/// into scattered `inputs_embeds`. Loaded once per model. One variant per
/// vision-capable architecture (Gemma4's custom SigLIP, Gemma3's standard SigLIP).
Expand Down Expand Up @@ -70,25 +125,6 @@ pub(crate) fn build_image_prompt(
mm_cache: Option<&rmlx_models::multimodal_cache::MultimodalCache>,
model_sig: u64,
) -> rmlx_core::Result<(Vec<u32>, rmlx_mlx::Array, rmlx_mlx::Array)> {
// Splice per-image token blocks (`<boi>` + N×image-token + `<eoi>`) in after
// the prompt's leading BOS token so the image conditions the whole turn
// causally. Shared between archs; only the token ids + soft counts differ.
#[allow(
clippy::indexing_slicing,
reason = "bounds established by construction: buffer sized at init, loop indices bounded by slice length, or validated before call"
)]
fn splice(prompt_tokens: &[u32], blocks: &[Vec<u32>]) -> Vec<u32> {
let total: usize = blocks.iter().map(Vec::len).sum();
let insert_at = usize::from(!prompt_tokens.is_empty());
let mut aug = Vec::with_capacity(prompt_tokens.len() + total);
aug.extend_from_slice(&prompt_tokens[..insert_at]);
for b in blocks {
aug.extend_from_slice(b);
}
aug.extend_from_slice(&prompt_tokens[insert_at..]);
aug
}

match vb {
VisionBundle::Gemma4 {
vision,
Expand Down Expand Up @@ -126,7 +162,7 @@ pub(crate) fn build_image_prompt(
b
})
.collect();
let aug_ids = splice(prompt_tokens, &blocks);
let aug_ids = splice_image_block(prompt_tokens, &blocks, &GEMMA4_USER_TURN_OPENER);

let total_soft: usize = pixels.iter().map(|p| p.num_soft_tokens).sum();
let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count();
Expand Down Expand Up @@ -185,7 +221,7 @@ pub(crate) fn build_image_prompt(
b
})
.collect();
let aug_ids = splice(prompt_tokens, &blocks);
let aug_ids = splice_image_block(prompt_tokens, &blocks, &GEMMA4_USER_TURN_OPENER);

let total_soft: usize = pixels.iter().map(|(_, n)| *n).sum();
let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count();
Expand Down Expand Up @@ -240,7 +276,9 @@ pub(crate) fn build_image_prompt(
b
})
.collect();
let aug_ids = splice(prompt_tokens, &blocks);
// Gemma3's user-turn opener ids are not verified against a snapshot
// here, so keep the after-BOS fallback (empty opener) for it.
let aug_ids = splice_image_block(prompt_tokens, &blocks, &[]);

let total_soft: usize = pixels.iter().map(|p| p.num_soft_tokens).sum();
let in_prompt = aug_ids.iter().filter(|&&t| t == image_token_id).count();
Expand Down Expand Up @@ -274,6 +312,11 @@ pub(crate) const QWEN3VL_VISION_END_ID: u32 = 151_653;
/// Qwen3-VL-MoE `<|image_pad|>` token id (the scatter target).
pub(crate) const QWEN3VL_IMAGE_PAD_ID: u32 = 151_655;

/// Qwen3-VL-MoE user-turn opener `<|im_start|>user\n` (token ids 151644, 872, 198).
/// The vision block splices in right after this, matching the mlx-vlm prompt format
/// `<|im_start|>user\n<|vision_start|>...<|vision_end|><text><|im_end|>`.
pub(crate) const QWEN3VL_USER_TURN_OPENER: [u32; 3] = [151_644, 872, 198];

/// full Qwen3-VL-MoE image request: preprocess images, run the ViT,
/// splice the per-image vision block (`<|vision_start|>` + N×`<|image_pad|>` +
/// `<|vision_end|>`) after the prompt's leading token, then decode via the
Expand Down Expand Up @@ -451,11 +494,11 @@ pub(crate) fn run_qwen3vl_image(

// Splice the vision block `<|vision_start|>` + N×`<|image_pad|>` +
// `<|vision_end|>` immediately after the user-turn opener `<|im_start|>user\n`
// (token ids 151644, 872, 198) and before the user text — matching the
// mlx-vlm Qwen3-VL prompt format
// `<|im_start|>user\n<|vision_start|>...<|vision_end|><text><|im_end|>`. If
// that opener is not found (non-standard template), fall back to inserting
// after the leading token.
// — matching the mlx-vlm Qwen3-VL prompt format
// `<|im_start|>user\n<|vision_start|>...<|vision_end|><text><|im_end|>`. The
// last opener match is used so a multi-turn prompt splices into the final user
// turn. If the opener is not found (non-standard template), falls back to
// inserting after the leading token.
let mut block = Vec::with_capacity(pv.num_soft_tokens + 2);
block.push(QWEN3VL_VISION_START_ID);
block.extend(std::iter::repeat_n(
Expand All @@ -464,17 +507,7 @@ pub(crate) fn run_qwen3vl_image(
));
block.push(QWEN3VL_VISION_END_ID);

const IM_START: u32 = 151_644;
const USER: u32 = 872;
const NL: u32 = 198;
let insert_at = prompt_tokens
.windows(3)
.position(|w| w == [IM_START, USER, NL])
.map_or(usize::from(!prompt_tokens.is_empty()), |p| p + 3);
let mut aug_ids = Vec::with_capacity(prompt_tokens.len() + block.len());
aug_ids.extend_from_slice(&prompt_tokens[..insert_at]);
aug_ids.extend_from_slice(&block);
aug_ids.extend_from_slice(&prompt_tokens[insert_at..]);
let aug_ids = splice_image_block(prompt_tokens, &[block], &QWEN3VL_USER_TURN_OPENER);

let in_prompt = aug_ids
.iter()
Expand Down Expand Up @@ -514,3 +547,7 @@ pub(crate) fn rmlx_server_load_image(source: &str) -> rmlx_core::Result<Vec<u8>>
crate::image_io::load_image(source, std::time::Duration::from_secs(20))
.map_err(|e| Error::Other(format!("image load failed: {e}")))
}

#[cfg(test)]
#[path = "image_tests.rs"]
mod image_tests;
174 changes: 174 additions & 0 deletions crates/rmlx-server/src/engine/image_tests.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,174 @@
//! Image-prompt splice-placement tests.
//!
//! These cover [`splice_image_block`] — the token-level placement of the
//! per-image `<boi> + N×image + <eoi>` block. The end-to-end vision decode is
//! exercised by the real-model integration path; here we assert the block lands
//! *inside* the user turn (the bug this logic fixes was inserting it before the
//! turn, leaving the image outside the question).
#![allow(
clippy::indexing_slicing,
reason = "test fixtures: index/slice bounds are fixed by the literal prompt arrays constructed in each test"
)]

use super::{splice_image_block, GEMMA4_USER_TURN_OPENER, QWEN3VL_USER_TURN_OPENER};

/// A minimal image block placeholder (real blocks are `<boi> + N×img + <eoi>`;
/// the placement logic is independent of the block contents).
fn block(marker: u32, n: usize) -> Vec<u32> {
let mut b = vec![900];
b.extend(std::iter::repeat_n(marker, n));
b.push(901);
b
}

#[test]
fn gemma4_block_inserts_inside_user_turn() {
// <bos> <|turn> user \n "hi" <turn|> \n
let bos = 2u32;
let opener = GEMMA4_USER_TURN_OPENER; // [105, 2364, 107]
let prompt = [bos, opener[0], opener[1], opener[2], 555, 106, 107];
let blk = block(258_880, 3);
let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &opener);

// The block must appear AFTER the opener and BEFORE the user text token 555.
let opener_end = 4; // bos + 3 opener tokens
assert_eq!(
&aug[..opener_end],
&prompt[..opener_end],
"prefix preserved"
);
assert_eq!(
&aug[opener_end..opener_end + blk.len()],
blk.as_slice(),
"block spliced right after the user-turn opener"
);
assert_eq!(
aug[opener_end + blk.len()],
555,
"user text follows the block (image is inside the turn, before the text)"
);
}

#[test]
fn gemma4_block_uses_last_user_turn_in_multi_turn() {
let bos = 2u32;
let o = GEMMA4_USER_TURN_OPENER;
// two user turns; image accompanies the second (last) one.
let prompt = [
bos, o[0], o[1], o[2], 10, 106, 107, // turn 1: user "10"
o[0], o[1], o[2], 20, 106, 107, // turn 2: user "20"
];
let blk = block(258_880, 2);
let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &o);

// Insert position is after the SECOND opener (index 7 + 3 = 10).
let second_opener_end = 10;
assert_eq!(
&aug[second_opener_end..second_opener_end + blk.len()],
blk.as_slice(),
"block spliced into the final user turn, not the first"
);
assert_eq!(
aug[second_opener_end + blk.len()],
20,
"second turn text follows"
);
// The first turn must be untouched (no block before token 10).
assert_eq!(&aug[..second_opener_end], &prompt[..second_opener_end]);
}

#[test]
fn falls_back_to_after_bos_when_opener_absent() {
let bos = 2u32;
let prompt = [bos, 500, 501, 502];
let blk = block(258_880, 2);
// Empty opener forces the after-BOS fallback (e.g. the Gemma3 path).
let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &[]);
assert_eq!(aug[0], bos, "BOS stays first");
assert_eq!(
&aug[1..=blk.len()],
blk.as_slice(),
"block spliced right after BOS"
);
assert_eq!(
&aug[1 + blk.len()..],
&prompt[1..],
"rest of prompt preserved"
);

// A non-empty opener that is not present also falls back to after-BOS.
let aug2 = splice_image_block(
&prompt,
std::slice::from_ref(&blk),
&GEMMA4_USER_TURN_OPENER,
);
assert_eq!(
&aug2[1..=blk.len()],
blk.as_slice(),
"opener absent → after-BOS"
);
}

#[test]
fn empty_prompt_inserts_block_at_front() {
let blk = block(258_880, 1);
let aug = splice_image_block(&[], std::slice::from_ref(&blk), &GEMMA4_USER_TURN_OPENER);
assert_eq!(aug, blk, "empty prompt → block is the whole sequence");
}

#[test]
fn multi_image_blocks_appear_in_order_and_contiguous() {
// Two blocks passed together must land in-order and contiguous at the splice
// point — no interleaving, no reversal.
let bos = 2u32;
let o = GEMMA4_USER_TURN_OPENER;
let prompt = [bos, o[0], o[1], o[2], 555, 106, 107];
let blk1 = block(111, 2); // first image block
let blk2 = block(222, 3); // second image block
let aug = splice_image_block(&prompt, &[blk1.clone(), blk2.clone()], &o);

let opener_end = 4usize; // bos + 3 opener tokens
// Both blocks appear right after the opener, blk1 first, blk2 immediately after.
assert_eq!(
&aug[opener_end..opener_end + blk1.len()],
blk1.as_slice(),
"first block lands first"
);
assert_eq!(
&aug[opener_end + blk1.len()..opener_end + blk1.len() + blk2.len()],
blk2.as_slice(),
"second block immediately follows first"
);
// User text token 555 follows both blocks without any prompt tokens in between.
assert_eq!(
aug[opener_end + blk1.len() + blk2.len()],
555,
"user text follows both blocks"
);
// Total length is consistent.
assert_eq!(
aug.len(),
prompt.len() + blk1.len() + blk2.len(),
"length adds up"
);
}

#[test]
fn qwen3vl_single_turn_splice_matches_last_opener() {
// Verify that routing Qwen3-VL through splice_image_block (last-match)
// produces the same insert position as the former first-match code for a
// single-turn prompt (first == last when there is only one opener).
let bos = 1u32;
let o = QWEN3VL_USER_TURN_OPENER; // [151644, 872, 198]
let prompt = [bos, o[0], o[1], o[2], 999, 151_645];
let blk = block(151_655, 4);
let aug = splice_image_block(&prompt, std::slice::from_ref(&blk), &o);

let opener_end = 4usize; // bos + 3 opener tokens
assert_eq!(
&aug[opener_end..opener_end + blk.len()],
blk.as_slice(),
"vision block lands right after the user-turn opener"
);
assert_eq!(aug[opener_end + blk.len()], 999, "user text follows");
}
12 changes: 12 additions & 0 deletions docs/MODELS.md
Original file line number Diff line number Diff line change
Expand Up @@ -697,6 +697,18 @@ are f32) — adopt / restore the operand dtype, mirroring mlx-lm's weak-typed
Python floats. A unit-level dtype-lock test guards this against regression; see
docs/KV_QUANT.md "Gemma4 global `--kv-quant none` KV is bf16".

**Image-prompt placement (inside the user turn).** The per-image block
(`<boi>` + N × `<image_soft_token>` + `<eoi>`) is spliced **inside** the user
turn — immediately after the user-turn opener `<|turn>user\n` (token ids
`[105, 2364, 107]`) — matching the HF/mlx-vlm processor, which substitutes the
`<|image|>` placeholder in place within the user message. Placing the block
before the turn (after BOS) leaves the image outside the question and makes the
model frequently fail to ground on it (image-independent, repetitive output —
e.g. "you have not provided an image"); the effect is prompt-phrasing-dependent
and hits low-precision QAT-fp4 vision snapshots hardest. The last user-turn
opener is used so multi-turn prompts splice into the final user turn. When the
opener is absent (non-standard template) the splice falls back to after-BOS.

**Conformer audio encoder + native audio input.** Present in e4b and above. The
SSCP subsampling (two conv stages reducing the time dimension by 4×),
Macaron-style FFW blocks with `residual_weight=0.5`, chunked local attention, and
Expand Down
Loading