-
Notifications
You must be signed in to change notification settings - Fork 51
Open
Description
I tried this example: https://github.com/Blaizzy/mlx-vlm/blob/main/examples/text_extraction.ipynb — generation fails with a `ValueError` from `mx.pad` inside `_merge_input_ids_with_image_features` (Qwen2-VL): the computed padding size for axis 1 is negative (-1776), i.e. the image feature sequence is longer than the input embedding sequence. Full traceback below:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
Cell In[11], line 2
1 # Generate text
----> 2 qwen_vl_output = generate(
3 qwen_vl_model,
4 qwen_vl_processor,
5 image,
6 prompt,
7 max_tokens=1000,
8 temperature=0.7,
9 )
File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/utils.py:1076, in generate(model, processor, image, prompt, image_processor, temp, max_tokens, verbose, formatter, repetition_penalty, repetition_context_size, top_p, **kwargs)
1063 # Generate tokens
1064 generator = generate_step(
1065 input_ids,
1066 model,
(...)
1073 **kwargs,
1074 )
-> 1076 for (token, prob), n in zip(generator, range(max_tokens)):
1078 if n == 0:
1079 prompt_time = time.perf_counter() - tic
File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/utils.py:915, in generate_step(input_ids, model, pixel_values, mask, temp, repetition_penalty, repetition_context_size, top_p, logit_bias, **kwargs)
912 repetition_context = repetition_context[-repetition_context_size:]
913 return y, logprobs.squeeze(0)
--> 915 outputs = model(input_ids, pixel_values, cache=cache, mask=mask, **kwargs)
916 if outputs.cross_attention_states is not None:
917 kwargs = {
918 k: v
919 for k, v in zip(
920 ["cross_attention_states"], [outputs.cross_attention_states]
921 )
922 }
File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/models/qwen2_vl/qwen2_vl.py:101, in Model.__call__(self, input_ids, pixel_values, mask, cache, **kwargs)
99 image_grid_thw = kwargs.pop("image_grid_thw", None)
100 image_grid_thw = mx.array(image_grid_thw)
--> 101 input_embddings = self.get_input_embeddings(
102 input_ids, pixel_values, image_grid_thw
103 )
105 logits = self.language_model(None, cache=cache, inputs_embeds=input_embddings)
106 return logits
File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/models/qwen2_vl/qwen2_vl.py:67, in Model.get_input_embeddings(self, input_ids, pixel_values, image_grid_thw)
64 hidden_states = hidden_states[None, :, :]
66 # Insert special image tokens in the input_ids
---> 67 final_inputs_embeds = self._merge_input_ids_with_image_features(
68 hidden_states, inputs_embeds, input_ids
69 )
70 return final_inputs_embeds
File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/models/qwen2_vl/qwen2_vl.py:82, in Model._merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids)
80 image_features = image_features.astype(mx.float32)
81 pad_size = inputs_embeds.shape[1] - image_features.shape[1]
---> 82 image_features = mx.pad(image_features, ((0, 0), (0, pad_size), (0, 0)))
83 inputs_embeds = mx.where(
84 image_positions[:, :, None], image_features, inputs_embeds
85 )
87 # TODO: Add video features
ValueError: Invalid high padding size (-1776) passed to pad for axis 1. Padding sizes must be non-negative
Reactions are currently unavailable
Metadata
Metadata
Assignees
Labels
No labels