mlx-vlm #73

@mneedham

Description

I tried this example: https://github.com/Blaizzy/mlx-vlm/blob/main/examples/text_extraction.ipynb and hit the following error when running the "Generate text" cell:

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
Cell In[11], line 2
      1 # Generate text
----> 2 qwen_vl_output = generate(
      3     qwen_vl_model,
      4     qwen_vl_processor,
      5     image,
      6     prompt,
      7     max_tokens=1000,
      8     temperature=0.7,
      9 )

File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/utils.py:1076, in generate(model, processor, image, prompt, image_processor, temp, max_tokens, verbose, formatter, repetition_penalty, repetition_context_size, top_p, **kwargs)
   1063 # Generate tokens
   1064 generator = generate_step(
   1065     input_ids,
   1066     model,
   (...)
   1073     **kwargs,
   1074 )
-> 1076 for (token, prob), n in zip(generator, range(max_tokens)):
   1078     if n == 0:
   1079         prompt_time = time.perf_counter() - tic

File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/utils.py:915, in generate_step(input_ids, model, pixel_values, mask, temp, repetition_penalty, repetition_context_size, top_p, logit_bias, **kwargs)
    912             repetition_context = repetition_context[-repetition_context_size:]
    913     return y, logprobs.squeeze(0)
--> 915 outputs = model(input_ids, pixel_values, cache=cache, mask=mask, **kwargs)
    916 if outputs.cross_attention_states is not None:
    917     kwargs = {
    918         k: v
    919         for k, v in zip(
    920             ["cross_attention_states"], [outputs.cross_attention_states]
    921         )
    922     }

File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/models/qwen2_vl/qwen2_vl.py:101, in Model.__call__(self, input_ids, pixel_values, mask, cache, **kwargs)
     99 image_grid_thw = kwargs.pop("image_grid_thw", None)
    100 image_grid_thw = mx.array(image_grid_thw)
--> 101 input_embddings = self.get_input_embeddings(
    102     input_ids, pixel_values, image_grid_thw
    103 )
    105 logits = self.language_model(None, cache=cache, inputs_embeds=input_embddings)
    106 return logits

File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/models/qwen2_vl/qwen2_vl.py:67, in Model.get_input_embeddings(self, input_ids, pixel_values, image_grid_thw)
     64     hidden_states = hidden_states[None, :, :]
     66 # Insert special image tokens in the input_ids
---> 67 final_inputs_embeds = self._merge_input_ids_with_image_features(
     68     hidden_states, inputs_embeds, input_ids
     69 )
     70 return final_inputs_embeds

File ~/.cache/uv/archive-v0/D_l11-vdM1TvblTHtCbVG/lib/python3.10/site-packages/mlx_vlm/models/qwen2_vl/qwen2_vl.py:82, in Model._merge_input_ids_with_image_features(self, image_features, inputs_embeds, input_ids)
     80 image_features = image_features.astype(mx.float32)
     81 pad_size = inputs_embeds.shape[1] - image_features.shape[1]
---> 82 image_features = mx.pad(image_features, ((0, 0), (0, pad_size), (0, 0)))
     83 inputs_embeds = mx.where(
     84     image_positions[:, :, None], image_features, inputs_embeds
     85 )
     87 # TODO: Add video features

ValueError: Invalid high padding size (-1776) passed to pad for axis 1. Padding sizes must be non-negative
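For anyone hitting the same thing, here is a minimal sketch (with made-up shapes) of the step that fails inside `_merge_input_ids_with_image_features`: when the vision tower emits more image tokens than the tokenized prompt has positions, `pad_size` goes negative and `mx.pad` rejects it with exactly this error. The shapes below are hypothetical, chosen only to reproduce the -1776 in the message.

```python
import mlx.core as mx

# Hypothetical shapes illustrating the failure mode from the traceback above.
inputs_embeds = mx.zeros((1, 100, 1536))    # 100 prompt positions (made up)
image_features = mx.zeros((1, 1876, 1536))  # 1876 image tokens (made up)

# Same computation as in qwen2_vl.py: pad image features up to the prompt length.
pad_size = inputs_embeds.shape[1] - image_features.shape[1]  # -1776

# Raises: ValueError: Invalid high padding size (-1776) passed to pad for
# axis 1. Padding sizes must be non-negative
image_features = mx.pad(image_features, ((0, 0), (0, pad_size), (0, 0)))
```

So the prompt seems to end up with fewer positions than image placeholder tokens, which suggests the image tokens are not being expanded in the prompt before the merge.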
