Blaizzy · Blaizzy · Nov 28, 2024 · Nov 28, 2024 · Nov 28, 2024 · Nov 28, 2024
diff --git a/mlx_vlm/models/idefics3/idefics3.py b/mlx_vlm/models/idefics3/idefics3.py
@@ -22,9 +22,14 @@ class ModelConfig:
     vision_config: VisionConfig
     model_type: str
     ignore_index: int = -100
-    image_token_id: int = 128257
     vocab_size: int = 128259
     scale_factor: int = 2
+    image_token_id: int = 49153
+    image_token_index: Optional[int] = None
+
+    def __post_init__(self):
+        if self.image_token_index is None:
+            self.image_token_index = self.image_token_id
 
     @classmethod
     def from_dict(cls, params):
@@ -111,7 +116,7 @@ def get_input_embeddings(
         return final_inputs_embeds
 
     def _prepare_inputs_for_multimodal(self, image_features, inputs_embeds, input_ids):
-        image_token_index = self.config.image_token_id
+        image_token_index = self.config.image_token_index
         num_images, num_image_patches, embed_dim = image_features.shape
 
         # Positions of <image> tokens in input_ids, assuming batch size is 1

diff --git a/mlx_vlm/prompt_utils.py b/mlx_vlm/prompt_utils.py
@@ -24,7 +24,7 @@ def add_image_tokens(message, token_format):
             return message
         if role == "user" and not skip_image_token:
             if isinstance(message["content"], list):
-                if model_name == "pixtral":
+                if model_name in ["pixtral", "idefics3"]:
                     message["content"] = [{"type": "image"}] * num_images + message[
                         "content"
                     ]