diff --git a/packages/tasks/src/tasks/image-text-to-text/data.ts b/packages/tasks/src/tasks/image-text-to-text/data.ts
index 41084651d..e26de6551 100644
--- a/packages/tasks/src/tasks/image-text-to-text/data.ts
+++ b/packages/tasks/src/tasks/image-text-to-text/data.ts
@@ -87,7 +87,7 @@ const taskData: TaskDataCustom = {
 	],
 	summary:
 		"Image-text-to-text models take in an image and text prompt and output text. These models are also called vision-language models, or VLMs. The difference from image-to-text models is that these models take an additional text input, not restricting the model to certain use cases like image captioning, and may also be trained to accept a conversation as input.",
-	widgetModels: ["microsoft/kosmos-2-patch14-224"],
+	widgetModels: ["meta-llama/Llama-3.2-11B-Vision-Instruct"],
 	youtubeId: "IoGaGfU1CIg",
 };
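
For context, the new default widget model accepts a conversation with interleaved image and text content, as the summary describes. A minimal sketch of querying it with the `@huggingface/inference` client from this repo, assuming a valid `HF_TOKEN` in the environment and that the model is reachable through the chat-completion API (the image URL is a placeholder):

```ts
import { HfInference } from "@huggingface/inference";

// Sketch only: HF_TOKEN and the image URL are assumptions, not part of this PR.
const hf = new HfInference(process.env.HF_TOKEN);

const out = await hf.chatCompletion({
	model: "meta-llama/Llama-3.2-11B-Vision-Instruct",
	messages: [
		{
			role: "user",
			// VLMs take a text prompt alongside one or more images.
			content: [
				{ type: "text", text: "Describe this image in one sentence." },
				{ type: "image_url", image_url: { url: "https://example.com/cat.png" } },
			],
		},
	],
	max_tokens: 128,
});

console.log(out.choices[0].message.content);
```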