FEAT: Support deepseek vl (#1175)
Co-authored-by: qinxuye <qinxuye@gmail.com>
codingl2k1 and qinxuye authored Mar 29, 2024
1 parent 833b27c commit f9392f7
Showing 40 changed files with 5,035 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/python.yaml
@@ -111,7 +111,10 @@ jobs:
fi
pip install "llama-cpp-python>=0.2.23"
pip install transformers
pip install attrdict
pip install "timm>=0.9.16"
pip install torch
pip install torchvision
pip install accelerate
pip install sentencepiece
pip install transformers_stream_generator
45 changes: 45 additions & 0 deletions doc/source/models/builtin/llm/deepseek-vl-chat.rst
@@ -0,0 +1,45 @@
.. _models_llm_deepseek-vl-chat:

========================================
deepseek-vl-chat
========================================

- **Context Length:** 4096
- **Model Name:** deepseek-vl-chat
- **Languages:** en, zh
- **Abilities:** chat, vision
- **Description:** DeepSeek-VL possesses general multimodal understanding capabilities, capable of processing logical diagrams, web pages, formula recognition, scientific literature, natural images, and embodied intelligence in complex scenarios.

Specifications
^^^^^^^^^^^^^^


Model Spec 1 (pytorch, 1_3 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 1_3
- **Quantizations:** none
- **Model ID:** deepseek-ai/deepseek-vl-1.3b-chat
- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat>`__

Execute the following command to launch the model, remembering to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name deepseek-vl-chat --size-in-billions 1_3 --model-format pytorch --quantization ${quantization}


Model Spec 2 (pytorch, 7 Billion)
++++++++++++++++++++++++++++++++++++++++

- **Model Format:** pytorch
- **Model Size (in billions):** 7
- **Quantizations:** none
- **Model ID:** deepseek-ai/deepseek-vl-7b-chat
- **Model Hubs**: `Hugging Face <https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat>`__, `ModelScope <https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat>`__

Execute the following command to launch the model, remembering to replace ``${quantization}`` with your
chosen quantization method from the options listed above::

xinference launch --model-name deepseek-vl-chat --size-in-billions 7 --model-format pytorch --quantization ${quantization}
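
After either spec is launched, the model can also be queried programmatically. The snippet
below is an illustrative sketch, not part of this commit: it goes through Xinference's
OpenAI-compatible API, and the endpoint address, the image URL, and the use of the model name
as the identifier are assumptions you may need to adjust for your deployment::

    import openai

    # Assumes a local Xinference server exposing its OpenAI-compatible API
    # at the default address; adjust base_url for your deployment.
    client = openai.OpenAI(api_key="not-needed", base_url="http://127.0.0.1:9997/v1")

    response = client.chat.completions.create(
        model="deepseek-vl-chat",  # or the model UID printed by `xinference launch`
        messages=[
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is shown in this image?"},
                    {"type": "image_url", "image_url": {"url": "https://example.com/cat.png"}},
                ],
            }
        ],
    )
    print(response.choices[0].message.content)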

7 changes: 7 additions & 0 deletions doc/source/models/builtin/llm/index.rst
@@ -96,6 +96,11 @@ The following is a list of built-in LLM in Xinference:
- 4096
- deepseek-coder-instruct is a model initialized from deepseek-coder-base and fine-tuned on 2B tokens of instruction data.

* - :ref:`deepseek-vl-chat <models_llm_deepseek-vl-chat>`
- chat, vision
- 4096
- DeepSeek-VL possesses general multimodal understanding capabilities, capable of processing logical diagrams, web pages, formula recognition, scientific literature, natural images, and embodied intelligence in complex scenarios.

* - :ref:`falcon <models_llm_falcon>`
- generate
- 2048
@@ -378,6 +383,8 @@

deepseek-coder-instruct

deepseek-vl-chat

falcon

falcon-instruct
7 changes: 7 additions & 0 deletions setup.cfg
@@ -99,7 +99,11 @@ all =
auto-gptq ; sys_platform!='darwin'
autoawq ; sys_platform!='darwin'
optimum
outlines==0.0.34 # sglang errored for outlines > 0.0.34
sglang[all] ; sys_platform=='linux'
attrdict # For deepseek VL
timm>=0.9.16 # For deepseek VL
torchvision # For deepseek VL
intel =
torch==2.1.0a0
intel_extension_for_pytorch==2.1.10+xpu
@@ -120,6 +124,9 @@ transformers =
auto-gptq
autoawq
optimum
attrdict # For deepseek VL
timm>=0.9.16 # For deepseek VL
torchvision # For deepseek VL
peft
vllm =
vllm>=0.2.6
2 changes: 2 additions & 0 deletions xinference/model/llm/__init__.py
@@ -54,6 +54,7 @@ def _install():
from .pytorch.baichuan import BaichuanPytorchChatModel
from .pytorch.chatglm import ChatglmPytorchChatModel
from .pytorch.core import PytorchChatModel, PytorchModel
from .pytorch.deepseek_vl import DeepSeekVLChatModel
from .pytorch.falcon import FalconPytorchChatModel, FalconPytorchModel
from .pytorch.internlm2 import Internlm2PytorchChatModel
from .pytorch.llama_2 import LlamaPytorchChatModel, LlamaPytorchModel
@@ -97,6 +98,7 @@ def _install():
QwenVLChatModel,
OmniLMMModel,
YiVLChatModel,
DeepSeekVLChatModel,
PytorchModel,
]
)
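
Appending ``DeepSeekVLChatModel`` to this registration list is what makes the new class a
candidate when a pytorch model is loaded. A rough, hypothetical sketch of the dispatch idea
follows; the function name and signature are illustrative, not code from this commit::

    # Illustrative only: walk the registered classes and let each one decide,
    # via its match() classmethod, whether it handles the requested model.
    def resolve_model_cls(registry, llm_family, llm_spec, quantization):
        for cls in registry:
            if cls.match(llm_family, llm_spec, quantization):
                return cls
        raise ValueError(f"no registered class matches {llm_family.model_name}")
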
47 changes: 47 additions & 0 deletions xinference/model/llm/llm_family.json
@@ -3354,6 +3354,53 @@
"stop": []
}
},
{
"version": 1,
"context_length": 4096,
"model_name": "deepseek-vl-chat",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat",
"vision"
],
"model_description": "DeepSeek-VL possesses general multimodal understanding capabilities, capable of processing logical diagrams, web pages, formula recognition, scientific literature, natural images, and embodied intelligence in complex scenarios.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": "1_3",
"quantizations": [
"none"
],
"model_id": "deepseek-ai/deepseek-vl-1.3b-chat",
"model_revision": "8f13a8e00dbdc381d614a9d29d61b07e8fe91b3f"
},
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"none"
],
"model_id": "deepseek-ai/deepseek-vl-7b-chat",
"model_revision": "6f16f00805f45b5249f709ce21820122eeb43556"
}
],
"prompt_style": {
"style_name": "DEEPSEEK_CHAT",
"system_prompt": "<|begin▁of▁sentence|>",
"roles": [
"User",
"Assistant"
],
"intra_message_sep": "\n\n",
"inter_message_sep": "<|end▁of▁sentence|>",
"stop": [
"<|end▁of▁sentence|>"
]
}
},
{
"version": 1,
"context_length": 4096,
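
The ``prompt_style`` block above is what turns a chat history into the raw string the model
sees. A rough illustration of how those fields could be combined follows; this is a standalone
sketch using the values from the JSON entry, not the formatting code Xinference actually ships::

    def render_deepseek_chat(history, user_input):
        # Values copied from the prompt_style entry above.
        system_prompt = "<|begin▁of▁sentence|>"
        roles = ("User", "Assistant")
        seps = ("\n\n", "<|end▁of▁sentence|>")  # intra_message_sep, inter_message_sep

        prompt = system_prompt
        for user_msg, assistant_msg in history:
            prompt += f"{roles[0]}: {user_msg}{seps[0]}"
            prompt += f"{roles[1]}: {assistant_msg}{seps[1]}"
        # Leave the assistant turn open; generation stops at the
        # "<|end▁of▁sentence|>" stop token listed above.
        prompt += f"{roles[0]}: {user_input}{seps[0]}{roles[1]}:"
        return prompt

    print(render_deepseek_chat([], "Describe this image."))
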
47 changes: 47 additions & 0 deletions xinference/model/llm/llm_family_modelscope.json
@@ -2031,6 +2031,53 @@
]
}
},
{
"version": 1,
"context_length": 4096,
"model_name": "deepseek-vl-chat",
"model_lang": [
"en",
"zh"
],
"model_ability": [
"chat",
"vision"
],
"model_description": "DeepSeek-VL possesses general multimodal understanding capabilities, capable of processing logical diagrams, web pages, formula recognition, scientific literature, natural images, and embodied intelligence in complex scenarios.",
"model_specs": [
{
"model_format": "pytorch",
"model_size_in_billions": "1_3",
"quantizations": [
"none"
],
"model_id": "deepseek-ai/deepseek-vl-1.3b-chat",
"model_hub": "modelscope"
},
{
"model_format": "pytorch",
"model_size_in_billions": 7,
"quantizations": [
"none"
],
"model_id": "deepseek-ai/deepseek-vl-7b-chat",
"model_hub": "modelscope"
}
],
"prompt_style": {
"style_name": "DEEPSEEK_CHAT",
"system_prompt": "<|begin▁of▁sentence|>",
"roles": [
"User",
"Assistant"
],
"intra_message_sep": "\n\n",
"inter_message_sep": "<|end▁of▁sentence|>",
"stop": [
"<|end▁of▁sentence|>"
]
}
},
{
"version": 1,
"context_length": 4096,
1 change: 1 addition & 0 deletions xinference/model/llm/pytorch/core.py
@@ -467,6 +467,7 @@ def match(
"qwen-vl-chat",
"OmniLMM",
"yi-vl-chat",
"deepseek-vl-chat",
]:
return False
if "chat" not in llm_family.model_ability:
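
Adding ``"deepseek-vl-chat"`` to this exclusion list makes the generic ``PytorchChatModel``
decline the model, so the dedicated ``DeepSeekVLChatModel`` registered above can claim it.
A hypothetical sketch of what such a dedicated ``match`` could look like; the class name and
attribute access below are assumptions for illustration, not the implementation in this commit::

    class DeepSeekVLChatModelSketch:
        @classmethod
        def match(cls, llm_family, llm_spec, quantization) -> bool:
            # llm_family / llm_spec attribute names here are assumptions.
            # Only claim the deepseek-vl family in pytorch format; everything
            # else falls through to the other registered model classes.
            if llm_spec.model_format != "pytorch":
                return False
            return "deepseek-vl" in llm_family.model_name.lower()
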