From 5da3fd2312542ee0e16e21ab82dbe4a238173e1f Mon Sep 17 00:00:00 2001
From: aresnow1 <109642806+aresnow1@users.noreply.github.com>
Date: Fri, 22 Dec 2023 14:21:48 +0800
Subject: [PATCH] FEAT: Support mistral instruct v0.2 (#796)
---
README.md | 4 +-
README_zh_CN.md | 4 +-
doc/source/models/builtin/llm/index.rst | 4 +-
.../builtin/llm/mistral-instruct-v0.2.rst | 43 ++++++++++++++
doc/source/models/builtin/llm/qwen-chat.rst | 26 ++++-----
xinference/model/llm/llm_family.json | 58 +++++++++++++++++++
.../model/llm/llm_family_modelscope.json | 48 +++++++++++++++
7 files changed, 169 insertions(+), 18 deletions(-)
create mode 100644 doc/source/models/builtin/llm/mistral-instruct-v0.2.rst
diff --git a/README.md b/README.md
index 7dad5122e6..3b5cd092a3 100644
--- a/README.md
+++ b/README.md
@@ -32,12 +32,12 @@ potential of cutting-edge AI models.
- Speculative decoding: [#509](https://github.com/xorbitsai/inference/pull/509)
- Incorporate vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
### New Models
+- Built-in support for [mistral-instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2): [#796](https://github.com/xorbitsai/inference/pull/796)
+- Built-in support for [deepseek-llm](https://huggingface.co/deepseek-ai) and [deepseek-coder](https://huggingface.co/deepseek-ai): [#786](https://github.com/xorbitsai/inference/pull/786)
- Built-in support for [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): [#782](https://github.com/xorbitsai/inference/pull/782)
- Built-in support for [OpenHermes 2.5](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B): [#776](https://github.com/xorbitsai/inference/pull/776)
- Built-in support for [Yi](https://huggingface.co/01-ai): [#629](https://github.com/xorbitsai/inference/pull/629)
- Built-in support for [zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta): [#597](https://github.com/xorbitsai/inference/pull/597)
-- Built-in support for [chatglm3](https://huggingface.co/THUDM/chatglm3-6b): [#587](https://github.com/xorbitsai/inference/pull/587)
-- Built-in support for [mistral-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [mistral-instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1): [#510](https://github.com/xorbitsai/inference/pull/510)
### Integrations
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
- [Chatbox](https://chatboxai.app/): a desktop client for multiple cutting-edge LLM models, available on Windows, Mac and Linux.
diff --git a/README_zh_CN.md b/README_zh_CN.md
index d519672a86..0db3e24102 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -30,12 +30,12 @@ Xorbits Inference (Xinference) is a powerful and feature-rich distributed
- Speculative decoding: [#509](https://github.com/xorbitsai/inference/pull/509)
- Incorporate vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
### New Models
+- Built-in support for [mistral-instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2): [#796](https://github.com/xorbitsai/inference/pull/796)
+- Built-in support for [deepseek-llm](https://huggingface.co/deepseek-ai) and [deepseek-coder](https://huggingface.co/deepseek-ai): [#786](https://github.com/xorbitsai/inference/pull/786)
- Built-in support for [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): [#782](https://github.com/xorbitsai/inference/pull/782)
- Built-in support for [OpenHermes 2.5](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B): [#776](https://github.com/xorbitsai/inference/pull/776)
- Built-in support for [Yi](https://huggingface.co/01-ai): [#629](https://github.com/xorbitsai/inference/pull/629)
- Built-in support for [zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta): [#597](https://github.com/xorbitsai/inference/pull/597)
-- Built-in support for [chatglm3](https://huggingface.co/THUDM/chatglm3-6b): [#587](https://github.com/xorbitsai/inference/pull/587)
-- Built-in support for [mistral-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [mistral-instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1): [#510](https://github.com/xorbitsai/inference/pull/510)
### Integrations
- [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform covering the development, deployment, maintenance, and optimization of large language models.
- [Chatbox](https://chatboxai.app/): a desktop client for cutting-edge LLMs, available on Windows, Mac, and Linux.
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 26b4837205..4f0f57c294 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -46,7 +46,7 @@ The following is a list of built-in LLM in Xinference:
glaive-coder
gorilla-openfunctions-v1
-
+
gpt-2
internlm-20b
@@ -63,6 +63,8 @@ The following is a list of built-in LLM in Xinference:
mistral-instruct-v0.1
+ mistral-instruct-v0.2
+
mistral-v0.1
mixtral-instruct-v0.1
diff --git a/doc/source/models/builtin/llm/mistral-instruct-v0.2.rst b/doc/source/models/builtin/llm/mistral-instruct-v0.2.rst
new file mode 100644
index 0000000000..cc3a856b15
--- /dev/null
+++ b/doc/source/models/builtin/llm/mistral-instruct-v0.2.rst
@@ -0,0 +1,43 @@
+.. _models_llm_mistral-instruct-v0.2:
+
+========================================
+mistral-instruct-v0.2
+========================================
+
+- **Context Length:** 8192
+- **Model Name:** mistral-instruct-v0.2
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** mistralai/Mistral-7B-Instruct-v0.2
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-name mistral-instruct-v0.2 --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (ggufv2, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 7
+- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0
+- **Model ID:** TheBloke/Mistral-7B-Instruct-v0.2-GGUF
+
+Execute the following command to launch the model. Remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+ xinference launch --model-name mistral-instruct-v0.2 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+
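The new doc page drives everything through the ``xinference launch`` CLI. As a minimal
client-side sketch (illustrative, not part of this patch): assuming a Xinference server
is already running at ``http://localhost:9997``, the same launch can be issued from
Python through the public ``xinference.client.Client`` API::

    from xinference.client import Client

    client = Client("http://localhost:9997")

    # Launch Model Spec 1 (pytorch, unquantized); swap model_format and
    # quantization to match any other spec listed on the doc page.
    model_uid = client.launch_model(
        model_name="mistral-instruct-v0.2",
        model_format="pytorch",
        model_size_in_billions=7,
        quantization="none",
    )

    # The model declares the "chat" ability, so its handle exposes chat().
    model = client.get_model(model_uid)
    print(model.chat("What is new in Mistral-7B-Instruct-v0.2?"))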
diff --git a/doc/source/models/builtin/llm/qwen-chat.rst b/doc/source/models/builtin/llm/qwen-chat.rst
index 80856794e2..72ba74340f 100644
--- a/doc/source/models/builtin/llm/qwen-chat.rst
+++ b/doc/source/models/builtin/llm/qwen-chat.rst
@@ -14,32 +14,32 @@ Specifications
^^^^^^^^^^^^^^
-Model Spec 1 (ggmlv3, 7 Billion)
+Model Spec 1 (ggufv2, 7 Billion)
++++++++++++++++++++++++++++++++++++++++
-- **Model Format:** ggmlv3
+- **Model Format:** ggufv2
- **Model Size (in billions):** 7
-- **Quantizations:** q4_0
-- **Model ID:** Xorbits/qwen-chat-7B-ggml
+- **Quantizations:** Q4_K_M
+- **Model ID:** Xorbits/Qwen-7B-Chat-GGUF
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
- xinference launch --model-name qwen-chat --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization}
+ xinference launch --model-name qwen-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
-Model Spec 2 (ggmlv3, 14 Billion)
+Model Spec 2 (ggufv2, 14 Billion)
++++++++++++++++++++++++++++++++++++++++
-- **Model Format:** ggmlv3
+- **Model Format:** ggufv2
- **Model Size (in billions):** 14
-- **Quantizations:** q4_0
-- **Model ID:** Xorbits/qwen-chat-14B-ggml
+- **Quantizations:** Q4_K_M
+- **Model ID:** Xorbits/Qwen-14B-Chat-GGUF
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
chosen quantization method from the options listed above::
- xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggmlv3 --quantization ${quantization}
+ xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggufv2 --quantization ${quantization}
Model Spec 3 (pytorch, 1_8 Billion)
@@ -47,7 +47,7 @@ Model Spec 3 (pytorch, 1_8 Billion)
- **Model Format:** pytorch
- **Model Size (in billions):** 1_8
-- **Quantizations:** 4-bit, 8-bit, none
+- **Quantizations:** none
- **Model ID:** Qwen/Qwen-1_8B-Chat
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
@@ -61,7 +61,7 @@ Model Spec 4 (pytorch, 7 Billion)
- **Model Format:** pytorch
- **Model Size (in billions):** 7
-- **Quantizations:** 4-bit, 8-bit, none
+- **Quantizations:** none
- **Model ID:** Qwen/Qwen-7B-Chat
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
@@ -89,7 +89,7 @@ Model Spec 6 (pytorch, 72 Billion)
- **Model Format:** pytorch
- **Model Size (in billions):** 72
-- **Quantizations:** 4-bit, 8-bit, none
+- **Quantizations:** none
- **Model ID:** Qwen/Qwen-72B-Chat
Execute the following command to launch the model, remember to replace ``${quantization}`` with your
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 1f0c67a13f..86108503c0 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -2062,6 +2062,64 @@
]
}
},
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "mistral-instruct-v0.2",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
+ "model_revision": "b70aa86578567ba3301b21c8a27bea4e8f6d6d61"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Q2_K",
+ "Q3_K_S",
+ "Q3_K_M",
+ "Q3_K_L",
+ "Q4_0",
+ "Q4_K_S",
+ "Q4_K_M",
+ "Q5_0",
+ "Q5_K_S",
+ "Q5_K_M",
+ "Q6_K",
+ "Q8_0"
+ ],
+ "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
+ "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "NO_COLON_TWO",
+ "system_prompt": "[INST] <<SYS>>\nAn informative and inspiring conversation\n<</SYS>>\n\n",
+ "roles": [
+ "[INST]",
+ "[/INST]"
+ ],
+ "intra_message_sep": " ",
+ "inter_message_sep": " ",
+ "stop_token_ids": [
+ 2
+ ]
+ }
+ },
{
"version": 1,
"context_length": 8192,
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index bc579b3151..00107de1c7 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -1209,6 +1209,54 @@
]
}
},
+ {
+ "version": 1,
+ "context_length": 8192,
+ "model_name": "mistral-instruct-v0.2",
+ "model_lang": [
+ "en"
+ ],
+ "model_ability": [
+ "chat"
+ ],
+ "model_description": "The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.",
+ "model_specs": [
+ {
+ "model_format": "pytorch",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "4-bit",
+ "8-bit",
+ "none"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "AI-ModelScope/Mistral-7B-Instruct-v0.2"
+ },
+ {
+ "model_format": "ggufv2",
+ "model_size_in_billions": 7,
+ "quantizations": [
+ "Q4_K_M"
+ ],
+ "model_hub": "modelscope",
+ "model_id": "Xorbits/Mistral-7B-Instruct-v0.2-GGUF",
+ "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf"
+ }
+ ],
+ "prompt_style": {
+ "style_name": "NO_COLON_TWO",
+ "system_prompt": "[INST] <<SYS>>\nAn informative and inspiring conversation\n<</SYS>>\n\n",
+ "roles": [
+ "[INST]",
+ "[/INST]"
+ ],
+ "intra_message_sep": " ",
+ "inter_message_sep": " ",
+ "stop_token_ids": [
+ 2
+ ]
+ }
+ },
{
"version": 1,
"context_length": 2048,