From 5da3fd2312542ee0e16e21ab82dbe4a238173e1f Mon Sep 17 00:00:00 2001
From: aresnow1 <109642806+aresnow1@users.noreply.github.com>
Date: Fri, 22 Dec 2023 14:21:48 +0800
Subject: [PATCH] FEAT: Support mistral instruct v0.2 (#796)

---
 README.md                                     |  4 +-
 README_zh_CN.md                               |  4 +-
 doc/source/models/builtin/llm/index.rst       |  4 +-
 .../builtin/llm/mistral-instruct-v0.2.rst     | 43 ++++++++++++++
 doc/source/models/builtin/llm/qwen-chat.rst   | 26 ++++-----
 xinference/model/llm/llm_family.json          | 58 +++++++++++++++++++
 .../model/llm/llm_family_modelscope.json      | 48 +++++++++++++++
 7 files changed, 169 insertions(+), 18 deletions(-)
 create mode 100644 doc/source/models/builtin/llm/mistral-instruct-v0.2.rst

diff --git a/README.md b/README.md
index 7dad5122e6..3b5cd092a3 100644
--- a/README.md
+++ b/README.md
@@ -32,12 +32,12 @@ potential of cutting-edge AI models.
 - Speculative decoding: [#509](https://github.com/xorbitsai/inference/pull/509)
 - Incorporate vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
 ### New Models
+- Built-in support for [mistral-instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2): [#796](https://github.com/xorbitsai/inference/pull/796)
+- Built-in support for [deepseek-llm](https://huggingface.co/deepseek-ai) and [deepseek-coder](https://huggingface.co/deepseek-ai): [#786](https://github.com/xorbitsai/inference/pull/786)
 - Built-in support for [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): [#782](https://github.com/xorbitsai/inference/pull/782)
 - Built-in support for [OpenHermes 2.5](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B): [#776](https://github.com/xorbitsai/inference/pull/776)
 - Built-in support for [Yi](https://huggingface.co/01-ai): [#629](https://github.com/xorbitsai/inference/pull/629)
 - Built-in support for [zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) and [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta): [#597](https://github.com/xorbitsai/inference/pull/597)
-- Built-in support for [chatglm3](https://huggingface.co/THUDM/chatglm3-6b): [#587](https://github.com/xorbitsai/inference/pull/587)
-- Built-in support for [mistral-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) and [mistral-instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1): [#510](https://github.com/xorbitsai/inference/pull/510)
 ### Integrations
 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): an LLMOps platform that enables developers (and even non-developers) to quickly build useful applications based on large language models, ensuring they are visual, operable, and improvable.
 - [Chatbox](https://chatboxai.app/): a desktop client for multiple cutting-edge LLM models, available on Windows, Mac and Linux.
diff --git a/README_zh_CN.md b/README_zh_CN.md
index d519672a86..0db3e24102 100644
--- a/README_zh_CN.md
+++ b/README_zh_CN.md
@@ -30,12 +30,12 @@ Xorbits Inference(Xinference)是一个性能强大且功能全面的分布
 - 投机采样: [#509](https://github.com/xorbitsai/inference/pull/509)
 - 引入 vLLM: [#445](https://github.com/xorbitsai/inference/pull/445)
 ### 新模型
+- 内置 [mistral-instruct-v0.2](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2): [#796](https://github.com/xorbitsai/inference/pull/796)
+- 内置 [deepseek-llm](https://huggingface.co/deepseek-ai) 与 [deepseek-coder](https://huggingface.co/deepseek-ai): [#786](https://github.com/xorbitsai/inference/pull/786)
 - 内置 [Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1): [#782](https://github.com/xorbitsai/inference/pull/782)
 - 内置 [OpenHermes 2.5](https://huggingface.co/teknium/OpenHermes-2.5-Mistral-7B): [#776](https://github.com/xorbitsai/inference/pull/776)
 - 内置 [Yi](https://huggingface.co/01-ai): [#629](https://github.com/xorbitsai/inference/pull/629)
 - 内置 [zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) 与 [zephyr-7b-beta](https://huggingface.co/HuggingFaceH4/zephyr-7b-beta): [#597](https://github.com/xorbitsai/inference/pull/597)
-- 内置 [chatglm3](https://huggingface.co/THUDM/chatglm3-6b): [#587](https://github.com/xorbitsai/inference/pull/587)
-- 内置 [mistral-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) 与 [mistral-instruct-v0.1](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1): [#510](https://github.com/xorbitsai/inference/pull/510)
 ### 集成
 - [Dify](https://docs.dify.ai/advanced/model-configuration/xinference): 一个涵盖了大型语言模型开发、部署、维护和优化的 LLMOps 平台。
 - [Chatbox](https://chatboxai.app/): 一个支持前沿大语言模型的桌面客户端,支持 Windows,Mac,以及 Linux。
diff --git a/doc/source/models/builtin/llm/index.rst b/doc/source/models/builtin/llm/index.rst
index 26b4837205..4f0f57c294 100644
--- a/doc/source/models/builtin/llm/index.rst
+++ b/doc/source/models/builtin/llm/index.rst
@@ -46,7 +46,7 @@ The following is a list of built-in LLM in Xinference:
 
    glaive-coder
 
    gorilla-openfunctions-v1
-   
+   gpt-2
 
    internlm-20b
@@ -63,6 +63,8 @@ The following is a list of built-in LLM in Xinference:
 
    mistral-instruct-v0.1
 
+   mistral-instruct-v0.2
+
    mistral-v0.1
 
    mixtral-instruct-v0.1
diff --git a/doc/source/models/builtin/llm/mistral-instruct-v0.2.rst b/doc/source/models/builtin/llm/mistral-instruct-v0.2.rst
new file mode 100644
index 0000000000..cc3a856b15
--- /dev/null
+++ b/doc/source/models/builtin/llm/mistral-instruct-v0.2.rst
@@ -0,0 +1,43 @@
+.. _models_llm_mistral-instruct-v0.2:
+
+========================================
+mistral-instruct-v0.2
+========================================
+
+- **Context Length:** 8192
+- **Model Name:** mistral-instruct-v0.2
+- **Languages:** en
+- **Abilities:** chat
+- **Description:** The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 7
+- **Quantizations:** 4-bit, 8-bit, none
+- **Model ID:** mistralai/Mistral-7B-Instruct-v0.2
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name mistral-instruct-v0.2 --size-in-billions 7 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (ggufv2, 7 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** ggufv2
+- **Model Size (in billions):** 7
+- **Quantizations:** Q2_K, Q3_K_S, Q3_K_M, Q3_K_L, Q4_0, Q4_K_S, Q4_K_M, Q5_0, Q5_K_S, Q5_K_M, Q6_K, Q8_0
+- **Model ID:** TheBloke/Mistral-7B-Instruct-v0.2-GGUF
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name mistral-instruct-v0.2 --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
+
diff --git a/doc/source/models/builtin/llm/qwen-chat.rst b/doc/source/models/builtin/llm/qwen-chat.rst
index 80856794e2..72ba74340f 100644
--- a/doc/source/models/builtin/llm/qwen-chat.rst
+++ b/doc/source/models/builtin/llm/qwen-chat.rst
@@ -14,32 +14,32 @@ Specifications
 ^^^^^^^^^^^^^^
 
 
-Model Spec 1 (ggmlv3, 7 Billion)
+Model Spec 1 (ggufv2, 7 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
-- **Model Format:** ggmlv3
+- **Model Format:** ggufv2
 - **Model Size (in billions):** 7
-- **Quantizations:** q4_0
-- **Model ID:** Xorbits/qwen-chat-7B-ggml
+- **Quantizations:** Q4_K_M
+- **Model ID:** Xorbits/Qwen-7B-Chat-GGUF
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
 chosen quantization method from the options listed above::
 
-   xinference launch --model-name qwen-chat --size-in-billions 7 --model-format ggmlv3 --quantization ${quantization}
+   xinference launch --model-name qwen-chat --size-in-billions 7 --model-format ggufv2 --quantization ${quantization}
 
 
-Model Spec 2 (ggmlv3, 14 Billion)
+Model Spec 2 (ggufv2, 14 Billion)
 ++++++++++++++++++++++++++++++++++++++++
 
-- **Model Format:** ggmlv3
+- **Model Format:** ggufv2
 - **Model Size (in billions):** 14
-- **Quantizations:** q4_0
-- **Model ID:** Xorbits/qwen-chat-14B-ggml
+- **Quantizations:** Q4_K_M
+- **Model ID:** Xorbits/Qwen-14B-Chat-GGUF
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
 chosen quantization method from the options listed above::
 
-   xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggmlv3 --quantization ${quantization}
+   xinference launch --model-name qwen-chat --size-in-billions 14 --model-format ggufv2 --quantization ${quantization}
 
 
 Model Spec 3 (pytorch, 1_8 Billion)
@@ -47,7 +47,7 @@ Model Spec 3 (pytorch, 1_8 Billion)
 
 - **Model Format:** pytorch
 - **Model Size (in billions):** 1_8
-- **Quantizations:** 4-bit, 8-bit, none
+- **Quantizations:** none
 - **Model ID:** Qwen/Qwen-1_8B-Chat
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
@@ -61,7 +61,7 @@ Model Spec 4 (pytorch, 7 Billion)
 
 - **Model Format:** pytorch
 - **Model Size (in billions):** 7
-- **Quantizations:** 4-bit, 8-bit, none
+- **Quantizations:** none
 - **Model ID:** Qwen/Qwen-7B-Chat
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
@@ -89,7 +89,7 @@ Model Spec 6 (pytorch, 72 Billion)
 
 - **Model Format:** pytorch
 - **Model Size (in billions):** 72
-- **Quantizations:** 4-bit, 8-bit, none
+- **Quantizations:** none
 - **Model ID:** Qwen/Qwen-72B-Chat
 
 Execute the following command to launch the model, remember to replace ``${quantization}`` with your
diff --git a/xinference/model/llm/llm_family.json b/xinference/model/llm/llm_family.json
index 1f0c67a13f..86108503c0 100644
--- a/xinference/model/llm/llm_family.json
+++ b/xinference/model/llm/llm_family.json
@@ -2062,6 +2062,64 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "mistral-instruct-v0.2",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_id": "mistralai/Mistral-7B-Instruct-v0.2",
+        "model_revision": "b70aa86578567ba3301b21c8a27bea4e8f6d6d61"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q2_K",
+          "Q3_K_S",
+          "Q3_K_M",
+          "Q3_K_L",
+          "Q4_0",
+          "Q4_K_S",
+          "Q4_K_M",
+          "Q5_0",
+          "Q5_K_S",
+          "Q5_K_M",
+          "Q6_K",
+          "Q8_0"
+        ],
+        "model_id": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF",
+        "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "NO_COLON_TWO",
+      "system_prompt": "[INST] <<SYS>>\nAn informative and inspiring conversation\n<</SYS>>\n\n",
+      "roles": [
+        "[INST]",
+        "[/INST]"
+      ],
+      "intra_message_sep": " ",
+      "inter_message_sep": " ",
+      "stop_token_ids": [
+        2
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 8192,
diff --git a/xinference/model/llm/llm_family_modelscope.json b/xinference/model/llm/llm_family_modelscope.json
index bc579b3151..00107de1c7 100644
--- a/xinference/model/llm/llm_family_modelscope.json
+++ b/xinference/model/llm/llm_family_modelscope.json
@@ -1209,6 +1209,54 @@
       ]
     }
   },
+  {
+    "version": 1,
+    "context_length": 8192,
+    "model_name": "mistral-instruct-v0.2",
+    "model_lang": [
+      "en"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "The Mistral-7B-Instruct-v0.2 Large Language Model (LLM) is an improved instruct fine-tuned version of Mistral-7B-Instruct-v0.1.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "4-bit",
+          "8-bit",
+          "none"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "AI-ModelScope/Mistral-7B-Instruct-v0.2"
+      },
+      {
+        "model_format": "ggufv2",
+        "model_size_in_billions": 7,
+        "quantizations": [
+          "Q4_K_M"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "Xorbits/Mistral-7B-Instruct-v0.2-GGUF",
+        "model_file_name_template": "mistral-7b-instruct-v0.2.{quantization}.gguf"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "NO_COLON_TWO",
+      "system_prompt": "[INST] <<SYS>>\nAn informative and inspiring conversation\n<</SYS>>\n\n",
+      "roles": [
+        "[INST]",
+        "[/INST]"
+      ],
+      "intra_message_sep": " ",
+      "inter_message_sep": " ",
+      "stop_token_ids": [
+        2
+      ]
+    }
+  },
   {
     "version": 1,
     "context_length": 2048,
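
--
Usage note: with this patch applied, the new model can be launched exactly as documented in mistral-instruct-v0.2.rst above. A minimal sketch, assuming a locally running Xinference server on its default endpoint and picking ``Q4_K_M`` from the GGUF quantizations listed in the spec (any listed value may be substituted; pytorch-format launches instead take ``4-bit``, ``8-bit``, or ``none``)::

   xinference launch --model-name mistral-instruct-v0.2 --size-in-billions 7 --model-format ggufv2 --quantization Q4_K_M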