From 6e44d6b3933d2bafc513b4541ab9bec5a2871f01 Mon Sep 17 00:00:00 2001 From: Zhihao Lin <36994684+LZHgrla@users.noreply.github.com> Date: Tue, 5 Sep 2023 16:08:28 +0800 Subject: [PATCH] [Improve] Redesign convert tools (#96) * refactor tools * modify entry_point * modify docs * update docs * fix * fix * Update README.md * Update README.md * Update README.md * Update README_zh-CN.md * fix pre-commit * rename converter * update pth2hf * rename pth2hf to pth_to_hf * add fp32 for pth_to_hf * Update README.md * Update README_zh-CN.md * Update README_zh-CN.md * Update README.md * Update README_zh-CN.md * Update README_zh-CN.md * Update README.md * Update README.md * Update README_zh-CN.md * fix pre-commit --- README.md | 99 ++++---- README_zh-CN.md | 99 ++++---- docs/en/user_guides/chat.md | 36 +-- docs/zh_cn/user_guides/chat.md | 36 +-- xtuner/entry_point.py | 73 ++---- xtuner/tools/chat.py | 97 ++++---- xtuner/tools/chat_hf.py | 235 ------------------ .../tools/model_converters/adapter_pth2hf.py | 79 ------ .../{merge_adapter_hf.py => merge.py} | 7 +- .../tools/model_converters/merge_adapter.py | 79 ------ xtuner/tools/model_converters/pth_to_hf.py | 108 ++++++++ .../{split_hf_llm.py => split.py} | 0 12 files changed, 328 insertions(+), 620 deletions(-) delete mode 100644 xtuner/tools/chat_hf.py delete mode 100644 xtuner/tools/model_converters/adapter_pth2hf.py rename xtuner/tools/model_converters/{merge_adapter_hf.py => merge.py} (87%) delete mode 100644 xtuner/tools/model_converters/merge_adapter.py create mode 100644 xtuner/tools/model_converters/pth_to_hf.py rename xtuner/tools/model_converters/{split_hf_llm.py => split.py} (100%) diff --git a/README.md b/README.md index b7f06adbe..d28730113 100644 --- a/README.md +++ b/README.md @@ -26,9 +26,28 @@ XTuner is a toolkit for efficiently fine-tuning LLM, developed by the [MMRazor]( ## 🌟 Demos +- Ready-to-use models and datasets from XTuner API [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eBI9yiOkX-t7P-0-t9vS8y1x5KmWrkoU?usp=sharing) + - QLoRA Fine-tune [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QAEZVBfQ7LZURkMUtaq0b-5nEQII9G9Z?usp=sharing) + - Plugin-based Chat [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/144OuTVyT_GvFyDMtlSlTzcxYIfnRsklq?usp=sharing) -- Ready-to-use models and datasets from XTuner API [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eBI9yiOkX-t7P-0-t9vS8y1x5KmWrkoU?usp=sharing) + + + + + + + + + + +
Examples of Plugin-based Chat 🔥🔥🔥
+ + + + + +
## 🔥 Supports @@ -123,35 +142,6 @@ XTuner is a toolkit for efficiently fine-tuning LLM, developed by the [MMRazor]( pip install -e '.[all]' ``` -### Chat [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/144OuTVyT_GvFyDMtlSlTzcxYIfnRsklq?usp=sharing) - - - - - - - - - - -
Examples of Plugins-based Chat 🔥🔥🔥
- - - - - -
-
-XTuner provides tools to chat with pretrained / fine-tuned LLMs.
-
-- For example, we can start the chat with Llama2-7B-Plugins by
-
-  ```shell
-  xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "<eoc>" --answer-stop-word "<eom>" --no-streamer
-  ```
-
-For more examples, please see [chat.md](./docs/en/user_guides/chat.md).
-
 ### Fine-tune [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QAEZVBfQ7LZURkMUtaq0b-5nEQII9G9Z?usp=sharing)
 
 XTuner supports the efficient fine-tune (*e.g.*, QLoRA) for LLMs. Dataset prepare guides can be found on [dataset_prepare.md](./docs/en/user_guides/dataset_prepare.md).
@@ -165,10 +155,16 @@ XTuner supports the efficient fine-tune (*e.g.*, QLoRA) for LLMs. Dataset prepar
   Or, if the provided configs cannot meet the requirements, please copy the provided config to the specified directory and make specific modifications by
 
   ```shell
-  xtuner copy-cfg ${CONFIG_NAME} ${SAVE_DIR}
+  xtuner copy-cfg ${CONFIG_NAME} ${SAVE_PATH}
   ```
 
-- **Step 1**, start fine-tuning. For example, we can start the QLoRA fine-tuning of InternLM-7B with oasst1 dataset by
+- **Step 1**, start fine-tuning.
+
+  ```shell
+  xtuner train ${CONFIG_NAME_OR_PATH}
+  ```
+
+  For example, we can start the QLoRA fine-tuning of InternLM-7B with oasst1 dataset by
 
   ```shell
   # On a single GPU
   xtuner train internlm_7b_qlora_oasst1_e3
@@ -180,24 +176,37 @@ XTuner supports the efficient fine-tune (*e.g.*, QLoRA) for LLMs. Dataset prepar
   NPROC_PER_NODE=${GPU_NUM} xtuner train internlm_7b_qlora_oasst1_e3
   ```
 
 For more examples, please see [finetune.md](./docs/en/user_guides/finetune.md).
 
-### Deployment
-
-- **Step 0**, convert the pth adapter to HuggingFace adapter, by
+- **Step 2**, convert the saved PTH model (if using DeepSpeed, it will be a directory) to a HuggingFace model, by
 
   ```shell
-  xtuner convert adapter_pth2hf \
-      ${CONFIG} \
-      ${PATH_TO_PTH_ADAPTER} \
-      ${SAVE_PATH_TO_HF_ADAPTER}
+  xtuner convert pth_to_hf ${CONFIG_NAME_OR_PATH} ${PTH} ${SAVE_PATH}
   ```
 
-  or, directly merge the pth adapter to pretrained LLM, by
+### Chat [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/144OuTVyT_GvFyDMtlSlTzcxYIfnRsklq?usp=sharing)
+
+XTuner provides tools to chat with pretrained / fine-tuned LLMs.
+
+```shell
+xtuner chat ${NAME_OR_PATH_TO_LLM} --adapter ${NAME_OR_PATH_TO_ADAPTER} [optional arguments]
+```
+
+For example, we can chat with Llama2-7B, using the adapter trained on MOSS-003-SFT, by
+
+```shell
+xtuner chat meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "<eoc>" --answer-stop-word "<eom>" --no-streamer
+```
+
+For more examples, please see [chat.md](./docs/en/user_guides/chat.md).
+
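+Under the hood, `xtuner chat` roughly performs the steps below. This is a minimal, illustrative Python sketch based on the rewritten `xtuner/tools/chat.py` in this PR; prompt templates, plugins and quantization are omitted, and the model / adapter names simply reuse the example above:
+
+```python
+from peft import PeftModel
+from transformers import (AutoModelForCausalLM, AutoTokenizer,
+                          GenerationConfig)
+
+base = 'meta-llama/Llama-2-7b-hf'
+adapter = 'xtuner/Llama-2-7b-qlora-moss-003-sft'
+
+# Load the base LLM and tokenizer, then attach the LoRA adapter.
+model = AutoModelForCausalLM.from_pretrained(
+    base, device_map='auto', trust_remote_code=True)
+tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
+model = PeftModel.from_pretrained(model, adapter)
+
+# Encode a prompt and generate a reply (assumes a CUDA device, as chat.py does).
+ids = tokenizer.encode('Hello!', return_tensors='pt')
+out = model.generate(
+    inputs=ids.cuda(),
+    generation_config=GenerationConfig(max_new_tokens=64))
+print(tokenizer.decode(out[0][len(ids[0]):]))
+```
+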
+### Deployment
+
+- **Step 0**, merge the HuggingFace adapter into the pretrained LLM, by
 
   ```shell
-  xtuner convert merge_adapter \
-      ${CONFIG} \
-      ${PATH_TO_PTH_ADAPTER} \
-      ${SAVE_PATH_TO_MERGED_LLM} \
+  xtuner convert merge \
+      ${NAME_OR_PATH_TO_LLM} \
+      ${NAME_OR_PATH_TO_ADAPTER} \
+      ${SAVE_PATH} \
       --max-shard-size 2GB
   ```
 
@@ -212,7 +221,9 @@ XTuner supports the efficient fine-tune (*e.g.*, QLoRA) for LLMs. Dataset prepar
       --seed 0
   ```
 
-  🎯 We are woking closely with [LMDeploy](https://github.com/InternLM/lmdeploy), to implement the deployment of **plugins-based chat**!
+  🔥 Seeking efficient inference with less GPU memory? Try 4-bit quantization from [LMDeploy](https://github.com/InternLM/lmdeploy)! For more details, see [here](https://github.com/InternLM/lmdeploy/tree/main#quantization).
+
+  🎯 We are working closely with [LMDeploy](https://github.com/InternLM/lmdeploy) to implement the deployment of **plugin-based chat**!
 
 ### Evaluation
 
diff --git a/README_zh-CN.md b/README_zh-CN.md
index 30f512647..0ca037b7a 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -26,9 +26,28 @@ XTuner 是一个轻量级微调大语言模型的工具库,由 [MMRazor](https
 
 ## 🌟 示例
 
+- XTuner APIs所提供的开箱即用的模型与数据集 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eBI9yiOkX-t7P-0-t9vS8y1x5KmWrkoU?usp=sharing)
+  - QLoRA 微调 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QAEZVBfQ7LZURkMUtaq0b-5nEQII9G9Z?usp=sharing)
+  - 基于插件的对话 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/144OuTVyT_GvFyDMtlSlTzcxYIfnRsklq?usp=sharing)
-- XTuner APIs所提供的开箱即用的模型与数据集 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1eBI9yiOkX-t7P-0-t9vS8y1x5KmWrkoU?usp=sharing)
+
+
+
+
+
+
+
+
+
+
基于插件的对话 🔥🔥🔥
+ + + + + +
## 🔥 支持列表 @@ -123,35 +142,6 @@ XTuner 是一个轻量级微调大语言模型的工具库,由 [MMRazor](https pip install -e '.[all]' ``` -### 对话 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/144OuTVyT_GvFyDMtlSlTzcxYIfnRsklq?usp=sharing) - - - - - - - - - - -
基于插件的对话 🔥🔥🔥
- - - - - -
-
-XTuner 提供与大语言模型对话的工具。
-
-- 例如,与基于插件微调获得的 Llama2-7B-Plugins 对话:
-
-  ```shell
-  xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "<eoc>" --answer-stop-word "<eom>" --no-streamer
-  ```
-
-更多示例,请查阅[文档](./docs/zh_cn/user_guides/chat.md)。
-
 ### 微调 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1QAEZVBfQ7LZURkMUtaq0b-5nEQII9G9Z?usp=sharing)
 
 XTuner 支持微调大语言模型。数据集预处理指南请查阅[文档](./docs/zh_cn/user_guides/dataset_prepare.md)。
@@ -165,10 +155,16 @@ XTuner 支持微调大语言模型。数据集预处理指南请查阅[文档](.
   或者,如果所提供的配置文件不能满足使用需求,请导出所提供的配置文件并进行相应更改:
 
   ```shell
-  xtuner copy-cfg ${CONFIG_NAME} ${SAVE_DIR}
+  xtuner copy-cfg ${CONFIG_NAME} ${SAVE_PATH}
   ```
 
-- **步骤 1**,开始微调。例如,我们可以利用 QLoRA 算法在 oasst1 数据集上微调 InternLM-7B:
+- **步骤 1**,开始微调。
+
+  ```shell
+  xtuner train ${CONFIG_NAME_OR_PATH}
+  ```
+
+  例如,我们可以利用 QLoRA 算法在 oasst1 数据集上微调 InternLM-7B:
 
   ```shell
   # 单卡
   xtuner train internlm_7b_qlora_oasst1_e3
@@ -177,26 +173,39 @@ XTuner 支持微调大语言模型。数据集预处理指南请查阅[文档](.
   NPROC_PER_NODE=${GPU_NUM} xtuner train internlm_7b_qlora_oasst1_e3
   ```
 
-  更多示例,请查阅[文档](./docs/zh_cn/user_guides/finetune.md).
+  更多示例,请查阅[文档](./docs/zh_cn/user_guides/finetune.md)。
 
-### 部署
-
-- **步骤 0**,将 pth adapter 转换为 HuggingFace adapter:
+- **步骤 2**,将保存的 PTH 模型(如果使用了 DeepSpeed,则将会是一个文件夹)转换为 HuggingFace 模型:
 
   ```shell
-  xtuner convert adapter_pth2hf \
-      ${CONFIG} \
-      ${PATH_TO_PTH_ADAPTER} \
-      ${SAVE_PATH_TO_HF_ADAPTER}
+  xtuner convert pth_to_hf ${CONFIG_NAME_OR_PATH} ${PTH} ${SAVE_PATH}
   ```
 
-  或者,直接将 pth adapter 合并到大语言模型:
+### 对话 [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/144OuTVyT_GvFyDMtlSlTzcxYIfnRsklq?usp=sharing)
+
+XTuner 提供与大语言模型对话的工具。
+
+```shell
+xtuner chat ${NAME_OR_PATH_TO_LLM} --adapter ${NAME_OR_PATH_TO_ADAPTER} [optional arguments]
+```
+
+例如,与 Llama2-7B + MOSS-003-SFT adapter 对话:
+
+```shell
+xtuner chat meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "<eoc>" --answer-stop-word "<eom>" --no-streamer
+```
+
+更多示例,请查阅[文档](./docs/zh_cn/user_guides/chat.md)。
+
+### 部署
+
+- **步骤 0**,将 HuggingFace adapter 合并到大语言模型:
 
   ```shell
-  xtuner convert merge_adapter \
-      ${CONFIG} \
-      ${PATH_TO_PTH_ADAPTER} \
-      ${SAVE_PATH_TO_MERGED_LLM} \
+  xtuner convert merge \
+      ${NAME_OR_PATH_TO_LLM} \
+      ${NAME_OR_PATH_TO_ADAPTER} \
+      ${SAVE_PATH} \
       --max-shard-size 2GB
   ```
 
@@ -211,6 +220,8 @@ XTuner 支持微调大语言模型。数据集预处理指南请查阅[文档](.
       --seed 0
   ```
 
+  🔥 追求速度更快、显存占用更低的推理?欢迎体验 [LMDeploy](https://github.com/InternLM/lmdeploy) 提供的 4-bit 量化!使用指南请见[文档](https://github.com/InternLM/lmdeploy/tree/main#quantization)。
+
   🎯 我们正在与 [LMDeploy](https://github.com/InternLM/lmdeploy) 紧密合作,以实现基于插件对话的部署!
 
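+💡 参考:`xtuner convert merge` 的行为大致等价于下面的 Python 示意代码(仅为示意,基于 peft 的公开 API;模型与 adapter 名称沿用上文示例,保存路径 `./merged_llm` 为假设):
+
+```python
+from peft import PeftModel
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+base = 'meta-llama/Llama-2-7b-hf'
+adapter = 'xtuner/Llama-2-7b-qlora-moss-003-sft'
+
+model = AutoModelForCausalLM.from_pretrained(
+    base, torch_dtype='auto', trust_remote_code=True)
+model = PeftModel.from_pretrained(model, adapter)
+merged = model.merge_and_unload()  # 将 LoRA 权重合并回基座模型
+merged.save_pretrained('./merged_llm', max_shard_size='2GB')
+tokenizer = AutoTokenizer.from_pretrained(base, trust_remote_code=True)
+tokenizer.save_pretrained('./merged_llm')
+```
+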
### 评测 diff --git a/docs/en/user_guides/chat.md b/docs/en/user_guides/chat.md index 2914296f0..65725f7a3 100644 --- a/docs/en/user_guides/chat.md +++ b/docs/en/user_guides/chat.md @@ -5,49 +5,49 @@ - InternLM-7B, oasst1 ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-oasst1 --prompt-template openassistant + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-oasst1 --prompt-template openassistant ``` - InternLM-7B, Arxiv Gentitle ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-arxiv-gentitle --prompt-template title + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-arxiv-gentitle --prompt-template title ``` - InternLM-7B, Colorist ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-colorist --prompt-template colorist + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-colorist --prompt-template colorist ``` - InternLM-7B, Coder ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-coder --prompt-template code + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-coder --prompt-template code ``` - InternLM-7B, SQL ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-sql --prompt-template sql + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-sql --prompt-template sql ``` - InternLM-7B, Lawyer ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-lawyer --prompt-template lawyer + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-lawyer --prompt-template lawyer ``` - InternLM-7B, Open-Platypus ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-open-platypus --prompt-template alpaca + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-open-platypus --prompt-template alpaca ``` - InternLM-7B, Alpaca-enzh ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-alpaca-enzh --prompt-template alpaca + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-alpaca-enzh --prompt-template alpaca ``` ## Chat with [Llama2](https://github.com/facebookresearch/llama) @@ -58,19 +58,19 @@ ```shell export SERPER_API_KEY="xxx" # Please get the key from https://serper.dev to support google search! - xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "" --answer-stop-word "" --no-streamer + xtuner chat meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "" --answer-stop-word "" --no-streamer ``` - Llama2-7B, Arxiv Gentitle ```shell - xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-arxiv-gentitle --prompt-template title + xtuner chat meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-arxiv-gentitle --prompt-template title ``` - Llama2-7B, Colorist ```shell - xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-colorist --prompt-template colorist + xtuner chat meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-colorist --prompt-template colorist ``` ## Chat with [Qwen](https://github.com/QwenLM) @@ -79,25 +79,25 @@ ```shell export SERPER_API_KEY="xxx" # Please get the key from https://serper.dev to support google search! 
- xtuner chat hf Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-moss-003-sft --bot-name Qwen --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "" --answer-stop-word "" + xtuner chat Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-moss-003-sft --bot-name Qwen --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "" --answer-stop-word "" ``` - Qwen-7B, oasst1 ```shell - xtuner chat hf Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-oasst1 --prompt-template openassistant --answer-stop-word '<|endoftext|>' + xtuner chat Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-oasst1 --prompt-template openassistant --answer-stop-word '<|endoftext|>' ``` - Qwen-7B, Arxiv Gentitle ```shell - xtuner chat hf Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-arxiv-gentitle --prompt-template title --answer-stop-word '<|endoftext|>' + xtuner chat Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-arxiv-gentitle --prompt-template title --answer-stop-word '<|endoftext|>' ``` - Qwen-7B, Alpaca-enzh ```shell - xtuner chat hf Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-alpaca-enzh --prompt-template alpaca --answer-stop-word '<|endoftext|>' + xtuner chat Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-alpaca-enzh --prompt-template alpaca --answer-stop-word '<|endoftext|>' ``` ## Chat with [Baichuan](https://github.com/baichuan-inc) @@ -105,17 +105,17 @@ - Baichuan-7B, oasst1 ```shell - xtuner chat hf baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-oasst1 --prompt-template openassistant + xtuner chat baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-oasst1 --prompt-template openassistant ``` - Baichuan-7B, Arxiv Gentitle ```shell - xtuner chat hf baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-arxiv-gentitle --prompt-template title --no-streamer + xtuner chat baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-arxiv-gentitle --prompt-template title --no-streamer ``` - Baichuan-7B, Alpaca-enzh ```shell - xtuner chat hf baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-alpaca-enzh --prompt-template alpaca + xtuner chat baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-alpaca-enzh --prompt-template alpaca ``` diff --git a/docs/zh_cn/user_guides/chat.md b/docs/zh_cn/user_guides/chat.md index 1ae01388b..6fef1684c 100644 --- a/docs/zh_cn/user_guides/chat.md +++ b/docs/zh_cn/user_guides/chat.md @@ -5,49 +5,49 @@ - InternLM-7B, oasst1 ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-oasst1 --prompt-template openassistant + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-oasst1 --prompt-template openassistant ``` - InternLM-7B, Arxiv Gentitle ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-arxiv-gentitle --prompt-template title + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-arxiv-gentitle --prompt-template title ``` - InternLM-7B, Colorist ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-colorist --prompt-template colorist + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-colorist --prompt-template colorist ``` - InternLM-7B, Coder ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-coder --prompt-template code + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-coder --prompt-template code ``` - InternLM-7B, SQL ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-sql --prompt-template sql + 
xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-sql --prompt-template sql ``` - InternLM-7B, Lawyer ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-lawyer --prompt-template lawyer + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-lawyer --prompt-template lawyer ``` - InternLM-7B, Open-Platypus ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-open-platypus --prompt-template alpaca + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-open-platypus --prompt-template alpaca ``` - InternLM-7B, Alpaca-enzh ```shell - xtuner chat hf internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-alpaca-enzh --prompt-template alpaca + xtuner chat internlm/internlm-7b --adapter xtuner/internlm-7b-qlora-alpaca-enzh --prompt-template alpaca ``` ## 与微调后的 [Llama2](https://github.com/facebookresearch/llama) 对话 @@ -58,19 +58,19 @@ ```shell export SERPER_API_KEY="xxx" # 请从 https://serper.dev 获得API_KEY,以此支持谷歌搜索! - xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "" --answer-stop-word "" --no-streamer + xtuner chat meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-moss-003-sft --bot-name Llama2 --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "" --answer-stop-word "" --no-streamer ``` - Llama2-7B, Arxiv Gentitle ```shell - xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-arxiv-gentitle --prompt-template title + xtuner chat meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-arxiv-gentitle --prompt-template title ``` - Llama2-7B, Colorist ```shell - xtuner chat hf meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-colorist --prompt-template colorist + xtuner chat meta-llama/Llama-2-7b-hf --adapter xtuner/Llama-2-7b-qlora-colorist --prompt-template colorist ``` ## 与微调后的 [Qwen](https://github.com/QwenLM) 对话 @@ -79,25 +79,25 @@ ```shell export SERPER_API_KEY="xxx" # 请从 https://serper.dev 获得API_KEY,以此支持谷歌搜索! 
- xtuner chat hf Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-moss-003-sft --bot-name Qwen --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "" --answer-stop-word "" + xtuner chat Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-moss-003-sft --bot-name Qwen --prompt-template moss_sft --with-plugins calculate solve search --command-stop-word "" --answer-stop-word "" ``` - Qwen-7B, oasst1 ```shell - xtuner chat hf Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-oasst1 --prompt-template openassistant --answer-stop-word '<|endoftext|>' + xtuner chat Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-oasst1 --prompt-template openassistant --answer-stop-word '<|endoftext|>' ``` - Qwen-7B, Arxiv Gentitle ```shell - xtuner chat hf Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-arxiv-gentitle --prompt-template title --answer-stop-word '<|endoftext|>' + xtuner chat Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-arxiv-gentitle --prompt-template title --answer-stop-word '<|endoftext|>' ``` - Qwen-7B, Alpaca-enzh ```shell - xtuner chat hf Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-alpaca-enzh --prompt-template alpaca --answer-stop-word '<|endoftext|>' + xtuner chat Qwen/Qwen-7B --adapter xtuner/Qwen-7B-qlora-alpaca-enzh --prompt-template alpaca --answer-stop-word '<|endoftext|>' ``` ## 与微调后的 [Baichuan](https://github.com/baichuan-inc) 对话 @@ -105,17 +105,17 @@ - Baichuan-7B, oasst1 ```shell - xtuner chat hf baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-oasst1 --prompt-template openassistant + xtuner chat baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-oasst1 --prompt-template openassistant ``` - Baichuan-7B, Arxiv Gentitle ```shell - xtuner chat hf baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-arxiv-gentitle --prompt-template title --no-streamer + xtuner chat baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-arxiv-gentitle --prompt-template title --no-streamer ``` - Baichuan-7B, Alpaca-enzh ```shell - xtuner chat hf baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-alpaca-enzh --prompt-template alpaca + xtuner chat baichuan-inc/Baichuan-7B --adapter xtuner/Baichuan-7B-qlora-alpaca-enzh --prompt-template alpaca ``` diff --git a/xtuner/entry_point.py b/xtuner/entry_point.py index 628610d38..f36f2cc39 100644 --- a/xtuner/entry_point.py +++ b/xtuner/entry_point.py @@ -8,10 +8,9 @@ from mmengine.logging import print_log import xtuner -from xtuner.tools import chat, chat_hf, copy_cfg, list_cfg, test, train +from xtuner.tools import chat, copy_cfg, list_cfg, test, train from xtuner.tools.data_preprocess import arxiv as arxiv_preprocess -from xtuner.tools.model_converters import (adapter_pth2hf, merge_adapter, - merge_adapter_hf, split_hf_llm) +from xtuner.tools.model_converters import merge, pth_to_hf, split # Define valid modes MODES = ('list-cfg', 'copy-cfg', 'train', 'test', 'chat', 'convert', @@ -37,18 +36,14 @@ xtuner train $CONFIG 3-2. Fine-tune LLMs by multiple GPUs: NPROC_PER_NODE=$NGPUS NNODES=$NNODES NODE_RANK=$NODE_RANK PORT=$PORT ADDR=$ADDR xtuner dist_train $CONFIG $GPUS - 4-1. Chat with LLMs with HuggingFace's model and adapter: - xtuner chat hf $NAME_OR_PATH_TO_HF_MODEL --adapter $NAME_OR_PATH_TO_HF_ADAPTER --prompt-template $PROMPT_TEMPLATE - 4-2. Chat with LLMs with XTuner's config and adapter: - xtuner chat xtuner $CONFIG --adapter $PATH_TO_PTH_ADAPTER --prompt $PROMPT_TEMPLATE - 5-1. 
Convert the pth adapter to HuggingFace's adapter:
-            xtuner convert adapter_pth2hf $CONFIG $PATH_TO_PTH_ADAPTER $SAVE_PATH_TO_HF_ADAPTER
-        5-2. Merge the HuggingFace's adapter to the pretrained LLM:
-            xtuner convert merge_adapter_hf $NAME_OR_PATH_TO_HF_MODEL $NAME_OR_PATH_TO_HF_ADAPTER $SAVE_PATH
-        5-3. Merge the XTuner's adapter to the pretraiend LLM:
-            xtuner convert merge_adapter $CONFIG $NAME_OR_PATH_TO_HF_ADAPTER $SAVE_PATH
-        5-4. Split HuggingFace's LLM to the smallest sharded one:
-            xtuner convert split_hf_llm $NAME_OR_PATH_TO_HF_MODEL $SAVE_PATH
+        4-1. Convert the pth model to a HuggingFace model:
+            xtuner convert pth_to_hf $CONFIG $PATH_TO_PTH_MODEL $SAVE_PATH_TO_HF_MODEL
+        4-2. Merge a HuggingFace adapter into the pretrained LLM:
+            xtuner convert merge $NAME_OR_PATH_TO_LLM $NAME_OR_PATH_TO_ADAPTER $SAVE_PATH
+        4-3. Split a HuggingFace LLM into the smallest possible shards:
+            xtuner convert split $NAME_OR_PATH_TO_LLM $SAVE_PATH
+        5. Chat with an LLM using a HuggingFace model and adapter:
+            xtuner chat $NAME_OR_PATH_TO_LLM --adapter $NAME_OR_PATH_TO_ADAPTER --prompt-template $PROMPT_TEMPLATE
 
         6-1. Preprocess arxiv dataset:
             xtuner preprocess arxiv $SRC_FILE $DST_FILE --start-date $START_DATE --categories $CATEGORIES
@@ -73,15 +68,12 @@
     Some usages for convert: (See more by using -h for specific command!)
 
-        1. Convert the pth adapter to HuggingFace's adapter:
-            xtuner convert adapter_pth2hf $CONFIG $PATH_TO_PTH_ADAPTER $SAVE_PATH_TO_HF_ADAPTER
-        2. Merge the HuggingFace's adapter to the pretrained LLM:
-            xtuner convert merge_adapter_hf $NAME_OR_PATH_TO_HF_MODEL $NAME_OR_PATH_TO_HF_ADAPTER $SAVE_PATH
-        3. Merge the XTuner's
-           adapter to the pretraiend LLM:
-            xtuner convert merge_adapter $CONFIG $NAME_OR_PATH_TO_HF_ADAPTER $SAVE_PATH
-        4. Split HuggingFace's LLM to the smallest sharded one:
-            xtuner convert split_hf_llm $NAME_OR_PATH_TO_HF_MODEL $SAVE_PATH
+        1. Convert the pth model to a HuggingFace model:
+            xtuner convert pth_to_hf $CONFIG $PATH_TO_PTH_MODEL $SAVE_PATH_TO_HF_MODEL
+        2. Merge a HuggingFace adapter into the pretrained LLM:
+            xtuner convert merge $NAME_OR_PATH_TO_LLM $NAME_OR_PATH_TO_ADAPTER $SAVE_PATH
+        3. Split a HuggingFace LLM into the smallest possible shards:
+            xtuner convert split $NAME_OR_PATH_TO_LLM $SAVE_PATH
 
     GitHub: https://github.com/InternLM/xtuner
     """  # noqa: E501
 
@@ -105,27 +97,6 @@ GitHub: https://github.com/InternLM/xtuner
     """  # noqa: E501
 
-
-CHAT_HELP_MSG = \
-    f"""
-    Arguments received: {str(['xtuner'] + sys.argv[1:])}. xtuner commands use the following syntax:
-
-        xtuner MODE MODE_ARGS ARGS
-
-        Where   MODE (required) is one of {MODES}
-                MODE_ARG (optional) is the argument for specific mode
-                ARGS (optional) are the arguments for specific command
-
-    Some usages for chat: (See more by using -h for specific command!)
-
-        1. Chat with LLMs with HuggingFace's model and adapter:
-            xtuner chat hf $NAME_OR_PATH_TO_HF_MODEL --adapter $NAME_OR_PATH_TO_HF_ADAPTER --prompt-template $PROMPT_TEMPLATE
-        2. 
Chat with LLMs with XTuner's config and adapter: - xtuner chat xtuner internlm_7b_qlora_alpaca --adapter $PATH_TO_PTH_ADAPTER --prompt $PROMPT_TEMPLATE - - GitHub: https://github.com/InternLM/xtuner - """ # noqa: E501 - special = { 'help': lambda: print_log(CLI_HELP_MSG, 'current'), 'version': lambda: print_log(xtuner.__version__, 'current') @@ -143,17 +114,11 @@ 'copy-cfg': copy_cfg.__file__, 'train': train.__file__, 'test': test.__file__, - 'chat': { - 'hf': chat_hf.__file__, - 'xtuner': chat.__file__, - '--help': lambda: print_log(CHAT_HELP_MSG, 'current'), - '-h': lambda: print_log(CHAT_HELP_MSG, 'current') - }, + 'chat': chat.__file__, 'convert': { - 'adapter_pth2hf': adapter_pth2hf.__file__, - 'merge_adapter': merge_adapter.__file__, - 'merge_adapter_hf': merge_adapter_hf.__file__, - 'split_hf_llm': split_hf_llm.__file__, + 'pth_to_hf': pth_to_hf.__file__, + 'merge': merge.__file__, + 'split': split.__file__, '--help': lambda: print_log(CONVERT_HELP_MSG, 'current'), '-h': lambda: print_log(CONVERT_HELP_MSG, 'current') }, diff --git a/xtuner/tools/chat.py b/xtuner/tools/chat.py index db3128e65..f3cf452c6 100644 --- a/xtuner/tools/chat.py +++ b/xtuner/tools/chat.py @@ -1,35 +1,45 @@ # Copyright (c) OpenMMLab. All rights reserved. import argparse -import os import re import torch -from mmengine.config import Config, DictAction -from transformers import GenerationConfig +from peft import PeftModel +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig, GenerationConfig) -from xtuner.configs import cfgs_name_path -from xtuner.registry import BUILDER from xtuner.tools.utils import get_chat_utils, update_stop_criteria from xtuner.utils import PROMPT_TEMPLATE +def remove_prefix(state_dict, prefix): + new_state_dict = {} + for key, value in state_dict.items(): + if key.startswith(prefix): + new_key = key[len(prefix):] + new_state_dict[new_key] = value + else: + new_state_dict[key] = value + return new_state_dict + + def parse_args(): - parser = argparse.ArgumentParser( - description='Chat with a pretrained model') + parser = argparse.ArgumentParser(description='Chat with a HF model') parser.add_argument( - 'config', - help='config file name or path. Note: Please use the original ' - 'configs, instead of the automatically saved log configs.') - parser.add_argument('--adapter', default=None, help='adapter model') + 'model_name_or_path', help='Hugging Face model name or path') + parser.add_argument('--adapter', default=None, help='adapter name or path') parser.add_argument( '--prompt-template', choices=PROMPT_TEMPLATE.keys(), default=None, help='Specify a prompt option') parser.add_argument( - '--is-deepspeed', - action='store_true', - help='whether the adapter is saved from deepspeed') + '--bits', + type=int, + choices=[4, 8, None], + default=None, + help='LLM bits') + parser.add_argument( + '--bot-name', type=str, default='BOT', help='Name for Bot') parser.add_argument( '--with-plugins', nargs='+', @@ -67,16 +77,6 @@ def parse_args(): type=int, default=0, help='Random seed for reproducible text generation') - parser.add_argument( - '--cfg-options', - nargs='+', - action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file. If the value to ' - 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' - 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' - 'Note that the quotation marks are necessary and that no white space ' - 'is allowed.') args = parser.parse_args() return args @@ -119,29 +119,30 @@ def main(): torch.manual_seed(args.seed) - # parse config - if not os.path.isfile(args.config): - try: - args.config = cfgs_name_path[args.config] - except KeyError: - raise FileNotFoundError(f'Cannot find {args.config}') - - # load config - cfg = Config.fromfile(args.config) - if args.cfg_options is not None: - cfg.merge_from_dict(args.cfg_options) - - model = BUILDER.build(cfg.model) - # Cast to inference mode - model.llm.gradient_checkpointing_disable() - model.llm.config.use_cache = True - - tokenizer = BUILDER.build(cfg.tokenizer) - + # build model + quantization_config = None + load_in_8bit = False + if args.bits == 4: + quantization_config = BitsAndBytesConfig( + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4') + elif args.bits == 8: + load_in_8bit = True + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + quantization_config=quantization_config, + load_in_8bit=load_in_8bit, + device_map='auto', + trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, trust_remote_code=True) if args.adapter is not None: - adapter = torch.load(args.adapter, map_location='cpu') - state_dict_key = 'module' if args.is_deepspeed else 'state_dict' - model.load_state_dict(adapter[state_dict_key], strict=False) + model = PeftModel.from_pretrained(model, args.adapter) print(f'Load adapter from {args.adapter}') Streamer, stop_criteria = get_chat_utils(model) @@ -173,10 +174,10 @@ def main(): template = PROMPT_TEMPLATE[args.prompt_template] if 'INSTRUCTION_START' in template and n_turn == 0: prompt_text = template['INSTRUCTION_START'].format( - input=text, round=n_turn + 1, **cfg) + input=text, round=n_turn + 1, bot_name=args.bot_name) else: prompt_text = template['INSTRUCTION'].format( - input=text, round=n_turn + 1, **cfg) + input=text, round=n_turn + 1, bot_name=args.bot_name) if args.prompt_template == 'moss_sft': if not inner_thoughts_open: prompt_text.replace('- Inner thoughts: enabled.', diff --git a/xtuner/tools/chat_hf.py b/xtuner/tools/chat_hf.py deleted file mode 100644 index 7cc81d89a..000000000 --- a/xtuner/tools/chat_hf.py +++ /dev/null @@ -1,235 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import argparse -import re - -import torch -from peft import PeftModel -from transformers import (AutoModelForCausalLM, AutoTokenizer, - BitsAndBytesConfig, GenerationConfig) - -from xtuner.tools.utils import get_chat_utils, update_stop_criteria -from xtuner.utils import PROMPT_TEMPLATE - - -def parse_args(): - parser = argparse.ArgumentParser(description='Chat with a HF model') - parser.add_argument( - 'model_name_or_path', help='Hugging Face model name or path') - parser.add_argument('--adapter', default=None, help='adapter name or path') - parser.add_argument( - '--prompt-template', - choices=PROMPT_TEMPLATE.keys(), - default=None, - help='Specify a prompt option') - parser.add_argument( - '--bot-name', type=str, default='BOT', help='Name for Bot') - parser.add_argument( - '--with-plugins', - nargs='+', - choices=['calculate', 'solve', 'search'], - help='Specify plugins to use') - parser.add_argument( - '--no-streamer', action='store_true', help='Whether to with streamer') - parser.add_argument('--command-stop-word', default=None, help='Stop key') - parser.add_argument('--answer-stop-word', default=None, help='Stop key') - parser.add_argument( - '--max-new-tokens', - type=int, - default=2048, - help='Maximum number of new tokens allowed in generated text') - parser.add_argument( - '--temperature', - type=float, - default=0.1, - help='The value used to modulate the next token probabilities.') - parser.add_argument( - '--top-k', - type=int, - default=40, - help='The number of highest probability vocabulary tokens to ' - 'keep for top-k-filtering.') - parser.add_argument( - '--top-p', - type=float, - default=0.75, - help='If set to float < 1, only the smallest set of most probable ' - 'tokens with probabilities that add up to top_p or higher are ' - 'kept for generation.') - parser.add_argument( - '--seed', - type=int, - default=0, - help='Random seed for reproducible text generation') - args = parser.parse_args() - return args - - -def get_input(): - """Helper function for getting input from users.""" - sentinel = '' # ends when this string is seen - result = None - while result is None: - print('\ndouble enter to end input >>> ', end='') - try: - result = '\n'.join(iter(input, sentinel)) - except UnicodeDecodeError: - print('Invalid characters detected. 
Please enter again.') - return result - - -def main(): - args = parse_args() - - if args.with_plugins is None: - inner_thoughts_open = False - calculate_open = False - solve_open = False - search_open = False - else: - assert args.prompt_template == 'moss_sft' - from plugins import plugins_api - inner_thoughts_open = True - calculate_open = 'calculate' in args.with_plugins - solve_open = 'solve' in args.with_plugins - search_open = 'search' in args.with_plugins - # pre-import for api and model preparation - if calculate_open: - from plugins import calculate # noqa: F401 - if solve_open: - from plugins import solve # noqa: F401 - if search_open: - from plugins import search # noqa: F401 - - torch.manual_seed(args.seed) - - # build model - quantization_config = BitsAndBytesConfig( - load_in_4bit=True, - load_in_8bit=False, - llm_int8_threshold=6.0, - llm_int8_has_fp16_weight=False, - bnb_4bit_compute_dtype=torch.float16, - bnb_4bit_use_double_quant=True, - bnb_4bit_quant_type='nf4') - model = AutoModelForCausalLM.from_pretrained( - args.model_name_or_path, - quantization_config=quantization_config, - trust_remote_code=True) - tokenizer = AutoTokenizer.from_pretrained( - args.model_name_or_path, trust_remote_code=True) - if args.adapter is not None: - model = PeftModel.from_pretrained(model, args.adapter) - print(f'Load adapter from {args.adapter}') - - Streamer, stop_criteria = get_chat_utils(model) - if args.no_streamer: - Streamer = None - - command_stop_cr, answer_stop_cr = update_stop_criteria( - base=stop_criteria, - tokenizer=tokenizer, - command_stop_word=args.command_stop_word, - answer_stop_word=args.answer_stop_word) - - gen_config = GenerationConfig( - max_new_tokens=args.max_new_tokens, - do_sample=args.temperature > 0, - temperature=args.temperature, - top_p=args.top_p, - top_k=args.top_k, - ) - - n_turn = 0 - inputs = '' - while True: - text = get_input() - - if text == 'exit': - exit(0) - if args.prompt_template is not None: - template = PROMPT_TEMPLATE[args.prompt_template] - if 'INSTRUCTION_START' in template and n_turn == 0: - prompt_text = template['INSTRUCTION_START'].format( - input=text, round=n_turn + 1, bot_name=args.bot_name) - else: - prompt_text = template['INSTRUCTION'].format( - input=text, round=n_turn + 1, bot_name=args.bot_name) - if args.prompt_template == 'moss_sft': - if not inner_thoughts_open: - prompt_text.replace('- Inner thoughts: enabled.', - '- Inner thoughts: disabled.') - if not calculate_open: - prompt_text.replace( - '- Calculator: enabled. API: Calculate(expression)', - '- Calculator: disabled.') - if not solve_open: - prompt_text.replace( - '- Equation solver: enabled. API: Solve(equation)', - '- Equation solver: disabled.') - if not search_open: - prompt_text.replace( - '- Web search: enabled. 
API: Search(query)', - '- Web search: disabled.') - - inputs += prompt_text - else: - inputs += text - ids = tokenizer.encode(inputs, return_tensors='pt') - streamer = Streamer(tokenizer) if Streamer is not None else None - if args.with_plugins is not None: - generate_output = model.generate( - inputs=ids.cuda(), - generation_config=gen_config, - streamer=streamer, - stopping_criteria=command_stop_cr).cpu() - generate_output_text = tokenizer.decode( - generate_output[0][len(ids[0]):]) - if streamer is None: - end = '' if generate_output_text[-1] == '\n' else '\n' - print(generate_output_text, end=end) - pattern = r'<\|Commands\|>:(.*?)' - command_text = ', '.join(re.findall(pattern, generate_output_text)) - extent_text = plugins_api( - command_text, - calculate_open=calculate_open, - solve_open=solve_open, - search_open=search_open) - end = '' if extent_text[-1] == '\n' else '\n' - print(extent_text, end=end) - extent_text_ids = tokenizer.encode( - extent_text, return_tensors='pt', add_special_tokens=False) - new_ids = torch.cat((generate_output, extent_text_ids), dim=1) - new_streamer = Streamer( - tokenizer) if Streamer is not None else None - generate_output = model.generate( - inputs=new_ids.cuda(), - generation_config=gen_config, - streamer=new_streamer, - stopping_criteria=answer_stop_cr) - if streamer is None: - output_text = tokenizer.decode( - generate_output[0][len(new_ids[0]):]) - end = '' if output_text[-1] == '\n' else '\n' - print(output_text, end=end) - else: - generate_output = model.generate( - inputs=ids.cuda(), - generation_config=gen_config, - streamer=streamer, - stopping_criteria=answer_stop_cr) - if streamer is None: - output_text = tokenizer.decode( - generate_output[0][len(ids[0]):]) - end = '' if output_text[-1] == '\n' else '\n' - print(output_text, end=end) - inputs = tokenizer.decode(generate_output[0]) - n_turn += 1 - if len(generate_output[0]) >= args.max_new_tokens: - print('Remove the memory of history responses, since ' - f'it exceeds the length limitation {args.max_new_tokens}.') - n_turn = 0 - inputs = '' - - -if __name__ == '__main__': - main() diff --git a/xtuner/tools/model_converters/adapter_pth2hf.py b/xtuner/tools/model_converters/adapter_pth2hf.py deleted file mode 100644 index 80ae2e209..000000000 --- a/xtuner/tools/model_converters/adapter_pth2hf.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -import os -import shutil - -import torch -from mmengine.config import Config, DictAction -from mmengine.utils import mkdir_or_exist - -from xtuner.configs import cfgs_name_path -from xtuner.registry import BUILDER - - -def parse_args(): - parser = argparse.ArgumentParser( - description='Convert the pth adapter to HuggingFace adapter') - parser.add_argument( - 'config', - help='config file name or path. Note: Please use the original ' - 'configs, instead of the automatically saved log configs.') - parser.add_argument('adapter_checkpoint', help='adapter checkpoint file') - parser.add_argument( - 'save_dir', help='the directory to save the checkpoint') - parser.add_argument( - '--is-deepspeed', - action='store_true', - help='whether the adapter is saved from deepspeed') - parser.add_argument( - '--cfg-options', - nargs='+', - action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file. If the value to ' - 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' - 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' - 'Note that the quotation marks are necessary and that no white space ' - 'is allowed.') - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - - # parse config - if not os.path.isfile(args.config): - try: - args.config = cfgs_name_path[args.config] - except KeyError: - raise FileNotFoundError(f'Cannot find {args.config}') - - # load config - cfg = Config.fromfile(args.config) - if args.cfg_options is not None: - cfg.merge_from_dict(args.cfg_options) - - # load on cpu - cfg.model.llm.device_map = 'cpu' - if cfg.model.llm.get('quantization_config'): - cfg.model.llm.quantization_config.\ - llm_int8_enable_fp32_cpu_offload = True - - model = BUILDER.build(cfg.model) - - adapter_checkpoint = torch.load( - args.adapter_checkpoint, map_location='cpu') - state_dict_key = 'module' if args.is_deepspeed else 'state_dict' - model.load_state_dict(adapter_checkpoint[state_dict_key], strict=False) - print(f'Load adapter from {args.adapter_checkpoint}') - - mkdir_or_exist(args.save_dir) - model.llm.save_pretrained(args.save_dir) - shutil.copyfile(args.config, os.path.join(args.save_dir, - 'xtuner_config.py')) - print(f'Save to {args.save_dir}') - - -if __name__ == '__main__': - main() diff --git a/xtuner/tools/model_converters/merge_adapter_hf.py b/xtuner/tools/model_converters/merge.py similarity index 87% rename from xtuner/tools/model_converters/merge_adapter_hf.py rename to xtuner/tools/model_converters/merge.py index 2de6bc23a..169cea620 100644 --- a/xtuner/tools/model_converters/merge_adapter_hf.py +++ b/xtuner/tools/model_converters/merge.py @@ -13,7 +13,12 @@ def parse_args(): parser.add_argument('adapter_name_or_path', help='adapter name or path') parser.add_argument( 'save_dir', help='the directory to save the merged model') - parser.add_argument('--max-shard-size', type=str, default='2GB') + parser.add_argument( + '--max-shard-size', + type=str, + default='2GB', + help='Only applicable for LLM. The maximum size for ' + 'each sharded checkpoint.') args = parser.parse_args() return args diff --git a/xtuner/tools/model_converters/merge_adapter.py b/xtuner/tools/model_converters/merge_adapter.py deleted file mode 100644 index 7383f23dc..000000000 --- a/xtuner/tools/model_converters/merge_adapter.py +++ /dev/null @@ -1,79 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -import os - -import torch -from mmengine.config import Config, DictAction - -from xtuner.configs import cfgs_name_path -from xtuner.registry import BUILDER - - -def parse_args(): - parser = argparse.ArgumentParser(description='Merge a pth adapter to LLM') - parser.add_argument( - 'config', - help='config file name or path. Note: Please use the original ' - 'configs, instead of the automatically saved log configs.') - parser.add_argument('adapter_checkpoint', help='adapter checkpoint file') - parser.add_argument( - 'save_dir', help='the directory to save the merged model') - parser.add_argument('--max-shard-size', type=str, default='2GB') - parser.add_argument( - '--is-deepspeed', - action='store_true', - help='whether the adapter is saved from deepspeed') - parser.add_argument( - '--cfg-options', - nargs='+', - action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file. If the value to ' - 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' - 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' - 'Note that the quotation marks are necessary and that no white space ' - 'is allowed.') - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - - # parse config - if not os.path.isfile(args.config): - try: - args.config = cfgs_name_path[args.config] - except KeyError: - raise FileNotFoundError(f'Cannot find {args.config}') - - # load config - cfg = Config.fromfile(args.config) - if args.cfg_options is not None: - cfg.merge_from_dict(args.cfg_options) - - # load on cpu, with non-quantized - cfg.model.llm.device_map = 'cpu' - cfg.model.llm.quantization_config = None - cfg.model.llm.low_cpu_mem_usage = True - torch_dtype = cfg.model.llm.get('torch_dtype', torch.float16) - model = BUILDER.build(cfg.model) - tokenizer = BUILDER.build(cfg.tokenizer) - adapter_checkpoint = torch.load( - args.adapter_checkpoint, map_location='cpu') - state_dict_key = 'module' if args.is_deepspeed else 'state_dict' - model.load_state_dict(adapter_checkpoint[state_dict_key], strict=False) - print(f'Load adapter from {args.adapter_checkpoint}') - - model = model.llm - model_merged = model.merge_and_unload() - for param in model.parameters(): - param.data = param.data.to(torch_dtype) - model_merged.save_pretrained( - args.save_dir, max_shard_size=args.max_shard_size) - tokenizer.save_pretrained(args.save_dir) - print(f'Save to {args.save_dir}') - - -if __name__ == '__main__': - main() diff --git a/xtuner/tools/model_converters/pth_to_hf.py b/xtuner/tools/model_converters/pth_to_hf.py new file mode 100644 index 000000000..ccf15861b --- /dev/null +++ b/xtuner/tools/model_converters/pth_to_hf.py @@ -0,0 +1,108 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import shutil + +import torch +from mmengine.config import Config, DictAction + +from xtuner.configs import cfgs_name_path +from xtuner.registry import BUILDER + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert the pth model to HuggingFace model') + parser.add_argument( + 'config', + help='config file name or path. Note: Please use the original ' + 'configs, instead of the automatically saved log configs.') + parser.add_argument('pth_model', help='pth model file') + parser.add_argument( + 'save_dir', help='the directory to save HuggingFace model') + parser.add_argument( + '--fp32', + action='store_true', + help='Save as fp32. If not set, fp16 will be used by default.') + parser.add_argument( + '--max-shard-size', + type=str, + default='2GB', + help='Only applicable for LLM. The maximum size for ' + 'each sharded checkpoint.') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def guess_load_checkpoint(pth_model): + if os.path.isfile(pth_model): + state_dict = torch.load(pth_model, map_location='cpu') + if 'state_dict' in state_dict: + state_dict = state_dict['state_dict'] + elif os.path.isdir(pth_model): + try: + from deepspeed.utils.zero_to_fp32 import \ + get_fp32_state_dict_from_zero_checkpoint + except ImportError: + raise ImportError( + 'The provided PTH model appears to be a DeepSpeed checkpoint. 
'
+                'However, the DeepSpeed library is not detected in the '
+                'current environment. This suggests that DeepSpeed may not '
+                'be installed or is incorrectly configured. Please verify '
+                'your setup.')
+        state_dict = get_fp32_state_dict_from_zero_checkpoint(
+            os.path.dirname(pth_model), os.path.basename(pth_model))
+    else:
+        raise FileNotFoundError(f'Cannot find {pth_model}')
+    return state_dict
+
+
+def main():
+    args = parse_args()
+
+    # parse config
+    if not os.path.isfile(args.config):
+        try:
+            args.config = cfgs_name_path[args.config]
+        except KeyError:
+            raise FileNotFoundError(f'Cannot find {args.config}')
+
+    # load config
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    model = BUILDER.build(cfg.model)
+
+    state_dict = guess_load_checkpoint(args.pth_model)
+    model.load_state_dict(state_dict, strict=False)
+    print(f'Load PTH model from {args.pth_model}')
+
+    if not args.fp32:
+        print('Convert weights to float16')
+        model.llm.half()
+
+    print(f'Saving HuggingFace model to {args.save_dir}')
+    model.llm.save_pretrained(
+        args.save_dir, max_shard_size=args.max_shard_size)
+    if 'PeftModel' not in model.llm.__class__.__name__:
+        print(f'Saving HuggingFace tokenizer to {args.save_dir}')
+        tokenizer = BUILDER.build(cfg.tokenizer)
+        tokenizer.save_pretrained(args.save_dir)
+    shutil.copyfile(args.config, os.path.join(args.save_dir,
+                                              'xtuner_config.py'))
+    print('All done!')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/xtuner/tools/model_converters/split_hf_llm.py b/xtuner/tools/model_converters/split.py
similarity index 100%
rename from xtuner/tools/model_converters/split_hf_llm.py
rename to xtuner/tools/model_converters/split.py
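
As a quick usage reference for the new converter: the `guess_load_checkpoint` helper added in `pth_to_hf.py` accepts either a plain `.pth` file or a DeepSpeed checkpoint directory saved by `xtuner train`. A minimal sketch (the work-dir path below is illustrative, not from this PR):

```python
# Load a trained checkpoint with the helper added in pth_to_hf.py.
# Pass either a plain .pth file or a DeepSpeed checkpoint directory.
from xtuner.tools.model_converters.pth_to_hf import guess_load_checkpoint

state_dict = guess_load_checkpoint(
    'work_dirs/internlm_7b_qlora_oasst1_e3/epoch_1.pth')  # illustrative path
print(f'Loaded {len(state_dict)} tensors')
```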