From d075d9961f611129a396a6c11d212c917b371103 Mon Sep 17 00:00:00 2001
From: Xianyan Jia
Date: Wed, 28 Aug 2024 19:08:46 +0800
Subject: [PATCH] refine docs and fix link (#38)

* refine docs and fix link
---
 README_CN.md                                 |  4 +--
 docs/en/chatlearn.md                         |  6 ++--
 docs/en/conf.py                              | 32 +++++++++--------
 docs/en/index.rst                            | 17 +---------
 docs/en/programming/vllm.md                  | 12 +++----
 docs/en/tutorial/ems.md                      |  2 +-
 docs/en/tutorial/run.md                      |  3 --
 docs/en/tutorial/tutorial_llama2.md          | 26 +++++++-------
 docs/zh/chatlearn.md                         |  6 ++--
 docs/zh/conf.py                              | 30 +++++++++-------
 docs/zh/index.rst                            | 17 ++--------
 docs/zh/programming/vllm.md                  | 12 +++----
 docs/zh/tutorial/ems.md                      |  2 +-
 docs/zh/tutorial/run.md                      |  2 --
 docs/zh/tutorial/tutorial_llama2.md          | 34 +++++++++----------
 .../scripts/convert_hf_to_megatron.sh        |  2 +-
 .../megatron/scripts/train_reward_llama.sh   | 10 +++---
 examples/megatron/scripts/train_sft_llama.sh | 10 +++---
 18 files changed, 100 insertions(+), 127 deletions(-)

diff --git a/README_CN.md b/README_CN.md
index 3a9bbd7e..2510f87a 100644
--- a/README_CN.md
+++ b/README_CN.md
@@ -1,5 +1,5 @@
-[![docs](https://img.shields.io/badge/docs-latest-brightgreen.svg)](https://chatlearn.readthedocs.io/zh/latest/)
+[![docs](https://img.shields.io/badge/docs-latest-brightgreen.svg)](https://chatlearn.readthedocs.io/zh-cn/latest/)
 [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://github.com/alibaba/ChatLearn/blob/main/LICENSE)

@@ -35,7 +35,7 @@ ChatLearn的特点如下: # 快速开始 -请参考 [文档](https://chatlearn.readthedocs.io/zh/latest/) 快速开始. +请参考 [文档](https://chatlearn.readthedocs.io/zh-cn/latest/) 快速开始. 1. [环境和代码准备](docs/zh/installation.md) 2. [基于 LLaMA/LLaMA2 模型的端到端训练教程](docs/zh/tutorial/tutorial_llama2.md) diff --git a/docs/en/chatlearn.md b/docs/en/chatlearn.md index 0a8b71fb..8803f502 100644 --- a/docs/en/chatlearn.md +++ b/docs/en/chatlearn.md @@ -1,6 +1,6 @@ -# ChatLearn +# ChatLearn: A flexible and efficient training framework for large-scale alignment -ChatLearn is an efficient training framework that supports large-scale alignment. It aims to provide a flexible and user-friendly platform for alignment training based on Large Language Models (LLMs) such as ChatGPT. +ChatLearn aims to provide a flexible and user-friendly platform for alignment training based on Large Language Models (LLMs) such as ChatGPT. ## Introduction @@ -42,7 +42,7 @@ By providing a comprehensive and efficient framework, ChatLearn empowers researc ## Quick Start -Please refer to the [Documentation](https://chatlearn.readthedocs.io/zh/latest/) for a quick start guide. +Please refer to the [Documentation](https://chatlearn.readthedocs.io/en/latest/) for a quick start guide. 1. [Environment and Code Setup](installation.md) 2. [End-to-End Training Tutorial with Llama/Llama2 Model](tutorial/tutorial_llama2.md) diff --git a/docs/en/conf.py b/docs/en/conf.py index 6b0fd5b4..722b1a98 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -25,21 +25,23 @@ sys.path.insert(0, os.path.abspath("../../")) from unittest import mock - -# 使用unittest.mock来mock模块 -imports = ['torch', 'cupy.cuda', 'pynvml', 'ray', 'ray.util', 'ray.util.collective.collective_group.nccl_util', - 'ray.util.scheduling_strategies', 'pynvml', 'ray.util.state', 'ray._private', 'ray._private.utils', - 'ray._private.ray_logging', 'ray._private.worker', 'ray.util.collective', 'ray.util.collective.collective_group', - "ray.util.collective.collective_group.base_collective_group", 'ray.util.collective.collective_group.nccl_collective_group', - 'torch.utils.data', 'torch._utils', 'transformers', 'transformers.integrations', 'transformers.trainer', 'deepspeed', - 'deepspeed.ops.adam', 'deepspeed.runtime.zero.partition_parameters', 'torch.distributed', 'torch.nn', 'torch.nn.utils.rnn', 'ray.util.queue', - 'ray.experimental.state.api', 'torch.cuda', 'ray.util.placement_group', "cupy.cuda.nccl", 'tqdm', 'numpy'] - -for key in imports: - sys.modules[key] = mock.MagicMock() - -import chatlearn -from chatlearn.utils import arguments +try: + import chatlearn + from chatlearn.utils import arguments +except ImportError: + imports = ['torch', 'cupy.cuda', 'pynvml', 'ray', 'ray.util', 'ray.util.collective.collective_group.nccl_util', + 'ray.util.scheduling_strategies', 'pynvml', 'ray.util.state', 'ray._private', 'ray._private.utils', + 'ray._private.ray_logging', 'ray._private.worker', 'ray.util.collective', 'ray.util.collective.collective_group', + "ray.util.collective.collective_group.base_collective_group", 'ray.util.collective.collective_group.nccl_collective_group', + 'torch.utils.data', 'torch._utils', 'transformers', 'transformers.integrations', 'transformers.trainer', 'deepspeed', + 'deepspeed.ops.adam', 'deepspeed.runtime.zero.partition_parameters', 'torch.distributed', 'torch.nn', 'torch.nn.utils.rnn', 'ray.util.queue', + 'ray.experimental.state.api', 'torch.cuda', 'ray.util.placement_group', "cupy.cuda.nccl", 'tqdm', 'numpy'] + + for key in imports: + sys.modules[key] = mock.MagicMock() + + import 
chatlearn + from chatlearn.utils import arguments from importlib.machinery import SourceFileLoader version = SourceFileLoader("chatlearn.version", "../../chatlearn/utils/version.py") \ diff --git a/docs/en/index.rst b/docs/en/index.rst index d75ff869..bf0721a1 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -4,12 +4,10 @@ ChatLearn Documentation .. toctree:: :maxdepth: 1 - :caption: ChatLearn: An Efficient Training Framework for Large-Scale Alignment + :caption: Introduction chatlearn -| -| .. toctree:: :maxdepth: 1 @@ -17,9 +15,6 @@ ChatLearn Documentation installation -| -| - .. toctree:: :maxdepth: 1 :caption: Tutorial @@ -34,10 +29,6 @@ ChatLearn Documentation tutorial/ems tutorial/profile -| -| - - .. toctree:: :maxdepth: 1 :caption: Programming @@ -55,14 +46,8 @@ ChatLearn Documentation api/index -| -| - .. toctree:: :maxdepth: 1 :caption: FAQ faq - -| -| \ No newline at end of file diff --git a/docs/en/programming/vllm.md b/docs/en/programming/vllm.md index 20aa941d..bf1285ea 100644 --- a/docs/en/programming/vllm.md +++ b/docs/en/programming/vllm.md @@ -6,7 +6,7 @@ For now, we enable vLLM to accelerate policy generation. ## Model Definition -Similar to inheriting `MegatronModule` for implementing [PolicyInference Model](../../../examples/megatron/models/old_policy_inference.py), the vLLM backend can be enabled by inheriting `VLLMModule` class and implementing the following key modules: +Similar to inheriting `MegatronModule` for implementing [PolicyInference Model](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/old_policy_inference.py), the vLLM backend can be enabled by inheriting `VLLMModule` class and implementing the following key modules: - model_provider: model definition function. - setup: call model_provider to define model. Optionly, call `load_checkpoint` or others. - build_dataset: Preprocess train/eval dataset with vLLM tokenizer. @@ -48,9 +48,9 @@ class VLLMPolicyInference(VLLMModule): pass ``` -You can refer to[vllm_policy_inference.py](../../../examples/megatron/models/vllm_policy_inference.py), in which build_dataset/_add_request/forward_step/decode_internal clarified as following: +You can refer to[vllm_policy_inference.py](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/vllm_policy_inference.py), in which build_dataset/_add_request/forward_step/decode_internal clarified as following: -- build_dataset: Use `tokenizer`, you only need to return prompt_ids and prompt string. In `build_dataset`, [VLLMPromptPipeline](../../../examples/megatron/data/prompt_dataset.py#141) shows as following: +- build_dataset: Use `tokenizer`, you only need to return prompt_ids and prompt string. In `build_dataset`, [VLLMPromptPipeline](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/data/prompt_dataset.py#141) shows as following: ```python class VLLMPromptPipeline(PromptPipeline): def __init__(self, prompts: List[str], max_prompt_length: int, tokenizer=None): @@ -108,7 +108,7 @@ class VLLMPolicyInference(VLLMModule): return self._forward_step(data, iteration, eval_mode=False) ``` -- decode_internal: Refer to [examples](../../../examples/megatron/models/vllm_policy_inference.py#L119) for more details. 
Format of param `batched_outputs` is List[RequestOutput], in which [RequestOutput](https://github.com/vllm-project/vllm/blob/v0.5.1/vllm/outputs.py#L67)includes the following key attributes: +- decode_internal: Refer to [examples](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/vllm_policy_inference.py#L119) for more details. Format of param `batched_outputs` is List[RequestOutput], in which [RequestOutput](https://github.com/vllm-project/vllm/blob/v0.5.1/vllm/outputs.py#L67)includes the following key attributes: | Attibute |Type| Comment | |:------:|:-----:|:-----:| @@ -140,7 +140,7 @@ policy: ... ``` -Or you can refer to [llama2 model yaml](../../../examples/megatron/configs/llama2/vllm_rlhf.yaml). +Or you can refer to [llama2 model yaml](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/vllm_rlhf.yaml). ## hyperparameter configuration yaml @@ -186,4 +186,4 @@ Hyperparameter for vLLM can be divied into 5 parts: - Others: `includes` specifies model structure. -You can refer to [vLLM Hyperparameter Configuration](../../../examples/megatron/configs/llama2/vllm_policy_inference.yaml) for details. +You can refer to [vLLM Hyperparameter Configuration](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/vllm_policy_inference.yaml) for details. diff --git a/docs/en/tutorial/ems.md b/docs/en/tutorial/ems.md index 7deeff21..da7b8d9c 100644 --- a/docs/en/tutorial/ems.md +++ b/docs/en/tutorial/ems.md @@ -26,4 +26,4 @@ Alternatively, it can also be configured in the training script using environmen - PPO policy model: `export free_memory_ppo_policy=True` - PPO value model: `export free_memory_ppo_value=True` -A complete example can be found in the [llama2 configuration](../../../examples/megatron/configs/llama2/rlhf.yaml). \ No newline at end of file +A complete example can be found in the [llama2 configuration](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/rlhf.yaml). \ No newline at end of file diff --git a/docs/en/tutorial/run.md b/docs/en/tutorial/run.md index c6f33bc2..ccb9ba5b 100644 --- a/docs/en/tutorial/run.md +++ b/docs/en/tutorial/run.md @@ -15,9 +15,6 @@ Select the job type as `PyTorch` and paste the command into the `Execution Comma -For RLHF, DPO, OnlineDPO, GRPO training task, you need set the advanced setting as `customPortList=30000-30050,createSvcForAllWorkers=true`. - - ## Non-PAI-DLC environment If you want to submit distributed training in a non-PAI-DLC environment, diff --git a/docs/en/tutorial/tutorial_llama2.md b/docs/en/tutorial/tutorial_llama2.md index 8ff6c262..766735f9 100644 --- a/docs/en/tutorial/tutorial_llama2.md +++ b/docs/en/tutorial/tutorial_llama2.md @@ -1,6 +1,6 @@ # End-to-end Training Tutorial with Llama Model -This document provides instructions for end-to-end training using the ChatLearn, Megatron-LM framework, and the Llama/Llama2 model. ChatLearn supports three training policies as follows: +This document provides instructions for end-to-end training using the ChatLearn, Megatron-LM and vLLM framework, and the Llama/Llama2 model. ChatLearn supports three training policies as follows: 1. RLHF(Reinforcement Learning from Human Feedback): which includes three stages of training: SFT, Reward, and RLHF training. 2. Direct Preference Optimization(DPO): which includes two stages of training: SFT and DPO training. 3. OnlineDPO/GRPO: which fall in between RLHF and DPO, includes three stages of training: SFT, Reward, and DPO training. 
@@ -59,14 +59,15 @@ bash scripts/convert_hf_to_megatron.sh ### Start SFT Training The script below is an example of SFT training. The `DATASET_PATH` is the path to the SFT training set, such as `$DATASET_ROOT/sft/train.jsonl`. -The `MODEL_SIZE` is an environment variable specified in the script to indicate the size of the model, which can be `llama2-7B`, `llama2-13B`, or `llama2-70B`. +The `model_size` is an environment variable specified in the script to indicate the size of the model, which can be `llama2-7B`, `llama2-13B`, or `llama2-70B`. ```bash export CHATLEARN=path-to-chatlearn export MEGATRON=path-to-megatron-lm cd ${CHATLEARN}/examples/megatron/ -MODEL_SIZE=$MODEL_SIZE \ +export model_size=llama2-7B + LOAD_PATH=$MEGATRON_LLAMA2_CKPT_PATH \ TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \ DATASET_PATH=$DATASET_ROOT/sft/ \ @@ -74,9 +75,9 @@ bash scripts/train_sft_llama.sh ``` The training logs and the completed models will be stored in `${CHATLEARN}/output/sft` by default. -For specific definitions, please refer to the script `${CHATLEARN}/2024-08-21/rlhf/examples/megatron/scripts/train_sft_llama.sh`. +For specific definitions, please refer to the script `${CHATLEARN}/examples/megatron/scripts/train_sft_llama.sh`. -In our training script, the resource requirements (assuming the resources are A100-80GB/A800-80GB/H800-80GB GPUs) are as follows: +In our training script, the resource requirements (assuming the resources are A100-80GB/A800-80GB GPUs) are as follows: 1. llama2-7B SFT: 8 GPUs 2. llama2-13B SFT: 8 GPUs 3. llama2-70B SFT: 4*8 GPUs @@ -97,7 +98,7 @@ Based on InstructGPT[1], the Reward model training is initialized with the SFT m ```bash export CHATLEARN=path-to-chatlearn -export MEGATRON=path-to-megatron-lm-extension +export MEGATRON=path-to-megatron-lm cd ${CHATLEARN}/examples/megatron/ LOAD_PATH=path-to-sft-ckpt \ @@ -128,7 +129,7 @@ In this example, the user needs to set `POLICY_LOAD` to the checkpoint path gene The Policy and Reference models will be initialized with the SFT checkpoint. `REWARD_LOAD` should be set to the checkpoint path generated by the Reward training, and the user can specify the iteration number for the loaded checkpoint. The Reward and Value models will be initialized with the weights of the Reward model. -`TOKENIZER_MODEL` should be set to the folder path where the `tokenizer.model` for LlamaTokenizer is located. +`TOKENIZER_MODEL` should be set to the folder path where the `tokenizer.model` for Llama2Tokenizer is located. ```bash export CHATLEARN=path-to-chatlearn @@ -140,7 +141,7 @@ cd ${CHATLEARN}/examples/megatron/ export model_size=llama2-7B POLICY_LOAD=path-to-sft-ckpt \ -REWARD_LOAD=path-to-trained-rm-checkpoint \ +REWARD_LOAD=path-to-rm-ckpt \ REWARD_LOAD_ITERATION=1000 \ TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \ bash run_scripts/train_rlhf_llama.sh @@ -160,7 +161,7 @@ cd ${CHATLEARN}/examples/megatron/ export model_size=llama2-7B POLICY_LOAD=path-to-sft-ckpt \ -REWARD_LOAD=path-to-trained-rm-checkpoint \ +REWARD_LOAD=path-to-rm-ckpt \ REWARD_LOAD_ITERATION=1000 \ TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \ bash scripts/train_online_dpo_llama.sh @@ -172,7 +173,7 @@ bash scripts/train_online_dpo_llama.sh Here is a training script for Llama2-7B Policy and 7B Reward models. In this example, the user needs to set `POLICY_LOAD` to the checkpoint path generated by SFT. The Policy and Reference models will be initialized with the SFT checkpoint. -`TOKENIZER_MODEL` should be set to the folder path where the `tokenizer.model` for LlamaTokenizer is located. 
+`TOKENIZER_MODEL` should be set to the folder path where the `tokenizer.model` for Llama2Tokenizer is located. ```bash export CHATLEARN=path-to-chatlearn @@ -201,7 +202,7 @@ cd ${CHATLEARN}/examples/megatron/ export model_size=llama2-7B POLICY_LOAD=path-to-sft-ckpt \ -REWARD_LOAD=path-to-trained-rm-checkpoint \ +REWARD_LOAD=path-to-rm-ckpt \ REWARD_LOAD_ITERATION=1000 \ TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \ bash scripts/train_grpo_math_llama.sh @@ -212,7 +213,7 @@ bash scripts/train_grpo_math_llama.sh If you need to train a llama2-13B / llama2-70B model, simply change `export model_size=llama2-7B` with `export model_size=llama2-13B` / `export model_size=llama2-70B`. You can also modify the model configuration and other parameters according to your needs. -In our training script, the resource requirements (assuming the resources are A100-80GB / A800-80GB / H800-80GB GPUs) are as follows: +In our training script, the resource requirements (assuming the resources are A100-80GB / A800-80GB GPUs) are as follows: 1. llama2-7B RLHF: 8 GPUs 2. llama2-13B RLHF: 2*8 GPUs @@ -220,7 +221,6 @@ In our training script, the resource requirements (assuming the resources are A1 For the environment variables and configurations required for distributed execution, please refer to [Distributed Execution](run.md). -Note that for RLHF tasks, if you are running on PAI DLC, you need to fill in the advanced configuration `customPortList=30000-30050,createSvcForAllWorkers=true`. ### Evaluation diff --git a/docs/zh/chatlearn.md b/docs/zh/chatlearn.md index 86fb8994..ef1af744 100644 --- a/docs/zh/chatlearn.md +++ b/docs/zh/chatlearn.md @@ -1,6 +1,4 @@ -# ChatLearn - -ChatLearn 是一个灵活、易用、高效的大规模 Alignment 训练框架。 +# ChatLearn: 灵活、易用、高效的大规模 Alignmant 训练框架 ## 概述 @@ -38,7 +36,7 @@ ChatGPT 是由 OpenAI 开发的基于大型语言模型 (Large Language Model, L ## 快速开始 -请参考 [文档](https://chatlearn.readthedocs.io/zh/latest/) 快速开始. +请参考 [文档](https://chatlearn.readthedocs.io/zh-cn/latest/) 快速开始. 1. [环境和代码准备](installation.md) 2. 
[基于 Llama/Llama2 模型的端到端训练教程](tutorial/tutorial_llama2.md) diff --git a/docs/zh/conf.py b/docs/zh/conf.py index df7be765..99980359 100644 --- a/docs/zh/conf.py +++ b/docs/zh/conf.py @@ -27,19 +27,23 @@ from unittest import mock # 使用unittest.mock来mock模块 -imports = ['torch', 'cupy.cuda', 'pynvml', 'ray', 'ray.util', 'ray.util.collective.collective_group.nccl_util', - 'ray.util.scheduling_strategies', 'pynvml', 'ray.util.state', 'ray._private', 'ray._private.utils', - 'ray._private.ray_logging', 'ray._private.worker', 'ray.util.collective', 'ray.util.collective.collective_group', - "ray.util.collective.collective_group.base_collective_group", 'ray.util.collective.collective_group.nccl_collective_group', - 'torch.utils.data', 'torch._utils', 'transformers', 'transformers.integrations', 'transformers.trainer', 'deepspeed', - 'deepspeed.ops.adam', 'deepspeed.runtime.zero.partition_parameters', 'torch.distributed', 'torch.nn', 'torch.nn.utils.rnn', 'ray.util.queue', - 'ray.experimental.state.api', 'torch.cuda', 'ray.util.placement_group', "cupy.cuda.nccl", 'tqdm', 'numpy'] - -for key in imports: - sys.modules[key] = mock.MagicMock() - -import chatlearn -from chatlearn.utils import arguments +try: + import chatlearn + from chatlearn.utils import arguments +except ImportError: + imports = ['torch', 'cupy.cuda', 'pynvml', 'ray', 'ray.util', 'ray.util.collective.collective_group.nccl_util', + 'ray.util.scheduling_strategies', 'pynvml', 'ray.util.state', 'ray._private', 'ray._private.utils', + 'ray._private.ray_logging', 'ray._private.worker', 'ray.util.collective', 'ray.util.collective.collective_group', + "ray.util.collective.collective_group.base_collective_group", 'ray.util.collective.collective_group.nccl_collective_group', + 'torch.utils.data', 'torch._utils', 'transformers', 'transformers.integrations', 'transformers.trainer', 'deepspeed', + 'deepspeed.ops.adam', 'deepspeed.runtime.zero.partition_parameters', 'torch.distributed', 'torch.nn', 'torch.nn.utils.rnn', 'ray.util.queue', + 'ray.experimental.state.api', 'torch.cuda', 'ray.util.placement_group', "cupy.cuda.nccl", 'tqdm', 'numpy'] + + for key in imports: + sys.modules[key] = mock.MagicMock() + + import chatlearn + from chatlearn.utils import arguments from importlib.machinery import SourceFileLoader version = SourceFileLoader("chatlearn.version", "../../chatlearn/utils/version.py") \ diff --git a/docs/zh/index.rst b/docs/zh/index.rst index d8d20d2b..a1e2c699 100644 --- a/docs/zh/index.rst +++ b/docs/zh/index.rst @@ -1,15 +1,13 @@ -ChatLearn Documentation +ChatLearn 使用文档 ======================= .. toctree:: :maxdepth: 1 - :caption: ChatLearn: 大规模 Alignment 高效训练框架 + :caption: 简介 chatlearn -| -| .. toctree:: :maxdepth: 1 @@ -17,8 +15,6 @@ ChatLearn Documentation installation -| -| .. toctree:: :maxdepth: 1 @@ -34,8 +30,6 @@ ChatLearn Documentation tutorial/ems tutorial/profile -| -| .. toctree:: :maxdepth: 1 @@ -54,14 +48,9 @@ ChatLearn Documentation api/index -| -| .. 
toctree:: :maxdepth: 1 :caption: 常见问题 - faq - -| -| + faq \ No newline at end of file diff --git a/docs/zh/programming/vllm.md b/docs/zh/programming/vllm.md index 22548ee0..b2fec2b2 100644 --- a/docs/zh/programming/vllm.md +++ b/docs/zh/programming/vllm.md @@ -6,7 +6,7 @@ ChatLearn中支持vLLM进行跨机分布式推理,支持vllm和training backen ## 模型定义 -类似于继承`MegatronModule`实现[PolicyInference模型](../../../examples/megatron/models/old_policy_inference.py),PolicyInference模型若想基于vLLM后端完成generation,需要继承`VLLMModule`父类,实现以下关键模块: +类似于继承`MegatronModule`实现[PolicyInference模型](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/old_policy_inference.py),PolicyInference模型若想基于vLLM后端完成generation,需要继承`VLLMModule`父类,实现以下关键模块: - model_provider:模型定义函数。 - setup:调用model_provider定义模型,可根据需要决定是否load_checkpoint等。 - build_dataset:调用vLLM tokenizer处理数据,生成prompt dataset。 @@ -48,9 +48,9 @@ class VLLMPolicyInference(VLLMModule): pass ``` -示例可参考[vllm_policy_inference.py](../../../examples/megatron/models/vllm_policy_inference.py),补充说明build_dataset、_add_request、forward_step、decode_internal如下: +示例可参考[vllm_policy_inference.py](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/vllm_policy_inference.py),补充说明build_dataset、_add_request、forward_step、decode_internal如下: -- build_dataset:调用tokenizer处理只需要返回prompt_ids、prompt str,其中build_dataset的[VLLMPromptPipeline](../../../examples/megatron/data/prompt_dataset.py#141)具体逻辑如下: +- build_dataset:调用tokenizer处理只需要返回prompt_ids、prompt str,其中build_dataset的[VLLMPromptPipeline](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/data/prompt_dataset.py#141)具体逻辑如下: ```python class VLLMPromptPipeline(PromptPipeline): def __init__(self, prompts: List[str], max_prompt_length: int, tokenizer=None): @@ -108,7 +108,7 @@ class VLLMPolicyInference(VLLMModule): return self._forward_step(data, iteration, eval_mode=False) ``` -- decode_internal:可参考[examples](../../../examples/megatron/models/vllm_policy_inference.py#L119)实现。参数batched_outputs格式为List[RequestOutput],其中[RequestOutput](https://github.com/vllm-project/vllm/blob/v0.5.1/vllm/outputs.py#L67)包含以下重要attributes: +- decode_internal:可参考[examples](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/models/vllm_policy_inference.py#L119)实现。参数batched_outputs格式为List[RequestOutput],其中[RequestOutput](https://github.com/vllm-project/vllm/blob/v0.5.1/vllm/outputs.py#L67)包含以下重要attributes: | 属性 |类型| 含义 | |:------:|:-----:|:-----:| @@ -138,7 +138,7 @@ policy: model_config_file: vllm_policy_inference.yaml ... 
``` -也可以参考示例 [llama2模型配置](../../../examples/megatron/configs/llama2/vllm_rlhf.yaml)。 +也可以参考示例 [llama2模型配置](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/vllm_rlhf.yaml)。 ## 超参配置 @@ -182,4 +182,4 @@ vLLM超参可分为五部分: - tokenizer:vLLM tokenizer读取目录,可参考[LLama2-7B-hf](https://huggingface.co/meta-llama/Llama-2-7b) - 其他:includes指定模型结构等其余参数; -可以参考 [vLLM超参配置](../../../examples/megatron/configs/llama2/vllm_policy_inference.yaml)。 +可以参考 [vLLM超参配置](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/vllm_policy_inference.yaml)。 diff --git a/docs/zh/tutorial/ems.md b/docs/zh/tutorial/ems.md index dab68054..4cd552b8 100644 --- a/docs/zh/tutorial/ems.md +++ b/docs/zh/tutorial/ems.md @@ -29,4 +29,4 @@ policy: - ppo_policy 模型:`export free_memory_ppo_policy=True` - ppo_value 模型:`export free_memory_ppo_value=True` -完整示例可以参考 [llama2 配置](../../../examples/megatron/configs/llama2/rlhf.yaml)。 +完整示例可以参考 [llama2 配置](https://github.com/alibaba/ChatLearn/blob/main/examples/megatron/configs/llama2/rlhf.yaml)。 diff --git a/docs/zh/tutorial/run.md b/docs/zh/tutorial/run.md index 05c3da87..776bd917 100644 --- a/docs/zh/tutorial/run.md +++ b/docs/zh/tutorial/run.md @@ -13,8 +13,6 @@ ![image.png](../../images/dlc_2.jpg) -**对于 RLHF/DPO/OnlineDPO/GRPO 训练任务,您需要填写高级配置`customPortList=30000-30050,createSvcForAllWorkers=true`。** - ## 其他环境分布式执行 diff --git a/docs/zh/tutorial/tutorial_llama2.md b/docs/zh/tutorial/tutorial_llama2.md index d71f43de..bfb77c0e 100644 --- a/docs/zh/tutorial/tutorial_llama2.md +++ b/docs/zh/tutorial/tutorial_llama2.md @@ -1,6 +1,6 @@ # 基于 Llama 模型的端到端训练教程 -本文档介绍基于 ChatLearn, Megatron-LM 框架和 Llama/Llama2 模型进行 alignment 的训练流程。支持RLHF、DPO、OnlineDPO、GRPO 多种训练模式: +本文档介绍基于 ChatLearn, Megatron-LM 和 vLLM 框架和 Llama/Llama2 模型进行 alignment 的训练流程。支持RLHF、DPO、OnlineDPO、GRPO 多种训练模式: 1. RLHF(Reinforcement Learning from Human Feedback):包括三阶段的训练(SFT, Reward 和 RLHF 训练); 2. DPO(Direct Preference Optimization):包括两阶段的训练(SFT 和 DPO 训练); 3. OnlineDPO/GRPO:介于 DPO 和 RLHF 之间,使用 Policy + Reward 模型来自动生成数据并进行打分,再进行DPO训练,包括三阶段的训练(SFT, Reward 和 DPO 训练). @@ -44,7 +44,7 @@ SFT 指的是使用有标注的对话数据来微调预训练语言模型的过 export MEGATRON=path-to-megatron-lm export CHATLEARN=path-to-chatlearn -cd ${CHATLEARN}/examples/megatron/sft/ +cd ${CHATLEARN}/examples/megatron/ TP=num_of_tp \ PP=num_of_pp \ @@ -58,23 +58,24 @@ bash scripts/convert_hf_to_megatron.sh ### 开启 SFT 训练 下面的脚本是一个 SFT 的训练样例。其中 `DATASET_PATH` 为 SFT 训练集路径,比如`$DATASET_ROOT/sft/train.jsonl`。 -其中 `MODEL_SIZE` 为脚本中指定模型大小的环境变量,可以为 `llama2-7B`/`llama2-13B`/`llama2-70B`。 +其中 `model_size` 为脚本中指定模型大小的环境变量,可以为 `llama2-7B`/`llama2-13B`/`llama2-70B`。 ```bash export CHATLEARN=path-to-chatlearn export MEGATRON=path-to-megatron-lm -cd ${CHATLEARN}/examples/megatron/sft/ +cd ${CHATLEARN}/examples/megatron/ + +export model_size=llama2-7B -MODEL_SIZE=$MODEL_SIZE \ LOAD_PATH=$MEGATRON_LLAMA2_CKPT_PATH \ TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \ DATASET_PATH=$DATASET_ROOT/sft/ \ -bash scripts/llama2_sft.sh +bash scripts/train_sft_llama.sh ``` -训练 log 和训练完成的模型默认会存放在`${CHATLEARN}/output/sft`中,可以通过 CHECKPOINT_PATH 来指定模型保存路径,具体的定义详见`${CHATLEARN}/examples/megatron/sft/scripts/llama2_sft.sh`脚本。 +训练 log 和训练完成的模型默认会存放在`${CHATLEARN}/output/sft`中,可以通过 CHECKPOINT_PATH 来指定模型保存路径,具体的定义详见`${CHATLEARN}/examples/megatron/scripts/train_sft_llama.sh`脚本。 -在我们的训练脚本里,资源需求 (假设资源为 A100-80GB/A800-80GB/H800-80GB GPU) 如下: +在我们的训练脚本里,资源需求 (假设资源为 A100-80GB/A800-80GB GPU) 如下: 1. llama2-7B SFT: 8 GPU 2. llama2-13B SFT: 8 GPU 3. 
llama2-70B SFT: 4*8 GPU @@ -93,7 +94,7 @@ Reward 模型指的是在 RLHF 中作为人类评价的代理,对模型产生 ```bash export CHATLEARN=path-to-chatlearn -export MEGATRON=path-to-megatron-lm-extension +export MEGATRON=path-to-megatron-lm cd ${CHATLEARN}/examples/megatron/ LOAD_PATH=path-to-sft-ckpt \ @@ -122,7 +123,7 @@ ChatLearn 支持多种 Alignment 训练模式:RLHF、DPO、OnlineDPO、GRP、G 以下是一个 Llama2-7B 的 Policy 和 7B 的 Reward 模型的训练脚本。 在这个例子中,用户需要设置 `POLICY_LOAD` 为 SFT 产出的 checkpoint 路径,Policy 模型和 Reference 模型将以 SFT 的 checkpoint 初始化。 `REWARD_LOAD` 为 Reward 训练产出的 checkpoint 路径,同时,用户可以指定 load checkpoint 对应的 iteration 数。 -Reward 模型和 Value 模型将以 Reward 模型的权重作初始化。`TOKENIZER_MODEL` 为 `LlamaTokenizer` 所需文件 `tokenizer.model` 所在的文件夹路径。 +Reward 模型和 Value 模型将以 Reward 模型的权重作初始化。`TOKENIZER_MODEL` 为 `Llama2Tokenizer` 所需文件 `tokenizer.model` 所在的文件夹路径。 ```bash export CHATLEARN=path-to-chatlearn @@ -134,7 +135,7 @@ cd ${CHATLEARN}/examples/megatron/ export model_size=llama2-7B POLICY_LOAD=path-to-sft-ckpt \ -REWARD_LOAD=path-to-trained-rm-checkpoint \ +REWARD_LOAD=path-to-rm-ckpt \ REWARD_LOAD_ITERATION=1000 \ TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \ bash scripts/train_rlhf_llama.sh @@ -154,7 +155,7 @@ cd ${CHATLEARN}/examples/megatron/ export model_size=llama2-7B POLICY_LOAD=path-to-sft-ckpt \ -REWARD_LOAD=path-to-trained-rm-checkpoint \ +REWARD_LOAD=path-to-rm-ckpt \ REWARD_LOAD_ITERATION=1000 \ TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \ bash scripts/train_online_dpo_llama.sh @@ -163,7 +164,7 @@ bash scripts/train_online_dpo_llama.sh #### DPO 以下是一个 Llama2-7B 的 Policy模型的训练脚本。 在这个例子中,用户需要设置 `POLICY_LOAD` 为 SFT 产出的 checkpoint 路径,Policy 模型和 Reference 模型将以 SFT 的 checkpoint 初始化。 -`TOKENIZER_MODEL` 为 `LlamaTokenizer` 所需文件 `tokenizer.model` 所在的文件夹路径。 +`TOKENIZER_MODEL` 为 `Llama2Tokenizer` 所需文件 `tokenizer.model` 所在的文件夹路径。 ```bash export CHATLEARN=path-to-chatlearn @@ -193,7 +194,7 @@ cd ${CHATLEARN}/examples/megatron/ export model_size=llama2-7B POLICY_LOAD=path-to-sft-ckpt \ -REWARD_LOAD=path-to-trained-rm-checkpoint \ +REWARD_LOAD=path-to-rm-ckpt \ REWARD_LOAD_ITERATION=1000 \ TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \ bash scripts/train_grpo_math_llama.sh @@ -205,13 +206,12 @@ bash scripts/train_grpo_math_llama.sh 如果您需要训练 llama2-13B / llama2-70B 的模型,只需要将上述训练脚本中的 `export model_size=llama2-7B` 替换成 `export model_size=llama2-13B` / `export model_size=llama2-70B`。 您也可以根据自己的需求修改模型配置和其他参数。 -在我们的训练脚本里,资源需求 (假设资源为 A100-80GB/A800-80GB/H800-80GB GPU) 如下: +在我们的训练脚本里,资源需求 (假设资源为 A100-80GB/A800-80GB GPU) 如下: 1. llama2-7B RLHF: 8 GPU 2. llama2-13B RLHF: 2*8 GPU 3. 
llama2-70B RLHF: 4*8 GPU 分布式执行所需的环境变量和配置参考 [分布式执行](run.md)。 -**注意对于 RLHF 任务,如果在 PAI DLC 上运行,您需要填写高级配置`customPortList=30000-30050,createSvcForAllWorkers=true`。** ### 效果评估 @@ -222,7 +222,7 @@ bash scripts/train_grpo_math_llama.sh export CHATLEARN=path-to-chatlearn export MEGATRON=path-to-megatron-lm -cd $CHATLEARN/examples/megatron/alignment +cd $CHATLEARN/examples/megatron/ LOAD_PATH=path-to-megatron-model \ SAVE_PATH=path-to-hf-model \ diff --git a/examples/megatron/scripts/convert_hf_to_megatron.sh b/examples/megatron/scripts/convert_hf_to_megatron.sh index 83b1a240..f02bacfd 100644 --- a/examples/megatron/scripts/convert_hf_to_megatron.sh +++ b/examples/megatron/scripts/convert_hf_to_megatron.sh @@ -17,7 +17,7 @@ megatron=${MEGATRON} load_dir=${LOAD_PATH} save_dir=${SAVE_PATH} tokenizer_model=${TOKENIZER_MODEL} -model_size=${MODEL_SIZE:-llama2-7B} +model_size=${model_size:-llama2-7B} export CUDA_DEVICE_MAX_CONNECTIONS=1 diff --git a/examples/megatron/scripts/train_reward_llama.sh b/examples/megatron/scripts/train_reward_llama.sh index 472eb18e..bf3fb8e4 100644 --- a/examples/megatron/scripts/train_reward_llama.sh +++ b/examples/megatron/scripts/train_reward_llama.sh @@ -24,23 +24,23 @@ DISTRIBUTED_ARGS="--nproc_per_node ${GPUS_PER_NODE} \ --master_addr ${MASTER_ADDR} \ --master_port ${MASTER_PORT}" -[ -z "$MODEL_SIZE" ] && export MODEL_SIZE=llama2-7B +[ -z "$model_size" ] && export model_size=llama2-7B -if [ $MODEL_SIZE = llama2-7B ]; then +if [ $model_size = llama2-7B ]; then NUM_LAYERS=32 HIDDEN_SIZE=4096 NUM_ATTN_HEADS=32 INTERMEDIATE_SIZE=11008 tp=4 pp=1 -elif [ $MODEL_SIZE = llama2-13B ]; then +elif [ $model_size = llama2-13B ]; then NUM_LAYERS=40 HIDDEN_SIZE=5120 NUM_ATTN_HEADS=40 INTERMEDIATE_SIZE=13824 tp=8 pp=1 -elif [ $MODEL_SIZE = llama2-70B ]; then +elif [ $model_size = llama2-70B ]; then NUM_LAYERS=80 HIDDEN_SIZE=8192 NUM_ATTN_HEADS=64 @@ -66,7 +66,7 @@ NNODES=$WORLD_SIZE dp=$(($WORLD_SIZE * $GPUS_PER_NODE / $tp / $pp)) gbs=$(($gbs * $dp)) -[ -z "$CHECKPOINT_PATH" ] && CHECKPOINT_PATH=${CHATLEARN}/output/reward/reward_hh_$(date +%F)_gpt_${MODEL_SIZE}_${NNODES}w${GPUS_PER_NODE}g_tp${tp}_pp${pp}_mb${mb}_seqlen${seq_len} +[ -z "$CHECKPOINT_PATH" ] && CHECKPOINT_PATH=${CHATLEARN}/output/reward/reward_hh_$(date +%F)_gpt_${model_size}_${NNODES}w${GPUS_PER_NODE}g_tp${tp}_pp${pp}_mb${mb}_seqlen${seq_len} MODEL_ARGS=" diff --git a/examples/megatron/scripts/train_sft_llama.sh b/examples/megatron/scripts/train_sft_llama.sh index 2914d922..97333d6d 100644 --- a/examples/megatron/scripts/train_sft_llama.sh +++ b/examples/megatron/scripts/train_sft_llama.sh @@ -23,23 +23,23 @@ DISTRIBUTED_ARGS="--nproc_per_node ${GPUS_PER_NODE} \ export PYTHONPATH=${PYTHONPATH}:${MEGATRON}:${CHATLEARN}/examples/megatron:${CHATLEARN} -[ -z "$MODEL_SIZE" ] && export MODEL_SIZE=llama2-7B +[ -z "$model_size" ] && export model_size=llama2-7B -if [ $MODEL_SIZE = llama2-7B ]; then +if [ $model_size = llama2-7B ]; then NUM_LAYERS=32 HIDDEN_SIZE=4096 NUM_ATTN_HEADS=32 INTERMEDIATE_SIZE=11008 tp=4 pp=1 -elif [ $MODEL_SIZE = llama2-13B ]; then +elif [ $model_size = llama2-13B ]; then NUM_LAYERS=40 HIDDEN_SIZE=5120 NUM_ATTN_HEADS=40 INTERMEDIATE_SIZE=13824 tp=8 pp=1 -elif [ $MODEL_SIZE = llama2-70B ]; then +elif [ $model_size = llama2-70B ]; then NUM_LAYERS=80 HIDDEN_SIZE=8192 NUM_ATTN_HEADS=64 @@ -67,7 +67,7 @@ dp=$(($WORLD_SIZE * $GPUS_PER_NODE / $tp / $pp)) gbs=$(($gbs * $dp)) -[ -z "$CHECKPOINT_PATH" ] && CHECKPOINT_PATH=${CHATLEARN}/output/sft/hh_sft_$(date 
+%F)_gpt_${MODEL_SIZE}_${NNODES}w${GPUS_PER_NODE}g_tp${tp}_pp${pp}_mb${mb}_seqlen${seq_len} +[ -z "$CHECKPOINT_PATH" ] && CHECKPOINT_PATH=${CHATLEARN}/output/sft/hh_sft_$(date +%F)_gpt_${model_size}_${NNODES}w${GPUS_PER_NODE}g_tp${tp}_pp${pp}_mb${mb}_seqlen${seq_len} mkdir -p $CHECKPOINT_PATH
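
Usage sketch: the script changes above standardize on the lowercase `model_size` environment variable, which the training scripts default to `llama2-7B` when it is unset. The minimal SFT launch below is assembled from the tutorial snippets in this patch; all paths are placeholders, not real locations.

```bash
# Minimal sketch of launching SFT after this patch: train_sft_llama.sh now reads
# the lowercase `model_size` variable ([ -z "$model_size" ] && export model_size=llama2-7B).
export CHATLEARN=path-to-chatlearn      # placeholder: local ChatLearn checkout
export MEGATRON=path-to-megatron-lm     # placeholder: local Megatron-LM checkout
cd ${CHATLEARN}/examples/megatron/

export model_size=llama2-7B             # or llama2-13B / llama2-70B

LOAD_PATH=$MEGATRON_LLAMA2_CKPT_PATH \
TOKENIZER_MODEL=$LLAMA2_TOKENIZER_MODEL \
DATASET_PATH=$DATASET_ROOT/sft/ \
bash scripts/train_sft_llama.sh
```

The same `export model_size=...` convention applies to the other scripts touched here, such as `train_reward_llama.sh`, which carries the identical lowercase default.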