diff --git a/chatlearn/models/vllm_module_v2.py b/chatlearn/models/vllm_module_v2.py
index 4c6ccea..4e36cb1 100644
--- a/chatlearn/models/vllm_module_v2.py
+++ b/chatlearn/models/vllm_module_v2.py
@@ -48,6 +48,7 @@ def __init__(self, *args, **kwargs):
         if 'worker_module_name' in kwargs and 'worker_class_name' in kwargs:
             RayWorkerWrapper.__init__(self, **kwargs) # pylint: disable=non-parent-init-called
         os.environ['VLLM_HOST_IP'] = self.get_address()
 
+        self.tokenizer = None
         self._model = None
         self.set_vllm_pp_layer_partition()
diff --git a/chatlearn/synchronizer/parameter_sync.py b/chatlearn/synchronizer/parameter_sync.py
index 135a560..6daa5a8 100644
--- a/chatlearn/synchronizer/parameter_sync.py
+++ b/chatlearn/synchronizer/parameter_sync.py
@@ -1185,7 +1185,7 @@ def setup_rank_mapping(self):
             and get_args().runtime_args.routed_expert_regrouping_comm_type == ROUTED_EXPERT_REGROUPING_COMM_TYPE.ALLTOALL
         ):
             raise NotImplementedError(
-                "All-to-all routed expert weight is only supported when src TP size * src EP size = dst TP size. "
+                "all-to-all routed expert weight is only supported when src TP size * src EP size = dst TP size. "
                 "Please consider setting `routed_expert_regrouping_comm_type` to allgather or adjusting the model's parallel size."
             )