[Bugs] Fix bugs caused by sequence parallel when deepspeed is not used. (#752)

* fix sequence parallel bugs when training without deepspeed

* check deepspeed usage when setting sequence_parallel_size > 1
HIT-cwh authored Jun 11, 2024
1 parent 4910476 commit a3e11b9
Showing 2 changed files with 11 additions and 4 deletions.
4 changes: 2 additions & 2 deletions xtuner/parallel/sequence/setup_distributed.py
@@ -59,7 +59,7 @@ def get_sequence_parallel_world_size():
     global _SEQUENCE_PARALLEL_WORLD_SIZE
     if _SEQUENCE_PARALLEL_WORLD_SIZE is not None:
         return _SEQUENCE_PARALLEL_WORLD_SIZE
-    if not dist.is_initialized():
+    if not dist.is_initialized() or (_SEQUENCE_PARALLEL_GROUP is None):
         _SEQUENCE_PARALLEL_WORLD_SIZE = 1
     else:
         _SEQUENCE_PARALLEL_WORLD_SIZE = dist.get_world_size(
@@ -72,7 +72,7 @@ def get_sequence_parallel_rank():
     global _SEQUENCE_PARALLEL_RANK
     if _SEQUENCE_PARALLEL_RANK is not None:
         return _SEQUENCE_PARALLEL_RANK
-    if not dist.is_initialized():
+    if not dist.is_initialized() or (_SEQUENCE_PARALLEL_GROUP is None):
         _SEQUENCE_PARALLEL_RANK = 0
     else:
         _SEQUENCE_PARALLEL_RANK = dist.get_rank(
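For reference, here is a minimal, self-contained sketch of the patched helpers (paraphrasing the diff above; the xtuner setup code that would normally create the group and assign it to `_SEQUENCE_PARALLEL_GROUP` is omitted). The extra `_SEQUENCE_PARALLEL_GROUP is None` guard matters because `dist.get_world_size(group=None)` and `dist.get_rank(group=None)` fall back to the default process group, so a run that initialized torch.distributed without ever setting up sequence parallel (e.g. training without DeepSpeed) would otherwise report the global world size and rank as the sequence-parallel ones.

# Minimal sketch of the patched fallback logic (paraphrased from the diff;
# the function that creates the group and populates _SEQUENCE_PARALLEL_GROUP
# is omitted here).
import torch.distributed as dist

_SEQUENCE_PARALLEL_GROUP = None        # set only when sequence parallel is initialized
_SEQUENCE_PARALLEL_WORLD_SIZE = None   # lazily cached
_SEQUENCE_PARALLEL_RANK = None         # lazily cached


def get_sequence_parallel_world_size():
    global _SEQUENCE_PARALLEL_WORLD_SIZE
    if _SEQUENCE_PARALLEL_WORLD_SIZE is not None:
        return _SEQUENCE_PARALLEL_WORLD_SIZE
    if not dist.is_initialized() or (_SEQUENCE_PARALLEL_GROUP is None):
        # No distributed context, or the sequence-parallel group was never
        # created: behave as if sequence parallel is disabled.
        _SEQUENCE_PARALLEL_WORLD_SIZE = 1
    else:
        _SEQUENCE_PARALLEL_WORLD_SIZE = dist.get_world_size(
            group=_SEQUENCE_PARALLEL_GROUP)
    return _SEQUENCE_PARALLEL_WORLD_SIZE


def get_sequence_parallel_rank():
    global _SEQUENCE_PARALLEL_RANK
    if _SEQUENCE_PARALLEL_RANK is not None:
        return _SEQUENCE_PARALLEL_RANK
    if not dist.is_initialized() or (_SEQUENCE_PARALLEL_GROUP is None):
        _SEQUENCE_PARALLEL_RANK = 0
    else:
        _SEQUENCE_PARALLEL_RANK = dist.get_rank(group=_SEQUENCE_PARALLEL_GROUP)
    return _SEQUENCE_PARALLEL_RANK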
11 changes: 9 additions & 2 deletions xtuner/tools/train.py
@@ -77,7 +77,7 @@ def register_function(cfg_dict):
             register_function(value)


-def check_cfg(cfg):
+def check_cfg(cfg, args):
     if getattr(cfg, 'use_varlen_attn',
                False) and cfg.train_dataloader.batch_size > 1:
         raise NotImplementedError(
@@ -116,6 +116,13 @@ def check_cfg(cfg):
             ' attn_implementation to `flash_attention_2` or do not '
             f'set this attribute. Got `{attn_implementation}` .')

+    if args.deepspeed is None:
+        assert getattr(cfg, 'sequence_parallel_size', 1) == 1, \
+            ('Sequence parallel training without DeepSpeed lacks validation.'
+             'Please use DeepSpeed to optimize the training phase by '
+             '`--deepspeed deepspeed_zero1 (deepspeed_zero2 or '
+             'deepspeed_zero3)`.')
+

 def main():
     args = parse_args()
@@ -137,7 +144,7 @@ def main():
     # change these FunctionType object to str
     register_function(cfg._cfg_dict)

-    check_cfg(cfg)
+    check_cfg(cfg, args)

     if cfg.get('framework', 'mmengine').lower() == 'huggingface':
         # set default training_args
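The train.py change threads the parsed command-line arguments into check_cfg so the config check can reject `sequence_parallel_size > 1` whenever `--deepspeed` was not passed. Below is a small, hypothetical stand-alone illustration (not xtuner code; SimpleNamespace stands in for the argparse namespace and the mmengine Config) of how the added assertion behaves:

# Hypothetical driver illustrating the guard added to check_cfg();
# SimpleNamespace stands in for the parsed args and the config object.
from types import SimpleNamespace


def check_sequence_parallel(cfg, args):
    # Same condition as the assertion added in xtuner/tools/train.py.
    if args.deepspeed is None:
        assert getattr(cfg, 'sequence_parallel_size', 1) == 1, (
            'Sequence parallel training requires DeepSpeed; pass '
            '--deepspeed deepspeed_zero1 / deepspeed_zero2 / deepspeed_zero3.')


cfg = SimpleNamespace(sequence_parallel_size=2)

check_sequence_parallel(cfg, SimpleNamespace(deepspeed='deepspeed_zero2'))  # passes
try:
    check_sequence_parallel(cfg, SimpleNamespace(deepspeed=None))
except AssertionError as exc:
    print(f'rejected: {exc}')  # the patched check_cfg now fails fast like this

With the patched check in place, a config that sets `sequence_parallel_size > 1` is expected to be launched with a DeepSpeed strategy, e.g. `xtuner train CONFIG --deepspeed deepspeed_zero2`.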
