Tensor parallel distributed strategy without using deepspeed (#321)
* Revert "Tensor parallel  distributed strategy without using deepspeed (#280)"

This reverts commit c6e5f9c.

* Tensor parallel distributed strategy without using deepspeed (huggingface#1121)

Co-authored-by: Kalyan <kkumar@habana.ai>

---------

Change-Id: Ic30c85e697dbd6a51767e21e1c06c9a20120d9f6
Co-authored-by: Kalyan <kkumar@habana.ai>
2 people authored and astachowiczhabana committed Aug 6, 2024
1 parent c0606c8 commit e7375cf
Showing 4 changed files with 41 additions and 0 deletions.
18 changes: 18 additions & 0 deletions examples/text-generation/run_generation.py
@@ -287,6 +287,24 @@ def setup_parser(parser):
action="store_true",
help="Whether to trust the execution of code from datasets/models defined on the Hub. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine.",
)
parser.add_argument(
"--run_partial_dataset",
action="store_true",
help="Run the inference with the dataset only for the specified --n_iterations (default: 5).",
)
parser.add_argument(
"--load_cp",
action="store_true",
help="Whether to load the model from a Hugging Face checkpoint.",
)
parser.add_argument(
"--parallel_strategy",
type=str,
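For quick reference, here is a minimal, self-contained sketch of how the flags introduced in this hunk would parse. The option names and help strings follow the diff above; the parser object itself is a stand-alone illustration, not the actual setup_parser from run_generation.py:

import argparse

parser = argparse.ArgumentParser(description="Illustrative subset of the run_generation.py flags")
parser.add_argument(
    "--parallel_strategy",
    type=str,
    choices=["tp", "none"],
    default="none",
    help="Run multi-card with the specified parallel strategy: 'tp' (tensor parallel) or 'none'.",
)
parser.add_argument(
    "--run_partial_dataset",
    action="store_true",
    help="Run the inference with the dataset only for the specified number of iterations.",
)
parser.add_argument(
    "--load_cp",
    action="store_true",
    help="Whether to load the model from a Hugging Face checkpoint.",
)

# Example usage: parsing only the new tensor-parallel flag.
args = parser.parse_args(["--parallel_strategy", "tp"])
assert args.parallel_strategy == "tp" and args.run_partial_dataset is False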
4 changes: 4 additions & 0 deletions optimum/habana/distributed/tensorparallel.py
@@ -14,6 +14,10 @@
#
# This file has been modified from its original version.
# The original version can be found at https://github.com/foundation-model-stack/foundation-model-stack

import torch
import torch._inductor.ir as ir
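As background for tensorparallel.py, the sketch below illustrates the column-parallel matmul idea behind a tensor-parallel linear layer: the weight matrix is split along its output dimension, each shard computes a partial result, and the per-shard outputs are concatenated (in a real multi-card run that concatenation is an all-gather collective). This is a single-process illustration of the general technique under stated assumptions, not the APIs actually used in this file:

import torch

torch.manual_seed(0)

# Reference (unsharded) linear layer output: y = x @ W.T
x = torch.randn(2, 8)        # batch of 2, hidden size 8
weight = torch.randn(16, 8)  # 16 output features, 8 input features
y_full = x @ weight.T

# Column-parallel split: each simulated rank owns a slice of the output features.
world_size = 4
shards = torch.chunk(weight, world_size, dim=0)

# Each rank computes its partial output; an all-gather would concatenate them across cards.
partial_outputs = [x @ shard.T for shard in shards]
y_tp = torch.cat(partial_outputs, dim=-1)

assert torch.allclose(y_full, y_tp, atol=1e-6)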
2 changes: 2 additions & 0 deletions optimum/habana/transformers/models/llama/modeling_llama.py
@@ -660,6 +660,7 @@ def pre_attn_forward(
flash_attention_recompute: Optional[bool] = False,
flash_attention_causal_mask: Optional[bool] = False,
flash_attention_fast_softmax: Optional[bool] = False,
valid_sequence_lengths: torch.Tensor = None,
cache_idx: int = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
@@ -679,6 +680,7 @@ def pre_attn_forward(
flash_attention_recompute,
flash_attention_causal_mask,
flash_attention_fast_softmax,
valid_sequence_lengths,
cache_idx,
**kwargs,
)
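The new valid_sequence_lengths argument threaded through pre_attn_forward carries per-sequence token counts. As a hedged illustration (an assumption for clarity, not the code path inside modeling_llama.py), such a tensor could be derived from a padding mask like this:

import torch

# attention_mask: 1 for real tokens, 0 for padding (batch of 3, max length 6)
attention_mask = torch.tensor(
    [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 0, 0, 0, 0],
    ]
)

# One length per sequence, counting only the real (non-padded) tokens.
valid_sequence_lengths = attention_mask.sum(dim=-1).to(torch.int32)
print(valid_sequence_lengths)  # tensor([4, 6, 2], dtype=torch.int32)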
17 changes: 17 additions & 0 deletions tests/test_text_generation_example.py
@@ -153,12 +153,15 @@ def _test_text_generation(
if "falcon" in model_name.lower() or "starcoder2" in model_name.lower():
command += ["--use_flash_attention", "--flash_attention_causal_mask"]

if "starcoder" in model_name.lower() and "starcoder2" not in model_name.lower():
command += ["--use_flash_attention"]

if "starcoder2" in model_name.lower():
command += ["--flash_attention_recompute"]

if (reuse_cache or torch_compile) and not parallel_strategy == "tp":
command += ["--reuse_cache"]

@@ -208,6 +211,17 @@ def _test_text_generation(
f"--max_input_tokens {max_input_tokens}",
"--limit_hpu_graphs",
]

if gptq:
command += ["--gptq"]

if parallel_strategy is not None:
command += [
f"--parallel_strategy={parallel_strategy}",
@@ -325,6 +339,7 @@ def test_text_generation_distributed_tp(model_name: str, baseline: float, token:
torch_compile=True,
parallel_strategy="tp",
)


class TextGenPipeline(TestCase):
@@ -370,3 +385,5 @@ def test_text_generation_pipeline_falcon(self):

# Ensure the run finished without any issue
self.assertEqual(return_code, 0)
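For context, a minimal sketch of the kind of command list the test helper builds when a parallel strategy is requested. The flag added in this diff is shown as-is; the script path, model name, and other flags are placeholders for illustration, not values taken from the test suite:

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder model
parallel_strategy = "tp"

command = ["python", "run_generation.py", "--model_name_or_path", model_name]

if parallel_strategy is not None:
    # Mirrors the branch added in _test_text_generation above.
    command += [f"--parallel_strategy={parallel_strategy}"]

print(" ".join(command))
# python run_generation.py --model_name_or_path meta-llama/Llama-2-7b-hf --parallel_strategy=tp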
