Tensor parallel distributed strategy without using deepspeed (#321)
* Revert "Tensor parallel  distributed strategy without using deepspeed (#280)"

This reverts commit c6e5f9c.

* Tensor parallel distributed strategy without using deepspeed (huggingface#1121)

Co-authored-by: Kalyan <kkumar@habana.ai>

---------

Change-Id: Ic30c85e697dbd6a51767e21e1c06c9a20120d9f6
Co-authored-by: Kalyan <kkumar@habana.ai>
2 people authored and astachowiczhabana committed Aug 6, 2024
1 parent c0606c8 commit e7375cf
Showing 4 changed files with 41 additions and 0 deletions.
18 changes: 18 additions & 0 deletions examples/text-generation/run_generation.py
@@ -287,6 +287,24 @@ def setup_parser(parser):
action="store_true",
help="Whether to trust the execution of code from datasets/models defined on the Hub. This option should only be set to `True` for repositories you trust and in which you have read the code, as it will execute code present on the Hub on your local machine.",
)
parser.add_argument(
"--run_partial_dataset",
action="store_true",
help="Run the inference with the dataset only for the specified --n_iterations (default: 5).",
)
parser.add_argument(
"--load_cp",
action="store_true",
help="Whether to load the model from a Hugging Face checkpoint.",
)
parser.add_argument(
"--parallel_strategy",
type=str,
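For quick reference, here is a minimal, self-contained sketch of how the flags introduced in this hunk would parse. The option names and help strings follow the diff above; the parser object itself is a stand-alone illustration, not the actual setup_parser from run_generation.py:

import argparse

parser = argparse.ArgumentParser(description="Illustrative subset of the run_generation.py flags")
parser.add_argument(
    "--parallel_strategy",
    type=str,
    choices=["tp", "none"],
    default="none",
    help="Run multi-card with the specified parallel strategy: 'tp' (tensor parallel) or 'none'.",
)
parser.add_argument(
    "--run_partial_dataset",
    action="store_true",
    help="Run the inference with the dataset only for the specified number of iterations.",
)
parser.add_argument(
    "--load_cp",
    action="store_true",
    help="Whether to load the model from a Hugging Face checkpoint.",
)

# Example usage: parsing only the new tensor-parallel flag.
args = parser.parse_args(["--parallel_strategy", "tp"])
assert args.parallel_strategy == "tp" and args.run_partial_dataset is False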
4 changes: 4 additions & 0 deletions optimum/habana/distributed/tensorparallel.py
@@ -14,6 +14,10 @@
#
# This file has been modified from its original version.
# The original version can be found at https://github.com/foundation-model-stack/foundation-model-stack

import torch
import torch._inductor.ir as ir
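As background for tensorparallel.py, the sketch below illustrates the column-parallel matmul idea behind a tensor-parallel linear layer: the weight matrix is split along its output dimension, each shard computes a partial result, and the per-shard outputs are concatenated (in a real multi-card run that concatenation is an all-gather collective). This is a single-process illustration of the general technique under stated assumptions, not the APIs actually used in this file:

import torch

torch.manual_seed(0)

# Reference (unsharded) linear layer output: y = x @ W.T
x = torch.randn(2, 8)        # batch of 2, hidden size 8
weight = torch.randn(16, 8)  # 16 output features, 8 input features
y_full = x @ weight.T

# Column-parallel split: each simulated rank owns a slice of the output features.
world_size = 4
shards = torch.chunk(weight, world_size, dim=0)

# Each rank computes its partial output; an all-gather would concatenate them across cards.
partial_outputs = [x @ shard.T for shard in shards]
y_tp = torch.cat(partial_outputs, dim=-1)

assert torch.allclose(y_full, y_tp, atol=1e-6)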
2 changes: 2 additions & 0 deletions optimum/habana/transformers/models/llama/modeling_llama.py
@@ -660,6 +660,7 @@ def pre_attn_forward(
flash_attention_recompute: Optional[bool] = False,
flash_attention_causal_mask: Optional[bool] = False,
flash_attention_fast_softmax: Optional[bool] = False,
valid_sequence_lengths: torch.Tensor = None,
cache_idx: int = None,
**kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
@@ -679,6 +680,7 @@ def pre_attn_forward(
flash_attention_recompute,
flash_attention_causal_mask,
flash_attention_fast_softmax,
valid_sequence_lengths,
cache_idx,
**kwargs,
)
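The new valid_sequence_lengths argument threaded through pre_attn_forward carries per-sequence token counts. As a hedged illustration (an assumption for clarity, not the code path inside modeling_llama.py), such a tensor could be derived from a padding mask like this:

import torch

# attention_mask: 1 for real tokens, 0 for padding (batch of 3, max length 6)
attention_mask = torch.tensor(
    [
        [1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 0, 0, 0, 0],
    ]
)

# One length per sequence, counting only the real (non-padded) tokens.
valid_sequence_lengths = attention_mask.sum(dim=-1).to(torch.int32)
print(valid_sequence_lengths)  # tensor([4, 6, 2], dtype=torch.int32)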
17 changes: 17 additions & 0 deletions tests/test_text_generation_example.py
@@ -153,12 +153,15 @@ def _test_text_generation(
if "falcon" in model_name.lower() or "starcoder2" in model_name.lower():
command += ["--use_flash_attention", "--flash_attention_causal_mask"]

if "starcoder" in model_name.lower() and "starcoder2" not in model_name.lower():
command += ["--use_flash_attention"]

if "starcoder2" in model_name.lower():
command += ["--flash_attention_recompute"]

if (reuse_cache or torch_compile) and not parallel_strategy == "tp":
command += ["--reuse_cache"]

@@ -208,6 +211,17 @@ def _test_text_generation(
f"--max_input_tokens {max_input_tokens}",
"--limit_hpu_graphs",
]

if gptq:
command += ["--gptq"]

if parallel_strategy is not None:
command += [
f"--parallel_strategy={parallel_strategy}",
@@ -325,6 +339,7 @@ def test_text_generation_distributed_tp(model_name: str, baseline: float, token:
torch_compile=True,
parallel_strategy="tp",
)


class TextGenPipeline(TestCase):
@@ -370,3 +385,5 @@ def test_text_generation_pipeline_falcon(self):

# Ensure the run finished without any issue
self.assertEqual(return_code, 0)
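For context, a minimal sketch of the kind of command list the test helper builds when a parallel strategy is requested. The flag added in this diff is shown as-is; the script path, model name, and other flags are placeholders for illustration, not values taken from the test suite:

model_name = "meta-llama/Llama-2-7b-hf"  # placeholder model
parallel_strategy = "tp"

command = ["python", "run_generation.py", "--model_name_or_path", model_name]

if parallel_strategy is not None:
    # Mirrors the branch added in _test_text_generation above.
    command += [f"--parallel_strategy={parallel_strategy}"]

print(" ".join(command))
# python run_generation.py --model_name_or_path meta-llama/Llama-2-7b-hf --parallel_strategy=tp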
