Merge branch 'main' into transformers_future
regisss committed Sep 24, 2024
2 parents bf89e41 + 00dd5bf commit 98b0da5
Showing 76 changed files with 3,080 additions and 800 deletions.
23 changes: 21 additions & 2 deletions examples/image-to-text/README.md
@@ -28,6 +28,8 @@ Models that have been validated:
- [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf)
- [llava-hf/llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf)
- [llava-hf/llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf)
- [llava-hf/llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf)
- [llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf)

### Inference with BF16

@@ -72,9 +74,26 @@ python3 run_pipeline.py \
--bf16
```

To run llava-hf/llava-v1.6-34b-hf inference, use the following command:

```bash
python3 run_pipeline.py \
--model_name_or_path llava-hf/llava-v1.6-34b-hf \
--use_hpu_graphs \
--bf16
```

To run llava-hf/llama3-llava-next-8b-hf inference, use the following command:

```bash
python3 run_pipeline.py \
--model_name_or_path llava-hf/llama3-llava-next-8b-hf \
--use_hpu_graphs \
--bf16
```

### Inference with FP8
FP8 inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b is enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. INC is used by default for both measurement and quantization. The Habana Quantization Toolkit (HQT), which was used previously, will be removed in future releases; to fall back to HQT, disable INC by setting the environment variable `USE_INC=0`.

More information on enabling FP8 in SynapseAI is available here:
https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html
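
For reference, here is a minimal sketch of the INC measure-then-convert flow that the updated `run_pipeline.py` wraps in its `setup_quantization`/`finalize_quantization` helpers (the config path below is a placeholder; the JSON file itself decides whether a run measures or quantizes):

```python
from neural_compressor.torch.quantization import FP8Config, convert, finalize_calibration, prepare


def apply_fp8(model, quant_config_path="quant_config.json"):  # placeholder path
    # The JSON config decides whether this run collects calibration statistics
    # or converts the model to FP8 using previously collected statistics.
    config = FP8Config.from_json_file(quant_config_path)
    if config.measure:
        model = prepare(model, config)  # instrument the model for measurement
    elif config.quantize:
        model = convert(model, config)  # swap ops/weights to FP8
    return model


def save_measurements(model):
    # Call this once after the measurement (calibration) run has finished so a
    # later quantization run can consume the statistics.
    finalize_calibration(model)
```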
80 changes: 66 additions & 14 deletions examples/image-to-text/run_pipeline.py
@@ -23,7 +23,7 @@
import PIL.Image
import requests
import torch
from transformers import AutoConfig, pipeline
from transformers import AutoConfig, LlavaNextProcessor, LlavaProcessor, pipeline

from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

@@ -36,6 +36,46 @@
logger = logging.getLogger(__name__)


def setup_quantization(model, args):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import FP8Config, convert, prepare
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

config = FP8Config.from_json_file(args.quant_config)
if config.measure:
model = prepare(model, config)
elif config.quantize:
model = convert(model, config)
else:
import habana_frameworks.torch.core as htcore
import habana_quantization_toolkit

habana_quantization_toolkit.prep_model(model)
htcore.hpu_initialize(model)

return model


def finalize_quantization(model):
if os.getenv("USE_INC", "1") != "0":
try:
from neural_compressor.torch.quantization import finalize_calibration
except ImportError:
raise ImportError(
"Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0"
)

finalize_calibration(model)
else:
import habana_quantization_toolkit

habana_quantization_toolkit.finish_measurements(model)


def main():
parser = argparse.ArgumentParser()

@@ -101,6 +141,11 @@ def main():
action="store_true",
help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.",
)
parser.add_argument(
"--use_kv_cache",
action="store_true",
help="Whether to use the key/value cache for decoding. It should speed up generation.",
)

args = parser.parse_args()

@@ -116,12 +161,21 @@ def main():
args.image_path = [
"https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true"
]
if args.prompt is None and model_type == "llava":
args.prompt = "<image>\nUSER: What's the content of the image?\nASSISTANT:"
elif args.prompt is None and model_type == "llava_next":
args.prompt = "[INST] <image>\nWhat is shown in this image? [/INST]"
if args.model_name_or_path in ["llava-hf/llava-v1.6-vicuna-13b-hf", "llava-hf/llava-v1.6-vicuna-7b-hf"]:
args.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: <image>\nWhat is shown in this image? ASSISTANT:"
if args.prompt is None:
if model_type == "llava":
processor = LlavaProcessor.from_pretrained(args.model_name_or_path)
elif model_type == "llava_next":
processor = LlavaNextProcessor.from_pretrained(args.model_name_or_path)
conversation = [
{
"role": "user",
"content": [
{"type": "text", "text": "What is shown in this image?"},
{"type": "image"},
],
}
]
args.prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

image_paths = args.image_path
image_paths_len = len(image_paths)
@@ -157,6 +211,7 @@ def main():
)
generate_kwargs = {
"lazy_mode": True,
"use_cache": args.use_kv_cache,
"hpu_graphs": args.use_hpu_graphs,
"max_new_tokens": args.max_new_tokens,
"ignore_eos": args.ignore_eos,
@@ -169,18 +224,14 @@ def main():
generator.model = wrap_in_hpu_graph(generator.model)

if args.quant_config:
import habana_quantization_toolkit

habana_quantization_toolkit.prep_model(generator.model)

htcore.hpu_initialize(generator.model)
generator.model = setup_quantization(generator.model, args)

# warm up
for i in range(args.warmup):
generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs)
torch.hpu.synchronize()
if args.quant_config:
habana_quantization_toolkit.finish_measurements(generator.model)
finalize_quantization(generator.model)

start = time.perf_counter()
for i in range(args.n_iterations):
@@ -197,8 +248,9 @@ def main():

total_new_tokens_generated = args.n_iterations * n_output_tokens
throughput = total_new_tokens_generated / duration
logger.info(f"result = {result}")
logger.info(
f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second"
f"time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second"
)

# Store results if necessary
20 changes: 16 additions & 4 deletions examples/language-modeling/run_lora_clm.py
@@ -701,8 +701,16 @@ def main():
tokenizer.pad_token_id = tokenizer.eos_token_id

def tokenize(prompt, add_eos_token=True, add_bos_token=True):
add_eos_token_o = tokenizer.add_eos_token
add_bos_token_o = tokenizer.add_bos_token
if hasattr(tokenizer, "add_eos_token"):
add_eos_token_o = tokenizer.add_eos_token
else:
add_eos_token_o = None

if hasattr(tokenizer, "add_bos_token"):
add_bos_token_o = tokenizer.add_bos_token
else:
add_bos_token_o = None

if not data_args.dataset_concatenation:
tokenizer.add_eos_token = add_eos_token
padding = "max_length"
@@ -717,8 +725,12 @@ def tokenize(prompt, add_eos_token=True, add_bos_token=True):
return_tensors=None,
)
# restore original value
tokenizer.add_eos_token = add_eos_token_o
tokenizer.add_bos_token = add_bos_token_o
if add_eos_token_o is not None:
tokenizer.add_eos_token = add_eos_token_o

if add_bos_token_o is not None:
tokenizer.add_bos_token = add_bos_token_o

for i in range(len(results["input_ids"])):
if (
results["input_ids"][i][-1] != tokenizer.eos_token_id
3 changes: 3 additions & 0 deletions examples/protein-folding/run_esmfold.py
@@ -82,6 +82,9 @@ def convert_outputs_to_pdb(outputs):
test_protein = "MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF" # len = 350

tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
# Set _supports_param_buffer_assignment to False since facebook/esmfold_v1's encoder weights are float16.
# Without this fix, the weights would be loaded in float16 on Gaudi2/Gaudi3 and cause a runtime error on Gaudi1.
EsmForProteinFolding._supports_param_buffer_assignment = False
model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=False)
model = model.to(device)

28 changes: 26 additions & 2 deletions examples/sentence-transformers-training/nli/README.md
@@ -4,6 +4,8 @@ Given two sentences (premise and hypothesis), the task of Natural Language Infer

The paper by [Conneau et al.](https://arxiv.org/abs/1705.02364) shows that NLI data can be quite useful when training sentence embedding methods. The [Sentence-BERT paper](https://arxiv.org/abs/1908.10084) used NLI as a first fine-tuning step for sentence embedding methods.

# General Models

## Single-card Training

To pre-train on the NLI task:
@@ -46,7 +48,29 @@ For multi-card training you can use the script of [gaudi_spawn.py](https://githu
HABANA_VISIBLE_MODULES="2,3" python ../../gaudi_spawn.py --use_deepspeed --world_size 2 training_nli.py bert-base-uncased
```


# Large Models (intfloat/e5-mistral-7b-instruct)

## Single-card Training with LoRA+gradient_checkpointing

Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 130GB of memory, which exceeds the capacity of a single HPU (Gaudi 2 with 98GB of memory). To address this, we can use LoRA and gradient checkpointing to reduce the memory requirements, making it feasible to train the model on a single HPU (see the sketch after the command below for what these options do).

```bash
python training_nli.py intfloat/e5-mistral-7b-instruct --peft --lora_target_modules "q_proj" "k_proj" "v_proj" --learning_rate 1e-5
```
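
Under the hood, passing `--peft` makes `training_nli.py` wrap the model in a LoRA adapter and turn on gradient checkpointing, roughly as sketched below. The hyperparameters are the ones hard-coded in the script; the `SentenceTransformer` import path is an assumption for this standalone sketch.

```python
from peft import LoraConfig, get_peft_model
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("intfloat/e5-mistral-7b-instruct")

# LoRA freezes the base weights and trains small low-rank adapters on the
# attention projections, which shrinks the trainable-parameter and
# optimizer-state memory.
peft_config = LoraConfig(
    r=16,
    lora_alpha=64,
    lora_dropout=0.05,
    bias="none",
    inference_mode=False,
    target_modules=["q_proj", "k_proj", "v_proj"],
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Gradient checkpointing recomputes activations during the backward pass
# instead of storing them all, trading compute for memory.
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})
```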

## Multi-card Training with DeepSpeed ZeRO-2/3

Pretraining the `intfloat/e5-mistral-7b-instruct` model requires approximately 130GB of memory, which exceeds the capacity of a single HPU (Gaudi 2 with 98GB of memory). To address this, we can use DeepSpeed ZeRO stage 2 or 3, which shards optimizer states, gradients and (with stage 3) model parameters across devices to reduce the per-device memory requirements.

Our tests have shown that training this model requires at least four HPUs when using DeepSpeed ZeRO-2.

```bash
python ../../gaudi_spawn.py --world_size 4 --use_deepspeed training_nli.py intfloat/e5-mistral-7b-instruct --deepspeed ds_config.json --bf16 --no-use_hpu_graphs_for_training --learning_rate 1e-7
```
The command above runs in lazy mode without HPU graphs for training (`--no-use_hpu_graphs_for_training`), uses a lower learning rate of `1e-7`, and configures DeepSpeed through the `ds_config.json` file. To further reduce memory usage, change `"stage"` to `3` (ZeRO-3) in `ds_config.json`.
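
If you prefer to generate that ZeRO-3 variant programmatically instead of editing the JSON by hand, here is a minimal sketch; the output file name `ds_config_zero3.json` is arbitrary and should be passed to `--deepspeed`:

```python
import json

# Same settings as the shipped ds_config.json, with the ZeRO stage bumped to 3
# so that model parameters are also partitioned across devices.
zero3_config = {
    "steps_per_print": 1,
    "train_batch_size": "auto",
    "train_micro_batch_size_per_gpu": "auto",
    "gradient_accumulation_steps": "auto",
    "bf16": {"enabled": True},
    "gradient_clipping": 1.0,
    "zero_optimization": {
        "stage": 3,
        "overlap_comm": False,
        "reduce_scatter": False,
        "contiguous_gradients": False,
    },
}

with open("ds_config_zero3.json", "w") as f:
    json.dump(zero3_config, f, indent=2)
```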

# Dataset

We combine [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNLI](https://huggingface.co/datasets/nyu-mll/multi_nli) into a dataset we call [AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli). These two datasets contain sentence pairs and one of three labels: entailment, neutral, contradiction:

@@ -58,7 +82,7 @@ We combine [SNLI](https://huggingface.co/datasets/stanfordnlp/snli) and [MultiNL

We format AllNLI in a few different subsets, compatible with different loss functions. See [triplet subset of AllNLI](https://huggingface.co/datasets/sentence-transformers/all-nli/viewer/triplet) as example.
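
For instance, the triplet subset can be inspected directly with the `datasets` library; a minimal sketch (other subset names are listed on the dataset page):

```python
from datasets import load_dataset

# Load the triplet-formatted AllNLI subset; each row pairs an anchor sentence
# with an entailed (positive) and a contradicting (negative) sentence.
train_dataset = load_dataset("sentence-transformers/all-nli", "triplet", split="train")
print(train_dataset[0])
```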

# SoftmaxLoss

<img src="https://raw.githubusercontent.com/UKPLab/sentence-transformers/master/docs/img/SBERT_SoftmaxLoss.png" alt="SBERT SoftmaxLoss" width="250"/>

16 changes: 16 additions & 0 deletions examples/sentence-transformers-training/nli/ds_config.json
@@ -0,0 +1,16 @@
{
"steps_per_print": 1,
"train_batch_size": "auto",
"train_micro_batch_size_per_gpu": "auto",
"gradient_accumulation_steps": "auto",
"bf16": {
"enabled": true
},
"gradient_clipping": 1.0,
"zero_optimization": {
"stage": 2,
"overlap_comm": false,
"reduce_scatter": false,
"contiguous_gradients": false
}
}
58 changes: 46 additions & 12 deletions examples/sentence-transformers-training/nli/training_nli.py
@@ -4,8 +4,8 @@
STS benchmark dataset
"""

import argparse
import logging
import sys
from datetime import datetime

from datasets import load_dataset
@@ -28,16 +28,43 @@ def main():
logging.basicConfig(format="%(asctime)s - %(message)s", datefmt="%Y-%m-%d %H:%M:%S", level=logging.INFO)

# You can specify any Hugging Face pre-trained model here, for example, bert-base-uncased, roberta-base, xlm-roberta-base
model_name = sys.argv[1] if len(sys.argv) > 1 else "bert-base-uncased"
train_batch_size = 16
parser = argparse.ArgumentParser()
parser.add_argument("model_name", help="model name or path", default="bert-base-uncased", nargs="?")
parser.add_argument("--peft", help="use LoRA", action="store_true", default=False)
parser.add_argument("--lora_target_modules", nargs="+", default=["query", "key", "value"])
parser.add_argument("--bf16", help="use bf16", action="store_true", default=False)
parser.add_argument(
"--use_hpu_graphs_for_training",
help="use hpu graphs for training",
action=argparse.BooleanOptionalAction,
default=True,
)
parser.add_argument("--learning_rate", help="learning rate", type=float, default=5e-5)
parser.add_argument("--deepspeed", help="deepspeed config file", default=None)
parser.add_argument("--train_batch_size", help="train batch size", default=16, type=int)
args = parser.parse_args()

output_dir = (
"output/training_nli_" + model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
"output/training_nli_" + args.model_name.replace("/", "-") + "-" + datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
)

# 1. Here we define our SentenceTransformer model. If not already a Sentence Transformer model, it will automatically
# create one with "mean" pooling.
model = SentenceTransformer(model_name)
model = SentenceTransformer(args.model_name)
if args.peft:
from peft import LoraConfig, get_peft_model

peft_config = LoraConfig(
r=16,
lora_alpha=64,
lora_dropout=0.05,
bias="none",
inference_mode=False,
target_modules=args.lora_target_modules,
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False})

# 2. Load the AllNLI dataset: https://huggingface.co/datasets/sentence-transformers/all-nli
# We'll start with 10k training samples, but you can increase this to get a stronger model
@@ -66,16 +93,16 @@ def main():
dev_evaluator(model)

# 5. Define the training arguments
args = SentenceTransformerGaudiTrainingArguments(
stargs = SentenceTransformerGaudiTrainingArguments(
# Required parameter:
output_dir=output_dir,
# Optional training parameters:
num_train_epochs=1,
per_device_train_batch_size=train_batch_size,
per_device_eval_batch_size=train_batch_size,
per_device_train_batch_size=args.train_batch_size,
per_device_eval_batch_size=args.train_batch_size,
warmup_ratio=0.1,
# fp16=True, # Set to False if you get an error that your GPU can't run on FP16
# bf16=False, # Set to True if you have a GPU that supports BF16
bf16=args.bf16, # Set to True if you have a GPU that supports BF16
# Optional tracking/debugging parameters:
evaluation_strategy="steps",
eval_steps=100,
@@ -87,16 +114,18 @@ def main():
use_habana=True,
gaudi_config_name="Habana/bert-base-uncased",
use_lazy_mode=True,
use_hpu_graphs=True,
use_hpu_graphs=args.use_hpu_graphs_for_training,
use_hpu_graphs_for_inference=False,
use_hpu_graphs_for_training=True,
use_hpu_graphs_for_training=args.use_hpu_graphs_for_training,
dataloader_drop_last=True,
learning_rate=args.learning_rate,
deepspeed=args.deepspeed,
)

# 6. Create the trainer & start training
trainer = SentenceTransformerGaudiTrainer(
model=model,
args=args,
args=stargs,
train_dataset=train_dataset,
eval_dataset=eval_dataset,
loss=train_loss,
@@ -119,6 +148,11 @@ def main():
final_output_dir = f"{output_dir}/final"
model.save(final_output_dir)

if args.peft:
model.eval()
model = model.merge_and_unload()
model.save_pretrained(f"{output_dir}/merged")


if __name__ == "__main__":
main()
