
Commit

Merge branch 'synapse_1.17' of https://github.com/huggingface/optimum-habana into synapse_1.17
imangohari1 committed Aug 8, 2024
2 parents 81851a4 + d99fe02 commit faa5ea9
Showing 11 changed files with 85 additions and 73 deletions.
10 changes: 6 additions & 4 deletions examples/language-modeling/peft_poly_seq2seq_with_generate.py
@@ -233,7 +233,7 @@ def main():

# boolq
boolq_dataset = (
load_dataset("super_glue", "boolq")
load_dataset("super_glue", "boolq", trust_remote_code=model_args.trust_remote_code)
.map(
lambda x: {
"input": f"{x['passage']}\nQuestion: {x['question']}\nA. Yes\nB. No\nAnswer:",
@@ -248,7 +248,7 @@ def main():

# multirc
multirc_dataset = (
load_dataset("super_glue", "multirc")
load_dataset("super_glue", "multirc", trust_remote_code=model_args.trust_remote_code)
.map(
lambda x: {
"input": (
@@ -266,7 +266,7 @@ def main():

# rte
rte_dataset = (
load_dataset("super_glue", "rte")
load_dataset("super_glue", "rte", trust_remote_code=model_args.trust_remote_code)
.map(
lambda x: {
"input": (
@@ -284,7 +284,7 @@ def main():

# wic
wic_dataset = (
load_dataset("super_glue", "wic")
load_dataset("super_glue", "wic", trust_remote_code=model_args.trust_remote_code)
.map(
lambda x: {
"input": (
@@ -390,6 +390,8 @@ def compute_metrics(eval_preds):
token=model_args.token,
)
peft_model = get_peft_model(model, peft_config)
if training_args.bf16:
peft_model = peft_model.to(torch.bfloat16)
peft_model.print_trainable_parameters()

# training and evaluation
5 changes: 4 additions & 1 deletion examples/language-modeling/run_multitask_prompt_tuning.py
@@ -18,6 +18,7 @@
https://github.com/huggingface/peft/blob/main/examples/conditional_generation/multitask_prompt_tuning.ipynb
"""

import copy
import logging
import sys
from dataclasses import dataclass, field
@@ -346,6 +347,8 @@ def compute_metrics(pred):
low_cpu_mem_usage=model_args.low_cpu_mem_usage,
token=model_args.token,
)
model_target = copy.deepcopy(model)

peft_model = get_peft_model(model, peft_config)
peft_model.print_trainable_parameters()

@@ -386,7 +389,7 @@ def compute_metrics(pred):
trainer.save_metrics("eval", metrics)

# target train
peft_model = get_peft_model(model, peft_config_target)
peft_model = get_peft_model(model_target, peft_config_target)
peft_model.print_trainable_parameters()
trainer = GaudiSeq2SeqTrainer(
model=peft_model,
2 changes: 2 additions & 0 deletions examples/text-generation/README.md
@@ -573,6 +573,8 @@ deepspeed --num_gpus 8 run_lm_eval.py \
-o eval.json
```

> If the dataset you want to use requires the execution of remote code, please set the following environment variable: `HF_DATASETS_TRUST_REMOTE_CODE=true`
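
As a hedged illustration (the dataset name below is only an example, not required by the script): the variable can be exported in the shell, or set from Python before `datasets` resolves the dataset script, which is what `run_lm_eval.py` does in the change further down; the explicit `trust_remote_code=True` argument to `load_dataset` is an equivalent per-call opt-in.

```python
# Sketch only: "super_glue"/"boolq" is an assumed example dataset.
import os

os.environ.setdefault("HF_DATASETS_TRUST_REMOTE_CODE", "true")  # set before the dataset script is resolved

from datasets import load_dataset

ds = load_dataset("super_glue", "boolq", trust_remote_code=True)  # per-call opt-in works as well
print(ds["train"][0]["question"])
```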

## Text-Generation Pipeline

7 changes: 3 additions & 4 deletions examples/text-generation/run_lm_eval.py
@@ -20,18 +20,17 @@
import argparse
import json
import logging
import os


os.environ.setdefault("HF_DATASETS_TRUST_REMOTE_CODE", "true")
import multiprocessing as mp
import os
import time

import lm_eval.evaluator
import lm_eval.tasks
import psutil
import torch
import torch.nn.functional as F

# Local imports
from run_generation import setup_parser
from utils import finalize_quantization, initialize_model

5 changes: 3 additions & 2 deletions optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py
@@ -199,8 +199,9 @@ def gaudi_gpt_bigcode_attention_forward(
if layer_past is not None:
past_key, past_value = layer_past.split((self.head_dim, self.head_dim), dim=-1)
if token_idx is not None:
key = past_key.index_add_(1, token_idx - 1, key - torch.index_select(past_key, 1, token_idx - 1))
value = past_value.index_add_(1, token_idx - 1, value - torch.index_select(past_value, 1, token_idx - 1))
# Use index_add() (the out-of-place version of index_add_()) so that intermediate tensors are not lost when HPU graphs are enabled.
key = past_key.index_add(1, token_idx - 1, key - torch.index_select(past_key, 1, token_idx - 1))
value = past_value.index_add(1, token_idx - 1, value - torch.index_select(past_value, 1, token_idx - 1))
else:
key = torch.cat((past_key, key), dim=-2)
value = torch.cat((past_value, value), dim=-2)
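
A minimal sketch (toy tensor shapes, assumed purely for illustration; not the real key/value cache) of the in-place vs. out-of-place distinction the comment in the hunk above relies on:

```python
import torch

past = torch.zeros(1, 4, 2)      # stand-in for a cached key/value buffer
update = torch.ones(1, 1, 2)     # new token's slice
token_idx = torch.tensor([2])    # position to write

in_place = past.clone()
in_place.index_add_(1, token_idx, update)            # mutates the buffer it is called on

out_of_place = past.index_add(1, token_idx, update)  # returns a new tensor, `past` is untouched
assert torch.equal(in_place, out_of_place)
assert torch.equal(past, torch.zeros(1, 4, 2))
```

Returning a fresh tensor avoids relying on an intermediate buffer that HPU graph capture may not keep alive.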
24 changes: 14 additions & 10 deletions optimum/habana/transformers/models/llama/modeling_llama.py
@@ -537,16 +537,20 @@ def pre_attn_forward(
else:
kv_seq_len = past_key_value[0].shape[-2]

if position_embeddings is None:
logger.warning_once(
"The attention layers in this model are transitioning from computing the RoPE embeddings internally "
"through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
"`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
"removed and `position_embeddings` will be mandatory."
)
cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
else:
cos, sin = position_embeddings
# TODO: the following section causes a torch.compile performance issue (graph recompilation);
# since we are not using position_embeddings, it is disabled for now
# if position_embeddings is None:
# logger.warning_once(
# "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
# "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
# "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.45 `position_ids` will be "
# "removed and `position_embeddings` will be mandatory."
# )
# cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
# else:
# cos, sin = position_embeddings

cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids)

if use_cache:
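
The TODO above disables the externally supplied `position_embeddings` branch and always recomputes `cos`/`sin` from `rotary_emb`. For context, a generic sketch of how such cos/sin tensors are applied to the query/key states; this mirrors the standard Hugging Face RoPE helper and is only an approximation of `apply_customized_rope`, which is HPU-specific:

```python
import torch

def rotate_half(x):
    # swap the two halves of the last dim: (x1, x2) -> (-x2, x1)
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(q, k, cos, sin, position_ids):
    # cos/sin: [seq_len, head_dim]; gather per position, broadcast over heads
    cos = cos[position_ids].unsqueeze(1)
    sin = sin[position_ids].unsqueeze(1)
    return q * cos + rotate_half(q) * sin, k * cos + rotate_half(k) * sin

# toy shapes: batch=1, heads=2, seq_len=4, head_dim=8
q = torch.randn(1, 2, 4, 8)
k = torch.randn(1, 2, 4, 8)
inv_freq = 1.0 / (10000 ** (torch.arange(0, 8, 2).float() / 8))
angles = torch.outer(torch.arange(4).float(), inv_freq)   # [seq_len, head_dim // 2]
emb = torch.cat((angles, angles), dim=-1)                  # [seq_len, head_dim]
q_rot, k_rot = apply_rope(q, k, emb.cos(), emb.sin(), torch.arange(4).unsqueeze(0))
```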
11 changes: 11 additions & 0 deletions optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -98,6 +98,17 @@ def compute_num_masked_span(input_length):
num_masked_span = compute_num_masked_span(input_length)

# get random indices to mask
"""
Original code:
spec_aug_mask_idx = np.random.choice(
np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
)
When (input_length - (mask_length - 1) < 0), then num_masked_span=0
and we get: spec_aug_mask_idx=array([], dtype=int64)
However torch rewrite fails, because torch.randperm expects positive number
This causes a unit test to fail:
RUN_SLOW=true GAUDI2_CI=1 python -m pytest tests/transformers/tests/models/wav2vec2/test_modeling_wav2vec2.py -v -s -k test_compute_mask_indices_short_audio
"""
spec_aug_mask_idx = torch.randperm(input_length - (mask_length - 1), device="hpu")[:num_masked_span]

# pick first sampled index that will serve as a dummy index to pad vector
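
A small sketch of the behavioral difference the docstring above describes, using the short-audio case (the concrete values are assumed for illustration; the real code derives `num_masked_span` from `compute_num_masked_span`):

```python
import numpy as np
import torch

input_length, mask_length = 2, 10        # "short audio": the mask span is longer than the input
n = input_length - (mask_length - 1)     # -7 here
num_masked_span = 0                      # what compute_num_masked_span returns in this case

# Legacy NumPy path: an empty population is allowed when zero samples are drawn.
np_idx = np.random.choice(np.arange(n), num_masked_span, replace=False)
print(repr(np_idx))                      # array([], dtype=int64)

# The torch rewrite cannot hand a negative length to randperm.
try:
    torch.randperm(n)
except RuntimeError as err:
    print("torch.randperm rejects a negative length:", err)
```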
8 changes: 5 additions & 3 deletions tests/baselines/t5_small.json
@@ -113,7 +113,8 @@
"--use_hpu_graphs_for_training",
"--max_source_length 256",
"--max_target_length 16",
"--bf16"
"--bf16",
"--trust_remote_code True"
]
}
}
@@ -135,10 +136,11 @@
"--max_target_length 2",
"--max_train_samples 1000",
"--max_eval_samples 100",
"--bf16"
"--bf16",
"--trust_remote_code True"
]
}
}
}
}
}
}
4 changes: 4 additions & 0 deletions tests/transformers/tests/generation/test_utils.py
@@ -23,6 +23,7 @@
from transformers import is_torch_available, pipeline
from transformers.testing_utils import require_torch, slow

from optimum.habana.checkpoint_utils import model_is_optimized
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi

from ..test_modeling_common import floats_tensor, ids_tensor
@@ -1345,6 +1346,9 @@ def test_contrastive_generate_dynamic_shapes(self):

config, input_ids, attention_mask, max_length = self._get_input_ids_and_config(batch_size=1)

if not model_is_optimized(config):
return

# NOTE: contrastive search only works with cache on at the moment.
if not hasattr(config, "use_cache"):
return
72 changes: 23 additions & 49 deletions tests/transformers/tests/models/roberta/test_modeling_roberta.py
@@ -480,7 +480,7 @@ def test_for_question_answering(self):
@slow
def test_model_from_pretrained(self):
model_name = "FacebookAI/roberta-base"
model = RobertaModel.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name).to(torch_device)
self.assertIsNotNone(model)

def test_create_position_ids_respects_padding_index(self):
@@ -491,12 +491,12 @@ def test_create_position_ids_respects_padding_index(self):
first available non-padding position index is RobertaEmbeddings.padding_idx + 1
"""
config = self.model_tester.prepare_config_and_inputs()[0]
model = RobertaEmbeddings(config=config)
model = RobertaEmbeddings(config=config).to(torch_device)

input_ids = torch.as_tensor([[12, 31, 13, model.padding_idx]])
expected_positions = torch.as_tensor(
[[0 + model.padding_idx + 1, 1 + model.padding_idx + 1, 2 + model.padding_idx + 1, model.padding_idx]]
)
).to(torch_device)

position_ids = create_position_ids_from_input_ids(input_ids, model.padding_idx)
self.assertEqual(position_ids.shape, expected_positions.shape)
@@ -510,7 +510,7 @@ def test_create_position_ids_from_inputs_embeds(self):
first available non-padding position index is RobertaEmbeddings.padding_idx + 1
"""
config = self.model_tester.prepare_config_and_inputs()[0]
embeddings = RobertaEmbeddings(config=config)
embeddings = RobertaEmbeddings(config=config).to(torch_device)

inputs_embeds = torch.empty(2, 4, 30)
expected_single_positions = [
@@ -519,65 +519,39 @@
2 + embeddings.padding_idx + 1,
3 + embeddings.padding_idx + 1,
]
expected_positions = torch.as_tensor([expected_single_positions, expected_single_positions])
expected_positions = torch.as_tensor(
[expected_single_positions, expected_single_positions], device=torch_device
)
position_ids = embeddings.create_position_ids_from_inputs_embeds(inputs_embeds)
self.assertEqual(position_ids.shape, expected_positions.shape)
self.assertTrue(torch.all(torch.eq(position_ids, expected_positions)))


@require_torch
class RobertaModelIntegrationTest(TestCasePlus):
@slow
def test_inference_masked_lm(self):
model = RobertaForMaskedLM.from_pretrained("roberta-base")

input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
def _helper(self, model, device, input_ids):
model = model.to(device)
input_ids = input_ids.to(device)
with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size((1, 11, 50265))
self.assertEqual(output.shape, expected_shape)
# compare the actual values for a slice.
expected_slice = torch.tensor(
[[[33.8802, -4.3103, 22.7761], [4.6539, -2.8098, 13.6253], [1.8228, -3.6898, 8.8600]]]
)
return output

# roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
# roberta.eval()
# expected_slice = roberta.model.forward(input_ids)[0][:, :3, :3].detach()
def _compare_cpu_hpu(self, model, input_ids=None, atol=0.2):
if input_ids is None:
input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
out_cpu = self._helper(model, "cpu", input_ids)
out_hpu = self._helper(model, "hpu", input_ids)
self.assertEqual(out_cpu.shape, out_hpu.shape)
self.assertTrue(torch.allclose(out_cpu, out_hpu, atol=atol))

self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
@slow
def test_inference_masked_lm(self):
self._compare_cpu_hpu(RobertaForMaskedLM.from_pretrained("roberta-base"), atol=0.15)

@slow
def test_inference_no_head(self):
model = RobertaModel.from_pretrained("roberta-base")

input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
with torch.no_grad():
output = model(input_ids)[0]
# compare the actual values for a slice.
expected_slice = torch.tensor(
[[[-0.0231, 0.0782, 0.0074], [-0.1854, 0.0540, -0.0175], [0.0548, 0.0799, 0.1687]]]
)

# roberta = torch.hub.load('pytorch/fairseq', 'roberta.base')
# roberta.eval()
# expected_slice = roberta.extract_features(input_ids)[:, :3, :3].detach()

self.assertTrue(torch.allclose(output[:, :3, :3], expected_slice, atol=1e-4))
self._compare_cpu_hpu(RobertaModel.from_pretrained("roberta-base"), atol=0.05)

@slow
def test_inference_classification_head(self):
model = RobertaForSequenceClassification.from_pretrained("roberta-large-mnli")

input_ids = torch.tensor([[0, 31414, 232, 328, 740, 1140, 12695, 69, 46078, 1588, 2]])
with torch.no_grad():
output = model(input_ids)[0]
expected_shape = torch.Size((1, 3))
self.assertEqual(output.shape, expected_shape)
expected_tensor = torch.tensor([[-0.9469, 0.3913, 0.5118]])

# roberta = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
# roberta.eval()
# expected_tensor = roberta.predict("mnli", input_ids, return_logits=True).detach()

self.assertTrue(torch.allclose(output, expected_tensor, atol=1e-4))
self._compare_cpu_hpu(RobertaForSequenceClassification.from_pretrained("roberta-large-mnli"), atol=0.02)
10 changes: 10 additions & 0 deletions tests/transformers/tests/test_modeling_common.py
@@ -28,6 +28,7 @@
from typing import Dict, List, Tuple

import numpy as np
import transformers
from pytest import mark
from transformers import (
AutoModel,
@@ -556,6 +557,10 @@ def test_attention_outputs(self):
inputs_dict["output_attentions"] = True
inputs_dict["output_hidden_states"] = False
config.return_dict = True
if isinstance(config, transformers.ViTConfig):
# in the latest transformers upgrade there are two implementations of attention:
# https://github.com/huggingface/transformers/blob/7ad784ae9da9b8ce61ba734199fb258d8d95460f/src/transformers/models/vit/modeling_vit.py#L363
config._attn_implementation = "eager"
model = model_class(config)
model.to(torch_device)
model.eval()
@@ -1196,6 +1201,11 @@ def test_retain_grad_hidden_states_attentions(self):
config.output_hidden_states = True
config.output_attentions = self.has_attentions

if isinstance(config, transformers.ViTConfig):
# in the latest transformers upgrade there are two implementations of attention:
# https://github.com/huggingface/transformers/blob/7ad784ae9da9b8ce61ba734199fb258d8d95460f/src/transformers/models/vit/modeling_vit.py#L363
config._attn_implementation = "eager"

# no need to test all models as different heads yield the same functionality
model_class = self.all_model_classes[0]
model = model_class(config)
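
A standalone sketch of what the two ViT guards above are working around: the SDPA implementation does not materialize attention probabilities itself, so these tests pin the eager path when they need `output_attentions`. The snippet uses a randomly initialized ViT with its default config, assumed purely for illustration:

```python
import torch
from transformers import ViTConfig, ViTModel

config = ViTConfig()                      # default ViT-Base style config
config._attn_implementation = "eager"     # same override the tests apply above

model = ViTModel(config).eval()
pixel_values = torch.rand(1, config.num_channels, config.image_size, config.image_size)
with torch.no_grad():
    outputs = model(pixel_values, output_attentions=True)

# eager attention returns one weight tensor per layer
assert len(outputs.attentions) == config.num_hidden_layers
```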
