diff --git a/models_v2/common/enable_ipex_for_transformers.diff b/models_v2/common/enable_ipex_for_transformers.diff index a90e497f6..101335daa 100644 --- a/models_v2/common/enable_ipex_for_transformers.diff +++ b/models_v2/common/enable_ipex_for_transformers.diff @@ -1,5 +1,5 @@ diff --git a/examples/legacy/question-answering/run_squad.py b/examples/legacy/question-answering/run_squad.py -index fc9411e95..cdc3cdbeb 100644 +index 999752485..066f0a367 100644 --- a/examples/legacy/question-answering/run_squad.py +++ b/examples/legacy/question-answering/run_squad.py @@ -22,6 +22,9 @@ import logging @@ -89,7 +89,7 @@ index fc9411e95..cdc3cdbeb 100644 + # enable fusion path work(need to run two interation). + with torch.no_grad(): + y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) -+ y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) ++ y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) + #dumpy_tensor = torch.ones((128, 384), dtype=torch.long) + #y = model(dumpy_tensor, dumpy_tensor, dumpy_tensor) + #dumpy_tensor = torch.ones((81, 384), dtype=torch.long) @@ -150,7 +150,7 @@ index fc9411e95..cdc3cdbeb 100644 + "input_ids": {0: torch.export.Dim("dim", max=1024 * 1024)}, + "attention_mask": {0: torch.export.Dim("dim", max=1024 * 1024)}, + "token_type_ids": {0: torch.export.Dim("dim", max=1024 * 1024)} -+ } ++ } + exported_model = capture_pre_autograd_graph( + model, + (), @@ -193,7 +193,7 @@ index fc9411e95..cdc3cdbeb 100644 + cpu_pool=cpu_pool, + input_split_hint = multi_stream_input_hint, + output_concat_hint = multi_stream_output_hint) -+ return model ++ return model + +def benchmark_evaluate(args, model, eval_dataloader): + steps_per_epoch = len(eval_dataloader) @@ -226,7 +226,7 @@ index fc9411e95..cdc3cdbeb 100644 + timeBuff = np.asarray(timeBuff) + p99 = np.percentile(timeBuff, 99) + print('P99 Latency {:.2f} ms'.format(p99*1000)) -+ print("Throughput: {:.3f} sentence/s".format(throughput)) ++ print("Throughput: {:.3f} sentence/s".format(throughput)) + break + import contextlib + maybe_autocast = torch.cpu.amp.autocast(enabled=args.bf16 or args.int8_bf16 or args.fp16_cpu, dtype=torch.half if args.fp16_cpu else torch.bfloat16) if args.inductor else contextlib.nullcontext() @@ -241,7 +241,7 @@ index fc9411e95..cdc3cdbeb 100644 + #print("inputs type is: {}".format(type(inputs))) + #print("inputs is: {}".format(inputs)) + -+ outputs = model(**inputs) ++ outputs = model(**inputs) + + # print("outputs type is: {}".format(type(outputs))) + # print("outputs len is: {}".format(len(outputs))) @@ -322,7 +322,7 @@ index fc9411e95..cdc3cdbeb 100644 + for t in threads: + t.join() + else: -+ benchmark_evaluate(args, model, eval_dataloader) ++ benchmark_evaluate(args, model, eval_dataloader) + exit() all_results = [] @@ -386,7 +386,7 @@ index fc9411e95..cdc3cdbeb 100644 + help='use int8 fp32 mix precision') + parser.add_argument('--int8_bf16', dest='int8_bf16', action='store_true', + help='use int8 bf16 mix precision') -+ parser.add_argument("--int8_config", type=str, default="config.json", ++ parser.add_argument("--int8_config", type=str, default="config.json", + help="quantization config file for int8 mode") + parser.add_argument('--fp8', dest='fp8', action='store_true', + help='use FP8') @@ -451,10 +451,10 @@ index fc9411e95..cdc3cdbeb 100644 # Evaluate diff --git a/examples/pytorch/image-classification/run_image_classification.py b/examples/pytorch/image-classification/run_image_classification.py -index e2ce9f1c5..c996c3423 100644 +index 27e9223b8..2135dc138 100755 --- a/examples/pytorch/image-classification/run_image_classification.py +++ b/examples/pytorch/image-classification/run_image_classification.py -@@ -221,28 +221,52 @@ def main(): +@@ -250,40 +250,64 @@ def main(): # Set seed before initializing model. set_seed(training_args.seed) @@ -465,8 +465,7 @@ index e2ce9f1c5..c996c3423 100644 - data_args.dataset_name, - data_args.dataset_config_name, - cache_dir=model_args.cache_dir, -- task="image-classification", -- use_auth_token=True if model_args.use_auth_token else None, +- token=model_args.token, - ) + if data_args.dataset_name is not None and data_args.dataset_name == "dummy": + from datasets import ClassLabel @@ -475,22 +474,22 @@ index e2ce9f1c5..c996c3423 100644 + self.num_samples = num_samples + self.image_size = image_size + self.num_classes = num_classes -+ self.data = [torch.randn(*self.image_size) for i in range(num_samples)] -+ self.labels = [torch.randint(0, self.num_classes, (1,)).item() for i in range(num_samples)] ++ self.data = [torch.randn(*self.image_size) for i in range(self.num_samples)] ++ self.labels = [torch.randint(0, self.num_classes, (1,)).item() for i in range(self.num_samples)] + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): -+ return {"pixel_values": self.data[idx], "labels": self.labels[idx]} ++ return {"pixel_values": self.data[idx], "label": self.labels[idx]} + + @property + def features(self): -+ return {"labels": ClassLabel(names=[str(i) for i in range(self.num_classes)])} ++ return {"label": ClassLabel(names=[str(i) for i in range(self.num_classes)])} + + def set_transform(self, transform=None): + pass -+ dataset = {'train': DummyImageDataset(), 'validation': DummyImageDataset()} ++ dataset = {'train': DummyImageDataset(),'validation': DummyImageDataset()} else: - data_files = {} - if data_args.train_dir is not None: @@ -501,7 +500,6 @@ index e2ce9f1c5..c996c3423 100644 - "imagefolder", - data_files=data_files, - cache_dir=model_args.cache_dir, -- task="image-classification", - ) + # Initialize our dataset and prepare it for the 'image-classification' task. + if data_args.dataset_name is not None: @@ -509,8 +507,7 @@ index e2ce9f1c5..c996c3423 100644 + data_args.dataset_name, + data_args.dataset_config_name, + cache_dir=model_args.cache_dir, -+ task="image-classification", -+ use_auth_token=True if model_args.use_auth_token else None, ++ token=model_args.token, + revision="014711311cec8b5959350c373878a3311caeb764", + ) + else: @@ -523,49 +520,75 @@ index e2ce9f1c5..c996c3423 100644 + "imagefolder", + data_files=data_files, + cache_dir=model_args.cache_dir, -+ task="image-classification", + ) - # If we don't have a validation split, split off a percentage of train as validation. - data_args.train_val_split = None if "validation" in dataset.keys() else data_args.train_val_split -@@ -275,6 +299,7 @@ def main(): +- dataset_column_names = dataset["train"].column_names if "train" in dataset else dataset["validation"].column_names +- if data_args.image_column_name not in dataset_column_names: +- raise ValueError( +- f"--image_column_name {data_args.image_column_name} not found in dataset '{data_args.dataset_name}'. " +- "Make sure to set `--image_column_name` to the correct audio column - one of " +- f"{', '.join(dataset_column_names)}." +- ) +- if data_args.label_column_name not in dataset_column_names: +- raise ValueError( +- f"--label_column_name {data_args.label_column_name} not found in dataset '{data_args.dataset_name}'. " +- "Make sure to set `--label_column_name` to the correct text column - one of " +- f"{', '.join(dataset_column_names)}." +- ) ++ dataset_column_names = dataset["train"].column_names if "train" in dataset else dataset["validation"].column_names ++ if data_args.image_column_name not in dataset_column_names: ++ raise ValueError( ++ f"--image_column_name {data_args.image_column_name} not found in dataset '{data_args.dataset_name}'. " ++ "Make sure to set `--image_column_name` to the correct audio column - one of " ++ f"{', '.join(dataset_column_names)}." ++ ) ++ if data_args.label_column_name not in dataset_column_names: ++ raise ValueError( ++ f"--label_column_name {data_args.label_column_name} not found in dataset '{data_args.dataset_name}'. " ++ "Make sure to set `--label_column_name` to the correct text column - one of " ++ f"{', '.join(dataset_column_names)}." ++ ) + + def collate_fn(examples): + pixel_values = torch.stack([example["pixel_values"] for example in examples]) +@@ -321,6 +345,7 @@ def main(): id2label=id2label, finetuning_task="image-classification", cache_dir=model_args.cache_dir, + return_dict = False, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, diff --git a/examples/pytorch/question-answering/run_qa.py b/examples/pytorch/question-answering/run_qa.py -index 9dc3b2c81..0c21b2dd9 100755 +index 9edca7b13..f3fde52fa 100755 --- a/examples/pytorch/question-answering/run_qa.py +++ b/examples/pytorch/question-answering/run_qa.py -@@ -322,6 +322,7 @@ def main(): - model_args.config_name if model_args.config_name else model_args.model_name_or_path, - cache_dir=model_args.cache_dir, +@@ -350,6 +350,7 @@ def main(): revision=model_args.model_revision, + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, + return_dict=False, - use_auth_token=True if model_args.use_auth_token else None, ) tokenizer = AutoTokenizer.from_pretrained( + model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path, diff --git a/examples/pytorch/text-classification/run_glue.py b/examples/pytorch/text-classification/run_glue.py -index c14107d89..fdc7d03f6 100755 +index 054fcd776..4ad27e7a2 100755 --- a/examples/pytorch/text-classification/run_glue.py +++ b/examples/pytorch/text-classification/run_glue.py -@@ -360,6 +360,7 @@ def main(): +@@ -386,6 +386,7 @@ def main(): num_labels=num_labels, finetuning_task=data_args.task_name, cache_dir=model_args.cache_dir, + return_dict = False, revision=model_args.model_revision, - use_auth_token=True if model_args.use_auth_token else None, - ) -@@ -487,7 +488,12 @@ def main(): - if data_args.task_name is not None: - metric = evaluate.load("glue", data_args.task_name) + token=model_args.token, + trust_remote_code=model_args.trust_remote_code, +@@ -518,7 +519,12 @@ def main(): + elif is_regression: + metric = evaluate.load("mse", cache_dir=model_args.cache_dir) else: -- metric = evaluate.load("accuracy") -+ #metric = evaluate.load("accuracy") +- metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) ++ #metric = evaluate.load("accuracy", cache_dir=model_args.cache_dir) + curpath = os.path.abspath(os.path.dirname(__file__)) + curpath = curpath.replace("/transformers/examples/pytorch/text-classification", '') + accuracy_path = os.path.join( curpath, "accuracy.py") @@ -574,7 +597,7 @@ index c14107d89..fdc7d03f6 100755 # You can define your custom compute_metrics function. It takes an `EvalPrediction` object (a namedtuple with a # predictions and label_ids field) and has to return a dictionary string to float. -@@ -575,6 +581,7 @@ def main(): +@@ -601,6 +607,7 @@ def main(): trainer.log_metrics("eval", metrics) trainer.save_metrics("eval", combined if task is not None and "mnli" in task else metrics) @@ -583,21 +606,21 @@ index c14107d89..fdc7d03f6 100755 if training_args.do_predict: logger.info("*** Predict ***") diff --git a/src/transformers/activations.py b/src/transformers/activations.py -index 587dc2e59..b4e331e28 100644 +index 22f5fe9b1..12d20f226 100644 --- a/src/transformers/activations.py +++ b/src/transformers/activations.py -@@ -53,8 +53,7 @@ class NewGELUActivation(nn.Module): +@@ -54,8 +54,7 @@ class NewGELUActivation(nn.Module): """ def forward(self, input: Tensor) -> Tensor: - return 0.5 * input * (1.0 + torch.tanh(math.sqrt(2.0 / math.pi) * (input + 0.044715 * torch.pow(input, 3.0)))) - -+ return nn.functional.gelu(input, approximate='tanh') ++ return nn.functional.gelu(input, approximate='tanh') class GELUActivation(nn.Module): """ diff --git a/src/transformers/generation/utils.py b/src/transformers/generation/utils.py -index ae12ae293..e6a08d367 100644 +index 08fde5850..0f71175f6 100644 --- a/src/transformers/generation/utils.py +++ b/src/transformers/generation/utils.py @@ -16,6 +16,8 @@ @@ -609,7 +632,7 @@ index ae12ae293..e6a08d367 100644 import warnings from dataclasses import dataclass from typing import TYPE_CHECKING, Any, Callable, Dict, List, Optional, Tuple, Union -@@ -700,6 +702,9 @@ class GenerationMixin: +@@ -629,6 +631,9 @@ class GenerationMixin: def _extract_past_from_model_output(self, outputs: ModelOutput, standardize_cache_format: bool = False): past_key_values = None @@ -619,7 +642,7 @@ index ae12ae293..e6a08d367 100644 if "past_key_values" in outputs: past_key_values = outputs.past_key_values elif "mems" in outputs: -@@ -1208,6 +1213,11 @@ class GenerationMixin: +@@ -1321,6 +1326,11 @@ class GenerationMixin: # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call self._validate_model_class() @@ -631,7 +654,7 @@ index ae12ae293..e6a08d367 100644 # priority: `generation_config` argument > `model.generation_config` (the default generation config) if generation_config is None: -@@ -2186,6 +2196,7 @@ class GenerationMixin: +@@ -2341,6 +2351,7 @@ class GenerationMixin: ["It might be possible to get a better understanding of the nature of the problem, but it's not"] ```""" # init values @@ -639,7 +662,7 @@ index ae12ae293..e6a08d367 100644 logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() if max_length is not None: -@@ -2231,6 +2242,7 @@ class GenerationMixin: +@@ -2387,6 +2398,7 @@ class GenerationMixin: this_peer_finished = False # used by synced_gpus only while True: @@ -647,7 +670,7 @@ index ae12ae293..e6a08d367 100644 if synced_gpus: # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. # The following logic allows an early break if all peers finished generating their sequence -@@ -2243,19 +2255,95 @@ class GenerationMixin: +@@ -2399,19 +2411,95 @@ class GenerationMixin: # prepare model inputs model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) @@ -756,33 +779,35 @@ index ae12ae293..e6a08d367 100644 # pre-process distribution next_tokens_scores = logits_processor(input_ids, next_token_logits) -@@ -2302,6 +2390,7 @@ class GenerationMixin: - ) +@@ -2463,6 +2551,8 @@ class GenerationMixin: + if unfinished_sequences.max() == 0: + this_peer_finished = True - # stop when each sentence is finished, or if we exceed the maximum length + latency_list.append(time.time() - tic) - if unfinished_sequences.max() == 0 or stopping_criteria(input_ids, scores): - if not synced_gpus: - break -@@ -2313,7 +2402,7 @@ class GenerationMixin: ++ + # stop if we exceed the maximum length + if stopping_criteria(input_ids, scores): + this_peer_finished = True +@@ -2475,7 +2565,7 @@ class GenerationMixin: if return_dict_in_generate: if self.config.is_encoder_decoder: -- return GreedySearchEncoderDecoderOutput( -+ output_result = GreedySearchEncoderDecoderOutput( +- return GenerateEncoderDecoderOutput( ++ output_result = GenerateEncoderDecoderOutput( sequences=input_ids, scores=scores, - encoder_attentions=encoder_attentions, -@@ -2323,14 +2412,19 @@ class GenerationMixin: - decoder_hidden_states=decoder_hidden_states, + logits=raw_logits, +@@ -2487,7 +2577,7 @@ class GenerationMixin: + past_key_values=model_kwargs.get("past_key_values"), ) else: -- return GreedySearchDecoderOnlyOutput( -+ output_result = GreedySearchDecoderOnlyOutput( +- return GenerateDecoderOnlyOutput( ++ output_result = GenerateDecoderOnlyOutput( sequences=input_ids, scores=scores, - attentions=decoder_attentions, - hidden_states=decoder_hidden_states, + logits=raw_logits, +@@ -2496,7 +2586,12 @@ class GenerationMixin: + past_key_values=model_kwargs.get("past_key_values"), ) else: - return input_ids @@ -795,171 +820,179 @@ index ae12ae293..e6a08d367 100644 def sample( self, -@@ -2733,6 +2827,7 @@ class GenerationMixin: +@@ -2950,6 +3045,7 @@ class GenerationMixin: ['Wie alt bist du?'] ```""" # init values + latency_list = [] logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList() stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList() - if max_length is not None: -@@ -2795,6 +2890,7 @@ class GenerationMixin: + sequential = sequential if sequential is not None else self.generation_config.low_memory +@@ -3017,6 +3113,7 @@ class GenerationMixin: - this_peer_finished = False # used by synced_gpus only + decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder while True: + tic = time.time() if synced_gpus: # Under synced_gpus the `forward` call must continue until all gpus complete their sequence. # The following logic allows an early break if all peers finished generating their sequence -@@ -2806,19 +2902,134 @@ class GenerationMixin: - break +@@ -3063,20 +3160,143 @@ class GenerationMixin: + ] - model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs) -- -- outputs = self( -- **model_inputs, -- return_dict=True, -- output_attentions=output_attentions, -- output_hidden_states=output_hidden_states, -- ) + outputs = stack_model_outputs(outputs_per_sub_batch) ++ if synced_gpus and this_peer_finished: ++ cur_len = cur_len + 1 ++ continue # don't waste resources running the code we don't need + +- else: # Unchanged original behavior +- outputs = self( +- **model_inputs, +- return_dict=True, +- output_attentions=output_attentions, +- output_hidden_states=output_hidden_states, +- ) - - if synced_gpus and this_peer_finished: - cur_len = cur_len + 1 - continue # don't waste resources running the code we don't need -- ++ next_token_logits = outputs.logits[:, -1, :] + - next_token_logits = outputs.logits[:, -1, :] -+ if re.search("GPTJ", self.config.architectures[0]) or re.search("llama", self.config.architectures[0], re.IGNORECASE) or re.search("chatglm", self.config.architectures[0], re.IGNORECASE): -+ if self.jit == False: -+ outputs = self( -+ **model_inputs, -+ return_dict=True, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ ) -+ if synced_gpus and this_peer_finished: -+ cur_len = cur_len + 1 -+ continue # don't waste resources running the code we don't need -+ next_token_logits = outputs.logits[:, -1, :] -+ else: -+ first_token = False -+ input_bs = input_ids.size()[0] -+ if model_inputs["past_key_values"] is None: -+ first_token = True -+ if first_token: -+ seq_len = input_ids.size()[1] -+ if re.search("GPTJ", self.config.architectures[0]): -+ # beam_idx_tmp=torch.zeros(int(batch_size * num_beams), dtype=torch.int) -+ # model_inputs["past_key_values"] = tuple([(torch.zeros([1,int(self.config.n_head/self.tp_number),1,int(self.config.n_embd/self.config.n_head)]), torch.zeros([1,int(self.config.n_head/self.tp_number),1,int(self.config.n_embd/self.config.n_head)]), beam_idx_tmp) for i in range(self.config.n_layer)]) -+ model_inputs["past_key_values"] = tuple([(torch.zeros([1,int(self.config.n_head/self.tp_number),1,int(self.config.n_embd/self.config.n_head)]), torch.zeros([1,int(self.config.n_head/self.tp_number),1,int(self.config.n_embd/self.config.n_head)])) for i in range(self.config.n_layer)]) -+ elif re.search("llama", self.config.architectures[0], re.IGNORECASE): -+ model_inputs["past_key_values"] = tuple([(torch.zeros([1,int(self.config.num_attention_heads/self.tp_number),1,int(self.config.hidden_size/self.config.num_attention_heads)]), torch.zeros([1,int(self.config.num_attention_heads/self.tp_number),1,int(self.config.hidden_size/self.config.num_attention_heads)])) for i in range(self.config.num_hidden_layers)]) -+ elif re.search("chatglm", self.config.architectures[0], re.IGNORECASE): -+ model_inputs["past_key_values"] = tuple([(torch.zeros([1,int(self.config.num_attention_heads/self.tp_number),1,int(self.config.hidden_size/self.config.num_attention_heads)]), torch.zeros([1,int(self.config.num_attention_heads/self.tp_number),1,int(self.config.hidden_size/self.config.num_attention_heads)])) for i in range(self.config.num_layers)]) -+ -+ model_inputs["attention_mask"] = model_inputs["attention_mask"][:1,:] -+ model_inputs["input_ids"] = model_inputs["input_ids"][:1,:] -+ model_inputs["position_ids"] = model_inputs["position_ids"][:1,:] -+ model_inputs["attention_mask"] = torch.cat([torch.zeros(1, 1), model_inputs["attention_mask"]], dim=-1) ++ else: # Unchanged original behavior ++ if re.search("GPTJ", self.config.architectures[0]) or re.search("llama", self.config.architectures[0], re.IGNORECASE) or re.search("chatglm", self.config.architectures[0], re.IGNORECASE): ++ if self.jit == False: ++ outputs = self( ++ **model_inputs, ++ return_dict=True, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ ) ++ if synced_gpus and this_peer_finished: ++ cur_len = cur_len + 1 ++ continue # don't waste resources running the code we don't need ++ next_token_logits = outputs.logits[:, -1, :] + else: -+ model_inputs["attention_mask"] = torch.cat([torch.zeros(input_bs, 1), model_inputs["attention_mask"]], dim=-1) -+ model_inputs.pop("use_cache", None) -+ model_inputs.pop("token_type_ids", None) ++ first_token = False ++ input_bs = input_ids.size()[0] ++ if model_inputs["past_key_values"] is None: ++ first_token = True ++ if first_token: ++ seq_len = input_ids.size()[1] ++ if re.search("GPTJ", self.config.architectures[0]): ++ # beam_idx_tmp=torch.zeros(int(batch_size * num_beams), dtype=torch.int) ++ # model_inputs["past_key_values"] = tuple([(torch.zeros([1,int(self.config.n_head/self.tp_number),1,int(self.config.n_embd/self.config.n_head)]), torch.zeros([1,int(self.config.n_head/self.tp_number),1,int(self.config.n_embd/self.config.n_head)]), beam_idx_tmp) for i in range(self.config.n_layer)]) ++ model_inputs["past_key_values"] = tuple([(torch.zeros([1,int(self.config.n_head/self.tp_number),1,int(self.config.n_embd/self.config.n_head)]), torch.zeros([1,int(self.config.n_head/self.tp_number),1,int(self.config.n_embd/self.config.n_head)])) for i in range(self.config.n_layer)]) ++ elif re.search("llama", self.config.architectures[0], re.IGNORECASE): ++ model_inputs["past_key_values"] = tuple([(torch.zeros([1,int(self.config.num_attention_heads/self.tp_number),1,int(self.config.hidden_size/self.config.num_attention_heads)]), torch.zeros([1,int(self.config.num_attention_heads/self.tp_number),1,int(self.config.hidden_size/self.config.num_attention_heads)])) for i in range(self.config.num_hidden_layers)]) ++ elif re.search("chatglm", self.config.architectures[0], re.IGNORECASE): ++ model_inputs["past_key_values"] = tuple([(torch.zeros([1,int(self.config.num_attention_heads/self.tp_number),1,int(self.config.hidden_size/self.config.num_attention_heads)]), torch.zeros([1,int(self.config.num_attention_heads/self.tp_number),1,int(self.config.hidden_size/self.config.num_attention_heads)])) for i in range(self.config.num_layers)]) + -+ if not hasattr(self, "trace_graph") and self.jit and self.ipex_int8: -+ print("load_int8_model") -+ self_jit = torch.jit.load(self.quantized_model_path) -+ self_jit = torch.jit.freeze(self_jit.eval()) -+ setattr(self, "trace_graph", self_jit) -+ if not hasattr(self,"trace_graph") and self.jit and not self.ipex_int8: ++ model_inputs["attention_mask"] = model_inputs["attention_mask"][:1,:] ++ model_inputs["input_ids"] = model_inputs["input_ids"][:1,:] ++ model_inputs["position_ids"] = model_inputs["position_ids"][:1,:] ++ model_inputs["attention_mask"] = torch.cat([torch.zeros(1, 1), model_inputs["attention_mask"]], dim=-1) ++ else: ++ model_inputs["attention_mask"] = torch.cat([torch.zeros(input_bs, 1), model_inputs["attention_mask"]], dim=-1) ++ model_inputs.pop("use_cache", None) ++ model_inputs.pop("token_type_ids", None) ++ ++ if not hasattr(self, "trace_graph") and self.jit and self.ipex_int8: ++ print("load_int8_model") ++ self_jit = torch.jit.load(self.quantized_model_path) ++ self_jit = torch.jit.freeze(self_jit.eval()) ++ setattr(self, "trace_graph", self_jit) ++ if not hasattr(self,"trace_graph") and self.jit and not self.ipex_int8: ++ if hasattr(self, "forward"): ++ sig = inspect.signature(self.forward) ++ else: ++ sig = inspect.signature(self.call) ++ example_inputs = tuple(model_inputs[key] for key in sig.parameters ++ if model_inputs.get(key, None) is not None and not isinstance(model_inputs.get(key, None), bool)) ++ self_jit = torch.jit.trace(self, example_inputs, strict=False) ++ self_jit = torch.jit.freeze(self_jit.eval()) ++ setattr(self, "trace_graph", self_jit) ++ outputs = self.trace_graph(**model_inputs) ++ if synced_gpus and this_peer_finished: ++ cur_len = cur_len + 1 ++ continue # don't waste resources running the code we don't need ++ if first_token: ++ outputs = list(outputs) ++ outputs[0] = outputs[0].expand(input_bs, -1, -1) ++ past_key_values = [] ++ for key, value in outputs[1]: ++ key_dim = key.dim() ++ value_dim = value.dim() ++ key = key.expand(input_bs, -1, -1, -1).contiguous() ++ value = value.expand(input_bs, -1, -1, -1).contiguous() ++ if key_dim == 3: ++ key = key.view(key.size(1) * key.size(0), key.size(2), key.size(3)) ++ if value_dim == 3: ++ value = value.view(value.size(1) * value.size(0), value.size(2), value.size(3)) ++ past_key_values.append(tuple([key, value])) ++ outputs[1] = tuple(past_key_values) ++ outputs = tuple(outputs) ++ if synced_gpus and this_peer_finished: ++ cur_len = cur_len + 1 ++ continue # don't waste resources running the code we don't need ++ next_token_logits = outputs[0][:, -1, :] ++ else: ++ if model_inputs["past_key_values"] is None or self.jit == False: ++ if re.search("T5", self.config.architectures[0]): ++ first_token = False ++ else: ++ first_token = model_inputs["input_ids"].size()[1] != 1 ++ if first_token: ++ input_bs = input_ids.size()[0] ++ seq_len = input_ids.size()[1] ++ model_inputs["attention_mask"] = model_inputs["attention_mask"][:1,:] ++ model_inputs["input_ids"] = model_inputs["input_ids"][:1,:] ++ outputs = self( ++ **model_inputs, ++ return_dict=True, ++ output_attentions=output_attentions, ++ output_hidden_states=output_hidden_states, ++ ) ++ if first_token: ++ outputs.logits = outputs.logits.expand(input_bs, seq_len, -1) ++ past_key_values = [] ++ for key, value in outputs["past_key_values"]: ++ key_dim = key.dim() ++ value_dim = value.dim() ++ key = key.expand(input_bs, -1, -1, -1).contiguous() ++ value = value.expand(input_bs, -1, -1, -1).contiguous() ++ if key_dim == 3: ++ key = key.view(key.size(1) * key.size(0), key.size(2), key.size(3)) ++ if value_dim == 3: ++ value = value.view(value.size(1) * value.size(0), value.size(2), value.size(3)) ++ past_key_values.append(tuple([key, value])) ++ outputs.past_key_values = tuple(past_key_values) ++ if synced_gpus and this_peer_finished: ++ cur_len = cur_len + 1 ++ continue # don't waste resources running the code we don't need ++ next_token_logits = outputs.logits[:, -1, :] ++ else: + if hasattr(self, "forward"): + sig = inspect.signature(self.forward) + else: + sig = inspect.signature(self.call) + example_inputs = tuple(model_inputs[key] for key in sig.parameters + if model_inputs.get(key, None) is not None and not isinstance(model_inputs.get(key, None), bool)) -+ self_jit = torch.jit.trace(self, example_inputs, strict=False) -+ self_jit = torch.jit.freeze(self_jit.eval()) -+ setattr(self, "trace_graph", self_jit) -+ outputs = self.trace_graph(**model_inputs) -+ if synced_gpus and this_peer_finished: -+ cur_len = cur_len + 1 -+ continue # don't waste resources running the code we don't need -+ if first_token: -+ outputs = list(outputs) -+ outputs[0] = outputs[0].expand(input_bs, -1, -1) -+ past_key_values = [] -+ for key, value in outputs[1]: -+ key_dim = key.dim() -+ value_dim = value.dim() -+ key = key.expand(input_bs, -1, -1, -1).contiguous() -+ value = value.expand(input_bs, -1, -1, -1).contiguous() -+ if key_dim == 3: -+ key = key.view(key.size(1) * key.size(0), key.size(2), key.size(3)) -+ if value_dim == 3: -+ value = value.view(value.size(1) * value.size(0), value.size(2), value.size(3)) -+ past_key_values.append(tuple([key, value])) -+ outputs[1] = tuple(past_key_values) -+ outputs = tuple(outputs) -+ if synced_gpus and this_peer_finished: -+ cur_len = cur_len + 1 -+ continue # don't waste resources running the code we don't need -+ next_token_logits = outputs[0][:, -1, :] -+ else: -+ if model_inputs["past_key_values"] is None or self.jit == False: -+ if re.search("T5", self.config.architectures[0]): -+ first_token = False -+ else: -+ first_token = model_inputs["input_ids"].size()[1] != 1 -+ if first_token: -+ input_bs = input_ids.size()[0] -+ seq_len = input_ids.size()[1] -+ model_inputs["attention_mask"] = model_inputs["attention_mask"][:1,:] -+ model_inputs["input_ids"] = model_inputs["input_ids"][:1,:] -+ outputs = self( -+ **model_inputs, -+ return_dict=True, -+ output_attentions=output_attentions, -+ output_hidden_states=output_hidden_states, -+ ) -+ if first_token: -+ outputs.logits = outputs.logits.expand(input_bs, seq_len, -1) -+ past_key_values = [] -+ for key, value in outputs["past_key_values"]: -+ key_dim = key.dim() -+ value_dim = value.dim() -+ key = key.expand(input_bs, -1, -1, -1).contiguous() -+ value = value.expand(input_bs, -1, -1, -1).contiguous() -+ if key_dim == 3: -+ key = key.view(key.size(1) * key.size(0), key.size(2), key.size(3)) -+ if value_dim == 3: -+ value = value.view(value.size(1) * value.size(0), value.size(2), value.size(3)) -+ past_key_values.append(tuple([key, value])) -+ outputs.past_key_values = tuple(past_key_values) -+ if synced_gpus and this_peer_finished: -+ cur_len = cur_len + 1 -+ continue # don't waste resources running the code we don't need -+ next_token_logits = outputs.logits[:, -1, :] -+ else: -+ if hasattr(self, "forward"): -+ sig = inspect.signature(self.forward) -+ else: -+ sig = inspect.signature(self.call) -+ example_inputs = tuple(model_inputs[key] for key in sig.parameters -+ if model_inputs.get(key, None) is not None and not isinstance(model_inputs.get(key, None), bool)) -+ if not hasattr(self,"trace_graph") and self.jit and not self.ipex_int8: -+ self_jit = torch.jit.trace(self, example_inputs, strict=False) -+ self_jit = torch.jit.freeze(self_jit.eval()) -+ setattr(self, "trace_graph", self_jit) ++ if not hasattr(self,"trace_graph") and self.jit and not self.ipex_int8: ++ self_jit = torch.jit.trace(self, example_inputs, strict=False) ++ self_jit = torch.jit.freeze(self_jit.eval()) ++ setattr(self, "trace_graph", self_jit) + -+ outputs = self.trace_graph(*example_inputs) -+ if synced_gpus and this_peer_finished: -+ cur_len = cur_len + 1 -+ continue # don't waste resources running the code we don't need -+ next_token_logits = outputs[0][:, -1, :] - # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` - # cannot be generated both before and after the `nn.functional.log_softmax` operation. - next_token_logits = self.adjust_logits_during_generation(next_token_logits, cur_len=cur_len) -@@ -2887,6 +3098,7 @@ class GenerationMixin: ++ outputs = self.trace_graph(*example_inputs) ++ if synced_gpus and this_peer_finished: ++ cur_len = cur_len + 1 ++ continue # don't waste resources running the code we don't need ++ next_token_logits = outputs[0][:, -1, :] ++ # hack: adjust tokens for Marian. For Marian we have to make sure that the `pad_token_id` ++ # cannot be generated both before and after the `nn.functional.log_softmax` operation. + next_token_scores = nn.functional.log_softmax( + next_token_logits, dim=-1 + ) # (batch_size * num_beams, vocab_size) +@@ -3149,6 +3369,7 @@ class GenerationMixin: # increase cur_len cur_len = cur_len + 1 @@ -967,26 +1000,26 @@ index ae12ae293..e6a08d367 100644 if beam_scorer.is_done or stopping_criteria(input_ids, scores): if not synced_gpus: -@@ -2910,7 +3122,7 @@ class GenerationMixin: +@@ -3173,7 +3394,7 @@ class GenerationMixin: sequence_outputs["sequence_scores"] = None if self.config.is_encoder_decoder: -- return BeamSearchEncoderDecoderOutput( -+ output_result = BeamSearchEncoderDecoderOutput( +- return GenerateBeamEncoderDecoderOutput( ++ output_result = GenerateBeamEncoderDecoderOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, -@@ -2922,7 +3134,7 @@ class GenerationMixin: - decoder_hidden_states=decoder_hidden_states, +@@ -3187,7 +3408,7 @@ class GenerationMixin: + past_key_values=model_kwargs.get("past_key_values"), ) else: -- return BeamSearchDecoderOnlyOutput( -+ output_result = BeamSearchDecoderOnlyOutput( +- return GenerateBeamDecoderOnlyOutput( ++ output_result = GenerateBeamDecoderOnlyOutput( sequences=sequence_outputs["sequences"], sequences_scores=sequence_outputs["sequence_scores"], scores=scores, -@@ -2931,7 +3143,9 @@ class GenerationMixin: - hidden_states=decoder_hidden_states, +@@ -3198,7 +3419,9 @@ class GenerationMixin: + past_key_values=model_kwargs.get("past_key_values"), ) else: - return sequence_outputs["sequences"] @@ -997,15 +1030,21 @@ index ae12ae293..e6a08d367 100644 def beam_sample( self, diff --git a/src/transformers/modeling_utils.py b/src/transformers/modeling_utils.py -index 0df8d7e25..d14c9eede 100644 +index b3102a37d..a3d232b1d 100644 --- a/src/transformers/modeling_utils.py +++ b/src/transformers/modeling_utils.py -@@ -298,8 +298,13 @@ def shard_checkpoint( - current_block_size = 0 +@@ -362,6 +362,7 @@ def shard_checkpoint( total_size = 0 + storage_id_to_block = {} + import io for key, weight in state_dict.items(): + # when bnb serialization is used the weights in the state dict can be strings + # check: https://github.com/huggingface/transformers/pull/24416 for more details +@@ -376,7 +377,11 @@ def shard_checkpoint( + sharded_state_dicts[block_id][key] = weight + continue + - weight_size = weight.numel() * dtype_byte_size(weight.dtype) + if isinstance(weight, io.BytesIO): + # FP8 has extra state with io.BytesIO @@ -1013,9 +1052,9 @@ index 0df8d7e25..d14c9eede 100644 + else: + weight_size = weight.numel() * dtype_byte_size(weight.dtype) - # If this weight is going to tip up over the maximal size, we split. - if current_block_size + weight_size > max_shard_size: -@@ -1831,7 +1836,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix + # If this weight is going to tip up over the maximal size, we split, but only if we have put at least one + # weight in the current shard. +@@ -2438,7 +2443,10 @@ class PreTrainedModel(nn.Module, ModuleUtilsMixin, GenerationMixin, PushToHubMix and is_main_process and reg.fullmatch(filename_no_suffix) is not None ): @@ -1028,10 +1067,10 @@ index 0df8d7e25..d14c9eede 100644 # Save the model for shard_file, shard in shards.items(): diff --git a/src/transformers/models/bert/modeling_bert.py b/src/transformers/models/bert/modeling_bert.py -index ee7dcd5e4..5ead9c989 100755 +index 4c068c4d4..ca4c33cbf 100755 --- a/src/transformers/models/bert/modeling_bert.py +++ b/src/transformers/models/bert/modeling_bert.py -@@ -346,6 +346,8 @@ class BertSelfAttention(nn.Module): +@@ -348,6 +348,8 @@ class BertSelfAttention(nn.Module): attention_scores = attention_scores / math.sqrt(self.attention_head_size) if attention_mask is not None: @@ -1040,7 +1079,7 @@ index ee7dcd5e4..5ead9c989 100755 # Apply the attention mask is (precomputed for all layers in BertModel forward() function) attention_scores = attention_scores + attention_mask -@@ -1063,6 +1065,7 @@ class BertForPreTraining(BertPreTrainedModel): +@@ -1056,6 +1058,7 @@ class BertForPreTraining(BertPreTrainedModel): # Initialize weights and apply final processing self.post_init() @@ -1048,7 +1087,7 @@ index ee7dcd5e4..5ead9c989 100755 def get_output_embeddings(self): return self.cls.predictions.decoder -@@ -1133,12 +1136,24 @@ class BertForPreTraining(BertPreTrainedModel): +@@ -1126,12 +1129,24 @@ class BertForPreTraining(BertPreTrainedModel): ) sequence_output, pooled_output = outputs[:2] @@ -1075,10 +1114,10 @@ index ee7dcd5e4..5ead9c989 100755 total_loss = masked_lm_loss + next_sentence_loss diff --git a/src/transformers/models/distilbert/modeling_distilbert.py b/src/transformers/models/distilbert/modeling_distilbert.py -index 84db89e0f..cb986d7ea 100755 +index 481e4c427..d2ac0b3aa 100755 --- a/src/transformers/models/distilbert/modeling_distilbert.py +++ b/src/transformers/models/distilbert/modeling_distilbert.py -@@ -216,12 +216,11 @@ class MultiHeadSelfAttention(nn.Module): +@@ -239,12 +239,11 @@ class MultiHeadSelfAttention(nn.Module): k = shape(self.k_lin(key)) # (bs, n_heads, k_length, dim_per_head) v = shape(self.v_lin(value)) # (bs, n_heads, k_length, dim_per_head) @@ -1094,7 +1133,7 @@ index 84db89e0f..cb986d7ea 100755 weights = nn.functional.softmax(scores, dim=-1) # (bs, n_heads, q_length, k_length) weights = self.dropout(weights) # (bs, n_heads, q_length, k_length) -@@ -743,11 +742,11 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): +@@ -982,11 +981,11 @@ class DistilBertForSequenceClassification(DistilBertPreTrainedModel): ) def forward( self, @@ -1108,7 +1147,7 @@ index 84db89e0f..cb986d7ea 100755 output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, diff --git a/src/transformers/models/vit/modeling_vit.py b/src/transformers/models/vit/modeling_vit.py -index 474b92f72..094ea6eb0 100644 +index 734ccf6a9..5914d4e2b 100644 --- a/src/transformers/models/vit/modeling_vit.py +++ b/src/transformers/models/vit/modeling_vit.py @@ -129,14 +129,14 @@ class ViTEmbeddings(nn.Module): @@ -1128,7 +1167,7 @@ index 474b92f72..094ea6eb0 100644 embeddings = self.dropout(embeddings) -@@ -779,8 +779,8 @@ class ViTForImageClassification(ViTPreTrainedModel): +@@ -776,8 +776,8 @@ class ViTForImageClassification(ViTPreTrainedModel): def forward( self, pixel_values: Optional[torch.Tensor] = None, @@ -1139,16 +1178,18 @@ index 474b92f72..094ea6eb0 100644 output_hidden_states: Optional[bool] = None, interpolate_pos_encoding: Optional[bool] = None, diff --git a/src/transformers/trainer.py b/src/transformers/trainer.py -index cf71499b0..0ff7236bd 100755 +index a2436dadc..23773d49a 100755 --- a/src/transformers/trainer.py +++ b/src/transformers/trainer.py -@@ -158,6 +158,29 @@ from .utils import ( +@@ -152,6 +152,31 @@ from .utils import ( + strtobool, ) - from .utils.generic import ContextManagers - + from .utils.quantization_config import QuantizationMethod ++from tqdm import tqdm ++ +def trace_handler(prof): + print(prof.key_averages().table( -+ sort_by="self_cpu_time_total", row_limit=10)) ++ sort_by="self_cpu_time_total", row_limit=-1)) + import datetime + now = datetime.datetime.now() + log_path = os.path.join(os.getcwd(), "vit_profiling_{}_step_{}.json".format(now.strftime("%Y%m%d%H%M%S"), str(prof.step_num))) @@ -1170,45 +1211,40 @@ index cf71499b0..0ff7236bd 100755 + with_modules=True + ) - _is_native_cpu_amp_available = is_torch_greater_or_equal_than_1_10 -@@ -336,7 +359,7 @@ class Trainer: - self.hp_name = None - self.deepspeed = None - self.is_in_train = False -- + DEFAULT_CALLBACKS = [DefaultFlowCallback] +@@ -366,6 +391,7 @@ class Trainer: + + self.create_accelerator_and_postprocess() + + self.fp16_scaler = None # memory metrics - must set up as early as possible self._memory_tracker = TrainerMemoryTracker(self.args.skip_memory_metrics) self._memory_tracker.start() -@@ -606,7 +629,7 @@ class Trainer: +@@ -592,7 +618,7 @@ class Trainer: + f"FP16 provided in SM_HP_MP_PARAMETERS is {smp.state.cfg.fp16}, " "but SageMaker Model Parallelism < 1.10 does not support FP16 in trainer." ) - -- if args.fp16 or args.bf16: -+ if args.fp16 or args.bf16 or args.fp16_cpu: - if args.half_precision_backend == "auto": - if args.device == torch.device("cpu"): - if args.fp16: -@@ -621,7 +644,7 @@ class Trainer: +- if (args.fp16 or args.bf16) and args.half_precision_backend == "auto": ++ if (args.fp16 or args.bf16 or args.fp16_cpu) and args.half_precision_backend == "auto": + if args.device == torch.device("cpu"): + if args.fp16: + raise ValueError("Tried to use `fp16` but it is not supported on cpu") +@@ -600,11 +626,11 @@ class Trainer: + args.half_precision_backend = "cpu_amp" logger.info(f"Using {args.half_precision_backend} half precision backend") - self.do_grad_scaling = False -- if (args.fp16 or args.bf16) and not (args.deepspeed or is_sagemaker_mp_enabled()): -+ if (args.fp16 or args.bf16 or args.fp16_cpu) and not (args.deepspeed or is_sagemaker_mp_enabled()): +- if (args.fp16 or args.bf16) and not (self.is_deepspeed_enabled or is_sagemaker_mp_enabled()): ++ if (args.fp16 or args.bf16 or args.fp16_cpu) and not (self.is_deepspeed_enabled or is_sagemaker_mp_enabled()): # deepspeed and SageMaker Model Parallel manage their own half precision - if args.half_precision_backend == "cuda_amp": - self.use_cuda_amp = True -@@ -645,7 +668,7 @@ class Trainer: - self.scaler = torch.cuda.amp.GradScaler() - elif args.half_precision_backend == "cpu_amp": + if args.half_precision_backend == "cpu_amp": self.use_cpu_amp = True - self.amp_dtype = torch.bfloat16 + self.amp_dtype = torch.bfloat16 if not args.fp16_cpu else torch.half - else: + elif args.half_precision_backend == "apex": if not is_apex_available(): raise ImportError( -@@ -695,7 +718,7 @@ class Trainer: +@@ -642,7 +668,7 @@ class Trainer: self._memory_tracker.stop_and_update_metrics() # torch.compile @@ -1216,8 +1252,8 @@ index cf71499b0..0ff7236bd 100755 + if args.inductor and not is_torch_compile_available(): raise RuntimeError("Using torch.compile requires PyTorch 2.0 or higher.") - def add_callback(self, callback): -@@ -1323,15 +1346,45 @@ class Trainer: + self.is_fsdp_xla_v2_enabled = args.fsdp_config["xla_fsdp_v2"] +@@ -1306,13 +1332,39 @@ class Trainer: return model def torch_jit_model_eval(self, model, dataloader, training=False): @@ -1230,7 +1266,7 @@ index cf71499b0..0ff7236bd 100755 example_batch = self._prepare_inputs(example_batch) + int8_inputs=[] + if (self.args.int8 or self.args.do_calibration) and self.args.use_ipex: -+ import intel_extension_for_pytorch as ipex ++ import intel_extension_for_pytorch as ipex + from intel_extension_for_pytorch.quantization import prepare, convert + from torch.ao.quantization import MinMaxObserver, PerChannelMinMaxObserver, QConfig + qconfig = QConfig(activation=MinMaxObserver.with_args(qscheme=torch.per_tensor_affine, dtype=torch.quint8), weight=PerChannelMinMaxObserver.with_args(dtype=torch.qint8, qscheme=torch.per_channel_symmetric)) @@ -1254,16 +1290,21 @@ index cf71499b0..0ff7236bd 100755 try: + if self.args.int8: + model = prepared_model - jit_model = model.eval() - with ContextManagers([self.autocast_smart_context_manager(cache_enabled=False), torch.no_grad()]): + jit_model = copy.copy(model) + jit_model.eval() + original_forward = jit_model.__dict__.pop("_original_forward", None) +@@ -1320,6 +1372,10 @@ class Trainer: + if original_forward: + jit_model.forward = original_forward + with self.accelerator.autocast(cache_enabled=False), torch.no_grad(): + if self.args.int8 and self.args.use_ipex: + jit_model = convert(jit_model) + if self.args.smooth_quant: + jit_model(*int8_inputs) - if version.parse(version.parse(torch.__version__).base_version) >= version.parse("1.14.0"): + if version.parse(version.parse(torch.__version__).base_version) >= version.parse("2.0.0"): if isinstance(example_batch, dict): jit_model = torch.jit.trace(jit_model, example_kwarg_inputs=example_batch, strict=False) -@@ -1361,6 +1414,7 @@ class Trainer: +@@ -1348,6 +1404,7 @@ class Trainer: return model def ipex_optimize_model(self, model, training=False, dtype=torch.float32): @@ -1271,7 +1312,7 @@ index cf71499b0..0ff7236bd 100755 if not is_ipex_available(): raise ImportError( "Using IPEX but IPEX is not installed or IPEX's version does not match current PyTorch, please refer" -@@ -1372,22 +1426,40 @@ class Trainer: +@@ -1359,22 +1416,40 @@ class Trainer: if not training: model.eval() dtype = torch.bfloat16 if not self.is_in_train and self.args.bf16_full_eval else dtype @@ -1319,7 +1360,7 @@ index cf71499b0..0ff7236bd 100755 if is_sagemaker_mp_enabled(): # Wrapping the base model twice in a DistributedModel will raise an error. if isinstance(self.model_wrapped, smp.model.DistributedModel): -@@ -1415,6 +1487,74 @@ class Trainer: +@@ -1398,6 +1473,74 @@ class Trainer: model = self.torch_jit_model_eval(model, dataloader, training) self.jit_compilation_time = round(time.time() - start_time, 4) @@ -1337,7 +1378,7 @@ index cf71499b0..0ff7236bd 100755 + from torch.ao.quantization.quantize_pt2e import prepare_pt2e, convert_pt2e + import torch.ao.quantization.quantizer.x86_inductor_quantizer as xiq + from torch.ao.quantization.quantizer.x86_inductor_quantizer import X86InductorQuantizer -+ from torch._export import capture_pre_autograd_graph ++ from torch._export import capture_pre_autograd_graph, dynamic_dim + print('[Info] Running torch.compile() INT8 quantization') + with torch.no_grad(): + exported_model = capture_pre_autograd_graph( @@ -1394,49 +1435,45 @@ index cf71499b0..0ff7236bd 100755 # Note: in torch.distributed mode, there's no point in wrapping the model # inside a DistributedDataParallel as we'll be under `no_grad` anyways. if not training: -@@ -1575,8 +1715,9 @@ class Trainer: +@@ -1517,6 +1660,9 @@ class Trainer: + kwargs["broadcast_buffers"] = self.args.ddp_broadcast_buffers - # torch.compile() needs to be called after wrapping the model with FSDP or DDP - # to ensure that it accounts for the graph breaks required by those wrappers -- if self.args.torch_compile: -- model = torch.compile(model, backend=self.args.torch_compile_backend, mode=self.args.torch_compile_mode) + self.accelerator.ddp_handler = DistributedDataParallelKwargs(**kwargs) + if self.args.inductor: + with torch.cpu.amp.autocast(enabled=self.args.bf16 or self.args.fp16_cpu, dtype=torch.half if self.args.fp16_cpu else torch.bfloat16): + model = torch.compile(model, backend=self.args.torch_compile_backend, mode=self.args.torch_compile_mode) return model -@@ -1993,7 +2134,11 @@ class Trainer: - scale_after = self.scaler.get_scale() - optimizer_was_run = scale_before <= scale_after - else: -- self.optimizer.step() -+ if self.args.fp16_cpu: -+ self.fp16_scaler.step(self.optimizer) -+ self.fp16_scaler.update() -+ else: -+ self.optimizer.step() +@@ -2014,7 +2160,11 @@ class Trainer: + grad_norm = _grad_norm.item() if _grad_norm is not None else None - if optimizer_was_run and not self.deepspeed: - self.lr_scheduler.step() -@@ -2714,7 +2859,10 @@ class Trainer: - # loss gets scaled under gradient_accumulation_steps in deepspeed - loss = self.deepspeed.backward(loss) + # Optimizer step +- self.optimizer.step() ++ if self.args.fp16_cpu: ++ self.fp16_scaler.step(self.optimizer) ++ self.fp16_scaler.update() ++ else: ++ self.optimizer.step() + optimizer_was_run = not self.accelerator.optimizer_step_was_skipped + if optimizer_was_run: + # Delay optimizer scheduling until metrics are generated +@@ -2908,7 +3058,10 @@ class Trainer: + with amp.scale_loss(loss, self.optimizer) as scaled_loss: + scaled_loss.backward() else: -- loss.backward() +- self.accelerator.backward(loss) + if self.args.fp16_cpu: + self.fp16_scaler.scale(loss).backward() + else: -+ loss.backward() ++ self.accelerator.backward(loss) - return loss.detach() + return loss.detach() / self.args.gradient_accumulation_steps -@@ -3085,7 +3233,65 @@ class Trainer: - self._memory_tracker.stop_and_update_metrics(output.metrics) +@@ -3322,6 +3475,66 @@ class Trainer: return PredictionOutput(predictions=output.predictions, label_ids=output.label_ids, metrics=output.metrics) -- -+ + + def benchmark_evaluate(self, model, dataloader): + steps_per_epoch = len(dataloader) + total_steps = (self.args.perf_run_iters + self.args.perf_begin_iter) @@ -1445,18 +1482,25 @@ index cf71499b0..0ff7236bd 100755 + i = 0; + timeBuff = [] + import time -+ # with torch.profiler.profile( -+ # activities=[ -+ # torch.profiler.ProfilerActivity.CPU], -+ # schedule=torch.profiler.schedule( -+ # wait=1, -+ # warmup=9, -+ # active=5), -+ # on_trace_ready=trace_handler -+ # ) as prof: ++ if self.args.profile: ++ batch = next(iter(dataloader)) ++ if 'pixel_values' in batch: ++ if self.args.fp16_cpu: ++ batch['pixel_values'] = batch['pixel_values'].to(torch.half) ++ elif self.args.bf16 or self.args.int8_bf16: ++ batch['pixel_values'] = batch['pixel_values'].to(torch.bfloat16) ++ ++ prof = profile_ctx.__enter__() ++ with torch.no_grad(): ++ for i in range(40): ++ if (self.args.bf16 or self.args.int8_bf16 or self.args.fp16_cpu) and self.args.inductor: ++ with torch.cpu.amp.autocast(dtype=torch.half if self.args.fp16_cpu else torch.bfloat16): ++ outputs = model(**batch) ++ else: ++ outputs = model(**batch) ++ prof.step() ++ prof.__exit__(None, None, None) + with tqdm(total=total_steps, desc="Evaluating") as pbar: -+ if self.args.profile: -+ prof = profile_ctx.__enter__() + for epoch in range(test_epoches + 1): + for it, batch in enumerate(dataloader): + if 'pixel_values' in batch: @@ -1481,24 +1525,19 @@ index cf71499b0..0ff7236bd 100755 + with torch.cpu.amp.autocast(dtype=torch.half if self.args.fp16_cpu else torch.bfloat16): + start = time.time() + outputs = model(**batch) -+ #prof.step() + end = time.time() + else: + start = time.time() + outputs = model(**batch) -+ #prof.step() + end = time.time() + if epoch * steps_per_epoch + it > self.args.perf_begin_iter: + timeBuff.append(end-start) + pbar.update(1) -+ if self.args.profile: -+ prof.step() -+ if self.args.profile: -+ profile_ctx.__exit__(None, None, None) ++ def evaluation_loop( self, dataloader: DataLoader, -@@ -3133,7 +3339,12 @@ class Trainer: +@@ -3380,7 +3593,12 @@ class Trainer: logger.info(" Num examples: Unknown") logger.info(f" Batch size = {batch_size}") @@ -1512,7 +1551,7 @@ index cf71499b0..0ff7236bd 100755 self.callback_handler.eval_dataloader = dataloader # Do this before wrapping. -@@ -3158,6 +3369,20 @@ class Trainer: +@@ -3402,6 +3620,20 @@ class Trainer: all_labels = None all_inputs = None # Will be useful when we have an iterable dataset so don't know its length. @@ -1534,10 +1573,10 @@ index cf71499b0..0ff7236bd 100755 observed_num_examples = 0 # Main evaluation loop diff --git a/src/transformers/training_args.py b/src/transformers/training_args.py -index 088eb06b7..887cfec22 100644 +index 19ab24c20..b19eea005 100644 --- a/src/transformers/training_args.py +++ b/src/transformers/training_args.py -@@ -777,10 +777,121 @@ class TrainingArguments: +@@ -930,10 +930,121 @@ class TrainingArguments: ) }, ) @@ -1659,18 +1698,17 @@ index 088eb06b7..887cfec22 100644 fp16_opt_level: str = field( default="O1", metadata={ -@@ -963,7 +1074,9 @@ class TrainingArguments: - label_smoothing_factor: float = field( +@@ -1139,6 +1250,9 @@ class TrainingArguments: default=0.0, metadata={"help": "The label smoothing epsilon to apply (zero means no label smoothing)."} ) -- + + profile: bool = field( + default=False, metadata={"help": "enable profile"} + ) - default_optim = "adamw_hf" + default_optim = "adamw_torch" # XXX: enable when pytorch==2.0.1 comes out - we want to give it time to get all the bugs sorted out # if is_torch_available() and version.parse(version.parse(torch.__version__).base_version) >= version.parse("2.1.0"): -@@ -1253,17 +1366,17 @@ class TrainingArguments: +@@ -1522,19 +1636,19 @@ class TrainingArguments: if version.parse(version.parse(torch.__version__).base_version) == version.parse("2.0.0") and self.fp16: raise ValueError("--optim adamw_torch_fused with --fp16 requires PyTorch>2.0") @@ -1678,24 +1716,28 @@ index 088eb06b7..887cfec22 100644 - self.framework == "pt" - and is_torch_available() - and (self.device.type != "cuda") +- and (self.device.type != "npu") +- and (self.device.type != "xpu") - and (get_xla_device_type(self.device) != "GPU") - and (self.fp16 or self.fp16_full_eval) - ): - raise ValueError( - "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation" -- " (`--fp16_full_eval`) can only be used on CUDA devices." +- " (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX)." - ) -+ #if ( -+ # self.framework == "pt" -+ # and is_torch_available() -+ # and (self.device.type != "cuda") -+ # and (get_xla_device_type(self.device) != "GPU") -+ # and (self.fp16 or self.fp16_full_eval) -+ #): -+ # raise ValueError( -+ # "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation" -+ # " (`--fp16_full_eval`) can only be used on CUDA devices." -+ # ) ++ # if ( ++ # self.framework == "pt" ++ # and is_torch_available() ++ # and (self.device.type != "cuda") ++ # and (self.device.type != "npu") ++ # and (self.device.type != "xpu") ++ # and (get_xla_device_type(self.device) != "GPU") ++ # and (self.fp16 or self.fp16_full_eval) ++ # ): ++ # raise ValueError( ++ # "FP16 Mixed precision training with AMP or APEX (`--fp16`) and FP16 half precision evaluation" ++ # " (`--fp16_full_eval`) can only be used on CUDA or NPU devices or certain XPU devices (with IPEX)." ++ # ) if ( self.framework == "pt" diff --git a/models_v2/pytorch/bert_large/inference/cpu/setup.sh b/models_v2/pytorch/bert_large/inference/cpu/setup.sh index a133943c8..6a1790f89 100755 --- a/models_v2/pytorch/bert_large/inference/cpu/setup.sh +++ b/models_v2/pytorch/bert_large/inference/cpu/setup.sh @@ -31,7 +31,7 @@ fi rm -rf transformers git clone https://github.com/huggingface/transformers.git cd transformers -git checkout v4.28.1 +git checkout v4.38.1 git lfs pull git apply ../../../../common/enable_ipex_for_transformers.diff pip install -e ./ diff --git a/models_v2/pytorch/bert_large/training/cpu/setup.sh b/models_v2/pytorch/bert_large/training/cpu/setup.sh index 19a45f68e..d4eb37613 100755 --- a/models_v2/pytorch/bert_large/training/cpu/setup.sh +++ b/models_v2/pytorch/bert_large/training/cpu/setup.sh @@ -35,7 +35,7 @@ fi rm -rf transformers git clone https://github.com/huggingface/transformers.git cd transformers -git checkout v4.28.1 +git checkout v4.38.1 git lfs pull git apply ../../../../common/enable_ipex_for_transformers.diff pip install -e ./ diff --git a/models_v2/pytorch/chatglm/inference/cpu/setup.sh b/models_v2/pytorch/chatglm/inference/cpu/setup.sh index 3ca1bb4b0..61fea5f24 100755 --- a/models_v2/pytorch/chatglm/inference/cpu/setup.sh +++ b/models_v2/pytorch/chatglm/inference/cpu/setup.sh @@ -22,7 +22,7 @@ pip install dcpm-kernels git clone https://github.com/huggingface/transformers.git cd transformers -git checkout v4.28.1 +git checkout v4.38.1 git apply ../../../../enable_ipex_for_transformers.diff pip install -e ./ cd .. diff --git a/models_v2/pytorch/distilbert/inference/cpu/setup.sh b/models_v2/pytorch/distilbert/inference/cpu/setup.sh index 05faabfc1..8f0ed4529 100755 --- a/models_v2/pytorch/distilbert/inference/cpu/setup.sh +++ b/models_v2/pytorch/distilbert/inference/cpu/setup.sh @@ -32,7 +32,7 @@ fi rm -rf transformers git clone https://github.com/huggingface/transformers.git cd transformers -git checkout v4.28.1 +git checkout v4.38.1 git lfs pull pip install -r examples/pytorch/text-classification/requirements.txt git apply ../../../../common/enable_ipex_for_transformers.diff diff --git a/models_v2/pytorch/gptj/inference/cpu/setup.sh b/models_v2/pytorch/gptj/inference/cpu/setup.sh index 2903bf8d3..a0fc3e72e 100755 --- a/models_v2/pytorch/gptj/inference/cpu/setup.sh +++ b/models_v2/pytorch/gptj/inference/cpu/setup.sh @@ -20,7 +20,7 @@ cd ${MODEL_DIR} git clone https://github.com/huggingface/transformers.git cd transformers -git checkout v4.28.1 +git checkout v4.38.1 git apply ${MODEL_DIR}/../../../../common/enable_ipex_for_transformers.diff pip install -e ./ cd .. diff --git a/models_v2/pytorch/llama/inference/cpu/setup.sh b/models_v2/pytorch/llama/inference/cpu/setup.sh index bcfac79a3..d29ba6f78 100755 --- a/models_v2/pytorch/llama/inference/cpu/setup.sh +++ b/models_v2/pytorch/llama/inference/cpu/setup.sh @@ -22,7 +22,7 @@ pip install datasets sentencepiece psutil cd ${MODEL_DIR} git clone https://github.com/huggingface/transformers.git cd transformers -git checkout v4.28.1 +git checkout v4.38.1 git apply ${MODEL_DIR}/../../../../common/enable_ipex_for_transformers.diff pip install -e ./ cd .. diff --git a/models_v2/pytorch/llama/training/cpu/setup.sh b/models_v2/pytorch/llama/training/cpu/setup.sh index e6b9cd372..27ecea249 100755 --- a/models_v2/pytorch/llama/training/cpu/setup.sh +++ b/models_v2/pytorch/llama/training/cpu/setup.sh @@ -23,7 +23,7 @@ pip install protobuf==3.20.3 numpy==1.20 cd ${MODEL_DIR} git clone https://github.com/huggingface/transformers.git cd transformers -git checkout v4.28.1 +git checkout v4.38.1 git apply ${MODEL_DIR}/../../../../common/enable_ipex_for_transformers.diff pip install -e ./ cd .. diff --git a/models_v2/pytorch/vit/inference/cpu/setup.sh b/models_v2/pytorch/vit/inference/cpu/setup.sh index bf8208d81..92e920ea2 100755 --- a/models_v2/pytorch/vit/inference/cpu/setup.sh +++ b/models_v2/pytorch/vit/inference/cpu/setup.sh @@ -19,7 +19,7 @@ # Clone the Transformers repo in the VIT Base inference directory git clone https://github.com/huggingface/transformers.git cd transformers -git checkout v4.28.1 +git checkout v4.38.1 pip install -r examples/pytorch/image-classification/requirements.txt pip install cchardet pip install scikit-learn