From 9e95a89fb65640dec493f78869464b1cbaec3586 Mon Sep 17 00:00:00 2001 From: Libin Tang Date: Thu, 22 Aug 2024 09:34:10 -0700 Subject: [PATCH 01/41] Fix profiling step with device finish execution for text-generation (#1283) --- optimum/habana/transformers/generation/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index 284f646a48..aa342b392e 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -2032,8 +2032,6 @@ def _contrastive_search( self._pad_past_key_values(model_kwargs) model_kwargs["pad_done"] = True - hb_profer.step() - if hb_gen_time is not None: if not time_to_first_token_done: time_to_first_token_done = True @@ -2041,6 +2039,7 @@ def _contrastive_search( torch_hpu.synchronize() hb_gen_time.step() + hb_profer.step() if ( model_kwargs.get("use_hpu_graphs", False) @@ -2366,7 +2365,6 @@ def _sample( ) this_peer_finished = unfinished_sequences.max() == 0 - hb_profer.step() if hb_gen_time is not None: if not time_to_first_token_done: time_to_first_token_done = True @@ -2374,6 +2372,7 @@ def _sample( torch_hpu.synchronize() hb_gen_time.step() + hb_profer.step() if ( not model_kwargs.get("pad_done", False) @@ -3628,7 +3627,6 @@ def _assisted_decoding( ) this_peer_finished = unfinished_sequences.max() == 0 - hb_profer.step() if hb_gen_time is not None: if not time_to_first_token_done: time_to_first_token_done = True @@ -3636,6 +3634,7 @@ def _assisted_decoding( torch_hpu.synchronize() hb_gen_time.step() + hb_profer.step() if this_peer_finished and not synced_gpus: break From d427f1fabc1766db3534436ae3b06c4ffa4effc3 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Thu, 22 Aug 2024 09:39:42 -0700 Subject: [PATCH 02/41] Revert mark_step in mixtral model from PR #1260 (#1273) --- .../transformers/models/mixtral/modeling_mixtral.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py index fc414e6d76..43dfc7e48a 100644 --- a/optimum/habana/transformers/models/mixtral/modeling_mixtral.py +++ b/optimum/habana/transformers/models/mixtral/modeling_mixtral.py @@ -471,7 +471,6 @@ def forward( reuse_cache: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, cache_idx: int = None, - lazy_mode: Optional[bool] = True, **kwargs, ) -> Tuple[torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]]: """ @@ -481,10 +480,7 @@ def forward( - add new args reuse_cache - add new args flash_attention_recompute - add new args cache_idx - - add new args lazy_mode """ - if lazy_mode: - htcore.mark_step() residual = hidden_states hidden_states = self.input_layernorm(hidden_states) @@ -504,16 +500,12 @@ def forward( cache_idx=cache_idx, ) hidden_states = residual + hidden_states - if lazy_mode: - htcore.mark_step() # Fully Connected residual = hidden_states hidden_states = self.post_attention_layernorm(hidden_states) hidden_states, router_logits = self.block_sparse_moe(hidden_states) hidden_states = residual + hidden_states - if lazy_mode: - htcore.mark_step() outputs = (hidden_states,) @@ -554,7 +546,6 @@ def forward( reuse_cache: Optional[bool] = False, flash_attention_recompute: Optional[bool] = False, cache_idx: int = None, - lazy_mode: Optional[bool] = True, ) -> Union[Tuple, MoeModelOutputWithPast]: """ Copied from MixtralModel.forward: 
https://github.com/huggingface/transformers/blob/v4.37.0/src/transformers/models/mixtral/modeling_mixtral.py#L1069 @@ -684,7 +675,6 @@ def forward( reuse_cache=reuse_cache, flash_attention_recompute=flash_attention_recompute, cache_idx=cache_idx, - lazy_mode=lazy_mode, ) hidden_states = layer_outputs[0] @@ -759,7 +749,6 @@ def forward( reuse_cache: Optional[bool] = None, flash_attention_recompute: Optional[bool] = False, cache_idx: int = None, - lazy_mode: Optional[bool] = True, ) -> Union[Tuple, MoeCausalLMOutputWithPast]: output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_router_logits = ( @@ -788,7 +777,6 @@ def forward( reuse_cache=reuse_cache, flash_attention_recompute=flash_attention_recompute, cache_idx=cache_idx, - lazy_mode=lazy_mode, ) hidden_states = outputs[0] @@ -893,7 +881,6 @@ def prepare_inputs_for_generation( "reuse_cache": reuse_cache, "flash_attention_recompute": kwargs.get("flash_attention_recompute"), "cache_idx": kwargs.get("cache_idx"), - "lazy_mode": kwargs.get("lazy_mode"), } ) return model_inputs From 7526e3c3dfe5e4454e91c600f437b8ff3569d163 Mon Sep 17 00:00:00 2001 From: Dina Suehiro Jones Date: Fri, 23 Aug 2024 05:56:42 -0700 Subject: [PATCH 03/41] Remove huggingface_hub install that is no longer needed in the kubernetes example (#1286) Signed-off-by: dmsuehir --- examples/kubernetes/requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/kubernetes/requirements.txt b/examples/kubernetes/requirements.txt index ee4e985e93..dc06e3de79 100644 --- a/examples/kubernetes/requirements.txt +++ b/examples/kubernetes/requirements.txt @@ -1,3 +1,2 @@ -huggingface_hub==0.23.0 -r optimum-habana/examples/language-modeling/requirements.txt -r optimum-habana/examples/text-classification/requirements.txt From 273ca05da6d18f47f1567eda0211a1b54a1c54d8 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Fri, 23 Aug 2024 06:14:55 -0700 Subject: [PATCH 04/41] Add missing condtion check in tensor creation in greedy search (#1288) --- optimum/habana/transformers/generation/utils.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index aa342b392e..d333986679 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -2194,7 +2194,8 @@ def _sample( # keep track of which sequences are already finished batch_size, cur_len = input_ids.shape this_peer_finished = False - unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) + if not ignore_eos: + unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device) model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs) bucket_size = model_kwargs.get("bucket_size", -1) @@ -2269,9 +2270,7 @@ def _sample( next_token_logits = torch.index_select(outputs.logits, -2, token_idx - 1).squeeze(-2) next_token_scores = logits_processor(input_ids, next_token_logits) else: - # Clone is needed to avoid keeping a hanging ref to outputs.logits which may be very large for first iteration - # (the clone itself is always small) - next_token_logits = outputs.logits[:, -1, :].clone() + next_token_logits = outputs.logits[:, -1, :] if token_idx is not None and self.config.is_encoder_decoder: # case2 (with KV caching): outputs.logits.shape: [batch_size, 1, vocab_size] next_token_scores = logits_processor(input_ids[:, :token_idx], 
next_token_logits) From de9f8b754a14ef71ef79bb349832a3320c2d932b Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 23 Aug 2024 15:15:28 +0200 Subject: [PATCH 05/41] Fix BERT FSDP test (#1281) --- optimum/habana/accelerate/utils/__init__.py | 1 + optimum/habana/accelerate/utils/other.py | 77 +++++++++++++++++++ optimum/habana/transformers/modeling_utils.py | 5 ++ 3 files changed, 83 insertions(+) create mode 100644 optimum/habana/accelerate/utils/other.py diff --git a/optimum/habana/accelerate/utils/__init__.py b/optimum/habana/accelerate/utils/__init__.py index ee25954b95..b862697de1 100755 --- a/optimum/habana/accelerate/utils/__init__.py +++ b/optimum/habana/accelerate/utils/__init__.py @@ -5,6 +5,7 @@ GaudiFullyShardedDataParallelPlugin, GaudiTorchDynamoPlugin, ) +from .other import extract_model_from_parallel from .transformer_engine import ( FP8ContextWrapper, convert_model, diff --git a/optimum/habana/accelerate/utils/other.py b/optimum/habana/accelerate/utils/other.py new file mode 100644 index 0000000000..8062f9d860 --- /dev/null +++ b/optimum/habana/accelerate/utils/other.py @@ -0,0 +1,77 @@ +from types import MethodType + +import torch +from accelerate.utils.constants import FSDP_PYTORCH_VERSION +from accelerate.utils.imports import is_deepspeed_available, is_torch_distributed_available +from accelerate.utils.other import is_compiled_module +from accelerate.utils.transformer_engine import convert_model +from accelerate.utils.versions import is_torch_version + + +def extract_model_from_parallel(model, keep_fp32_wrapper: bool = True, recursive: bool = False): + """ + Adapted from: https://github.com/huggingface/accelerate/blob/v0.33.0/src/accelerate/utils/other.py#L56 + + Changes: + - add a `distributed_model` variable to keep track of the distributed wrapper + and not lose it when setting it back at the end (for compiled models) + + See https://github.com/huggingface/optimum-habana/pull/1281 for more information. 
+ """ + options = (torch.nn.parallel.DistributedDataParallel, torch.nn.DataParallel) + + is_compiled = is_compiled_module(model) + if is_compiled: + compiled_model = model + model = model._orig_mod + + if is_deepspeed_available(): + from deepspeed import DeepSpeedEngine + + options += (DeepSpeedEngine,) + + if is_torch_version(">=", FSDP_PYTORCH_VERSION) and is_torch_distributed_available(): + from torch.distributed.fsdp.fully_sharded_data_parallel import FullyShardedDataParallel as FSDP + + options += (FSDP,) + + # Keep track of the distributed wrapper + # TODO: to revisit as lines 44 to 71 are now useless + distributed_model = model + while isinstance(model, options): + model = model.module + + if recursive: + # This is needed in cases such as using FSDPv2 on XLA + def _recursive_unwrap(module): + # Wrapped modules are standardly wrapped as `module`, similar to the cases earlier + # with DDP, DataParallel, DeepSpeed, and FSDP + if hasattr(module, "module"): + unwrapped_module = _recursive_unwrap(module.module) + else: + unwrapped_module = module + # Next unwrap child sublayers recursively + for name, child in unwrapped_module.named_children(): + setattr(unwrapped_module, name, _recursive_unwrap(child)) + return unwrapped_module + + # Start with top-level + model = _recursive_unwrap(model) + + if not keep_fp32_wrapper: + forward = model.forward + original_forward = model.__dict__.pop("_original_forward", None) + if original_forward is not None: + while hasattr(forward, "__wrapped__"): + forward = forward.__wrapped__ + if forward == original_forward: + break + model.forward = MethodType(forward, model) + if getattr(model, "_converted_to_transformer_engine", False): + convert_model(model, to_transformer_engine=False) + + if is_compiled: + compiled_model._orig_mod = distributed_model + model = compiled_model + + return model diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index 621e391bfb..2b7bb32bce 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -13,9 +13,11 @@ # See the License for the specific language governing permissions and # limitations under the License. +import accelerate import transformers import transformers.utils.fx +from ..accelerate.utils import extract_model_from_parallel from .generation import ( GaudiGenerationConfig, GaudiGenerationMixin, @@ -201,6 +203,9 @@ def adapt_transformers_to_gaudi(): Replaces some Transformers' methods for equivalent methods optimized for Gaudi. 
""" + accelerate.utils.extract_model_from_parallel = extract_model_from_parallel + accelerate.utils.other.extract_model_from_parallel = extract_model_from_parallel + accelerate.accelerator.extract_model_from_parallel = extract_model_from_parallel # models that support symbolic tracing should be added to this list models_with_tracing_support = [] From 7e4d7f1b3ef8dfbe2847a79cdb51a4fb670d9e18 Mon Sep 17 00:00:00 2001 From: Thanaji Rao Thakkalapelli Date: Fri, 23 Aug 2024 13:16:16 -0700 Subject: [PATCH 06/41] Llava: Added flash_attention_recompute arg to provide an option to enable/disable recompute (#1278) --- examples/image-to-text/README.md | 14 ++++++++++---- examples/image-to-text/run_pipeline.py | 6 ++++++ .../transformers/models/clip/modeling_clip.py | 18 +++++++++++++++--- .../models/llava/modeling_llava.py | 10 ++++++++-- .../models/llava_next/modeling_llava_next.py | 12 ++++++++++-- 5 files changed, 49 insertions(+), 11 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 0f1a2624d4..97494e6846 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -145,7 +145,8 @@ python3 run_pipeline.py \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ --bf16 \ - --use_flash_attention + --use_flash_attention \ + --flash_attention_recompute ``` @@ -156,7 +157,8 @@ python3 run_pipeline.py \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ --bf16 \ - --use_flash_attention + --use_flash_attention \ + --flash_attention_recompute ``` @@ -168,7 +170,9 @@ QUANT_CONFIG=./quantization_config/maxabs_measure.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ ---bf16 --use_flash_attention +--bf16 \ +--use_flash_attention \ +--flash_attention_recompute ``` Here is an example of quantizing the model based on previous measurements for Llava-v1.6-mistral-7b: @@ -177,5 +181,7 @@ QUANT_CONFIG=./quantization_config/maxabs_quant.json python run_pipeline.py \ --model_name_or_path llava-hf/llava-v1.6-mistral-7b-hf \ --image_path "https://llava-vl.github.io/static/images/view.jpg" \ --use_hpu_graphs \ ---bf16 --use_flash_attention +--bf16 \ +--use_flash_attention \ +--flash_attention_recompute ``` diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 239d6fa4e4..8ef45c4f61 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -96,6 +96,11 @@ def main(): action="store_true", help="Whether to enable Habana Flash Attention, provided that the model supports it.", ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. 
This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) args = parser.parse_args() @@ -156,6 +161,7 @@ def main(): "max_new_tokens": args.max_new_tokens, "ignore_eos": args.ignore_eos, "use_flash_attention": args.use_flash_attention, + "flash_attention_recompute": args.flash_attention_recompute, } if args.use_hpu_graphs: from habana_frameworks.torch.hpu import wrap_in_hpu_graph diff --git a/optimum/habana/transformers/models/clip/modeling_clip.py b/optimum/habana/transformers/models/clip/modeling_clip.py index b22c61972d..b7fb3a222e 100644 --- a/optimum/habana/transformers/models/clip/modeling_clip.py +++ b/optimum/habana/transformers/models/clip/modeling_clip.py @@ -78,11 +78,13 @@ def forward( causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: """ Copied from CLIPAttention.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention to enable FusedSDPA + - add new args flash_attention_recompute """ bsz, tgt_len, embed_dim = hidden_states.size() attn_weights_reshaped = None @@ -100,8 +102,7 @@ def forward( if FusedSDPA and use_flash_attention: import habana_frameworks.torch.hpu as ht - use_recompute = not self.training - with ht.sdp_kernel(enable_recompute=use_recompute): + with ht.sdp_kernel(enable_recompute=flash_attention_recompute): attn_output = self.fused_scaled_dot_product_attention( query_states, key_states, value_states, attention_mask, self.dropout, False, 1, "fast" ) @@ -178,11 +179,13 @@ def forward( causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Tuple[torch.FloatTensor]: """ Copied from CLIPEncoderLayer.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention + - add new args flash_attention_recompute """ residual = hidden_states @@ -193,6 +196,7 @@ def forward( causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) hidden_states = residual + hidden_states @@ -219,11 +223,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutput]: """ Copied from CLIPEncoder.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention + - add new args flash_attention_recompute """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -245,7 +251,6 @@ def forward( attention_mask, causal_attention_mask, output_attentions, - use_flash_attention=use_flash_attention, ) else: layer_outputs = encoder_layer( @@ -254,6 +259,7 @@ def forward( causal_attention_mask, output_attentions=output_attentions, 
use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) hidden_states = layer_outputs[0] @@ -279,11 +285,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: """ Copied from CLIPVisionTransformer.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention + - add new args flash_attention_recompute """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( @@ -303,6 +311,7 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) last_hidden_state = encoder_outputs[0] @@ -328,11 +337,13 @@ def forward( output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, BaseModelOutputWithPooling]: """ Copied from CLIPVisionModel.forward: https://github.com/huggingface/transformers/blob/ab0f050b42d903f34d6eb97f3f8c0c07f0517ad2/src/transformers/models/clip/modeling_clip.py The only differences are: - add new args use_flash_attention + - add new args flash_attention_recompute """ return_dict = return_dict if return_dict is not None else self.config.use_return_dict @@ -342,4 +353,5 @@ def forward( output_hidden_states=output_hidden_states, return_dict=return_dict, use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) diff --git a/optimum/habana/transformers/models/llava/modeling_llava.py b/optimum/habana/transformers/models/llava/modeling_llava.py index fa3a321e77..8119f442c5 100644 --- a/optimum/habana/transformers/models/llava/modeling_llava.py +++ b/optimum/habana/transformers/models/llava/modeling_llava.py @@ -124,6 +124,7 @@ def forward( image_offset: Optional[int] = None, tokens_pos: Optional[torch.LongTensor] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, LlavaCausalLMOutputWithPast]: """ Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/llava/modeling_llava.py @@ -154,7 +155,10 @@ def forward( # 2. Merge text and images if pixel_values is not None and input_ids.shape[1] != 1: image_outputs = self.vision_tower( - pixel_values, output_hidden_states=True, use_flash_attention=use_flash_attention + pixel_values, + output_hidden_states=True, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. 
selected_image_feature = image_outputs.hidden_states[vision_feature_layer] @@ -184,7 +188,7 @@ def forward( return_dict=return_dict, token_idx=token_idx + image_offset, use_flash_attention=use_flash_attention, - flash_attention_recompute=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) if input_ids.shape[1] != 1 and pixel_values is not None: @@ -296,6 +300,7 @@ def prepare_inputs_for_generation( else: model_inputs = {"input_ids": input_ids} use_flash_attention = kwargs.get("use_flash_attention", False) + flash_attention_recompute = kwargs.get("flash_attention_recompute", False) model_inputs.update( { "position_ids": position_ids, @@ -307,6 +312,7 @@ def prepare_inputs_for_generation( "image_offset": image_offset, "tokens_pos": tokens_pos, "use_flash_attention": use_flash_attention, + "flash_attention_recompute": flash_attention_recompute, } ) diff --git a/optimum/habana/transformers/models/llava_next/modeling_llava_next.py b/optimum/habana/transformers/models/llava_next/modeling_llava_next.py index fdf9276123..4670469e9e 100644 --- a/optimum/habana/transformers/models/llava_next/modeling_llava_next.py +++ b/optimum/habana/transformers/models/llava_next/modeling_llava_next.py @@ -55,11 +55,14 @@ def forward( return_dict: Optional[bool] = None, token_idx: Optional[torch.Tensor] = None, use_flash_attention: Optional[bool] = False, + flash_attention_recompute: Optional[bool] = False, ) -> Union[Tuple, LlavaNextCausalLMOutputWithPast]: """ Inherits from LlavaForConditionalGeneration: https://github.com/huggingface/transformers/blob/v4.40.0/src/transformers/models/llava_next/modeling_llava_next.py#L433 The only differences are: - add new args token_idx + - add new args use_flash_attention + - add new args flash_attention_recompute - Moved the process of merging images into inputs_embeds into prepare_inputs_for_generation """ @@ -83,7 +86,7 @@ def forward( return_dict=return_dict, token_idx=token_idx + self.image_offset, use_flash_attention=use_flash_attention, - flash_attention_recompute=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) if inputs_embeds.shape[1] != 1 and pixel_values is not None: @@ -248,6 +251,7 @@ def prepare_inputs_for_generation( ) else: use_flash_attention = kwargs.get("use_flash_attention", False) + flash_attention_recompute = kwargs.get("flash_attention_recompute", False) position_ids = kwargs.get("position_ids", None) labels = kwargs.get("labels", None) if past_key_values is None and pixel_values is not None and input_ids.shape[1] != 1: @@ -268,7 +272,10 @@ def prepare_inputs_for_generation( batch_size, num_patches, num_channels, height, width = pixel_values.shape reshaped_pixel_values = pixel_values.view(batch_size * num_patches, num_channels, height, width) image_features = self.vision_tower( - reshaped_pixel_values, output_hidden_states=True, use_flash_attention=use_flash_attention + reshaped_pixel_values, + output_hidden_states=True, + use_flash_attention=use_flash_attention, + flash_attention_recompute=flash_attention_recompute, ) selected_image_feature = image_features.hidden_states[vision_feature_layer] @@ -390,6 +397,7 @@ def prepare_inputs_for_generation( "image_sizes": image_sizes, "labels": labels, "use_flash_attention": use_flash_attention, + "flash_attention_recompute": flash_attention_recompute, } ) From 19bb1d911331f2ccfd96bb3ba9530275ca124f1c Mon Sep 17 00:00:00 2001 From: Sayantan Sarkar Date: Fri, 23 Aug 2024 13:42:33 -0700 Subject: [PATCH 07/41] Get seq len fix propagate (#1291) Co-authored-by: 
Libin Tang Co-authored-by: regisss <15324346+regisss@users.noreply.github.com> --- examples/text-generation/README.md | 2 +- optimum/habana/transformers/models/phi/modeling_phi.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 29b754731d..b720936ff4 100755 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -284,7 +284,7 @@ PT_ENABLE_INT64_SUPPORT=1 PT_HPU_LAZY_MODE=0 python ../gaudi_spawn.py --world_s ### Running with FP8 -Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B and phi-2 in FP8 are enabled using the Intel Neural Compressor (INC), which provides model measurement and quantization capabilities in PyTorch. +Llama2-70b, Llama2-7b, Llama3-70b, Llama3-8b, Mixtral-8x7B, Falcon-7B, Falcon-40B, Falcon-180B and phi-2 in FP8 are enabled using the [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. From synapse 1.17 / optimum-habana 1.13 release, INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`. More information on enabling fp8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html diff --git a/optimum/habana/transformers/models/phi/modeling_phi.py b/optimum/habana/transformers/models/phi/modeling_phi.py index 07f4d0cd71..1e21735add 100644 --- a/optimum/habana/transformers/models/phi/modeling_phi.py +++ b/optimum/habana/transformers/models/phi/modeling_phi.py @@ -430,7 +430,12 @@ def forward( inputs_embeds = self.embed_tokens(input_ids) if cache_position is None: - past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + past_seen_tokens = 0 + if past_key_values is not None: + if isinstance(past_key_values, Cache): + past_seen_tokens = past_key_values.get_seq_length() + else: + past_seen_tokens = past_key_values[0][0].shape[2] cache_position = torch.arange( past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device ) From 586b7730f019092b03fbc8f4ca1369f49bae98c1 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Sun, 25 Aug 2024 13:35:39 +0000 Subject: [PATCH 08/41] Update last stable release in README --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2edc24d2f8..b78a19080e 100644 --- a/README.md +++ b/README.md @@ -59,9 +59,9 @@ The `--upgrade-strategy eager` option is needed to ensure `optimum-habana` is up To use the example associated with the latest stable release, run: > ``` > git clone https://github.com/huggingface/optimum-habana -> cd optimum-habana && git checkout v1.13.0 +> cd optimum-habana && git checkout v1.13.1 > ``` -> with `v1.13.0` the version number of this release. +> with `v1.13.1` the version number of this release. 
### Option 2: Use the latest main branch under development From 34ee57bfdfedbd37d979f2c87e202220dd750c19 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:38:52 +0000 Subject: [PATCH 09/41] Update minimal required versions in examples --- examples/audio-classification/run_audio_classification.py | 2 +- examples/contrastive-image-text/run_bridgetower.py | 2 +- examples/contrastive-image-text/run_clip.py | 2 +- examples/image-classification/run_image_classification.py | 2 +- examples/language-modeling/run_clm.py | 2 +- examples/language-modeling/run_lora_clm.py | 2 +- examples/language-modeling/run_mlm.py | 2 +- examples/language-modeling/run_multitask_prompt_tuning.py | 4 ++-- examples/language-modeling/run_prompt_tuning_clm.py | 4 ++-- examples/protein-folding/run_esmfold.py | 2 +- examples/protein-folding/run_sequence_classification.py | 2 +- examples/protein-folding/run_zero_shot_eval.py | 2 +- examples/question-answering/run_qa.py | 2 +- examples/question-answering/run_seq2seq_qa.py | 2 +- examples/speech-recognition/run_speech_recognition_ctc.py | 2 +- examples/speech-recognition/run_speech_recognition_seq2seq.py | 2 +- examples/stable-diffusion/image_to_image_generation.py | 2 +- examples/stable-diffusion/image_to_video_generation.py | 2 +- examples/stable-diffusion/text_to_image_generation.py | 2 +- examples/stable-diffusion/training/textual_inversion.py | 2 +- examples/stable-diffusion/training/train_controlnet.py | 4 ++-- .../stable-diffusion/training/train_dreambooth_lora_sdxl.py | 2 +- .../stable-diffusion/training/train_text_to_image_sdxl.py | 2 +- examples/stable-diffusion/unconditional_image_generation.py | 4 ++-- examples/summarization/run_summarization.py | 2 +- examples/text-classification/run_glue.py | 2 +- examples/translation/run_translation.py | 2 +- tests/example_diff/run_audio_classification.txt | 2 +- tests/example_diff/run_clip.txt | 2 +- tests/example_diff/run_clm.txt | 2 +- tests/example_diff/run_glue.txt | 2 +- tests/example_diff/run_image_classification.txt | 2 +- tests/example_diff/run_mlm.txt | 2 +- tests/example_diff/run_qa.txt | 2 +- tests/example_diff/run_seq2seq_qa.txt | 2 +- tests/example_diff/run_speech_recognition_ctc.txt | 2 +- tests/example_diff/run_speech_recognition_seq2seq.txt | 2 +- tests/example_diff/run_summarization.txt | 2 +- tests/example_diff/run_translation.txt | 2 +- 39 files changed, 43 insertions(+), 43 deletions(-) diff --git a/examples/audio-classification/run_audio_classification.py b/examples/audio-classification/run_audio_classification.py index 86dc6627dd..b05e6dfb51 100644 --- a/examples/audio-classification/run_audio_classification.py +++ b/examples/audio-classification/run_audio_classification.py @@ -47,7 +47,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.14.0", "To fix: pip install -r examples/pytorch/audio-classification/requirements.txt") diff --git a/examples/contrastive-image-text/run_bridgetower.py b/examples/contrastive-image-text/run_bridgetower.py index c22682203e..11ff5a55b0 100644 --- a/examples/contrastive-image-text/run_bridgetower.py +++ b/examples/contrastive-image-text/run_bridgetower.py @@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/contrastive-image-text/run_clip.py b/examples/contrastive-image-text/run_clip.py index 2358412de6..941dade8f9 100644 --- a/examples/contrastive-image-text/run_clip.py +++ b/examples/contrastive-image-text/run_clip.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/contrastive-image-text/requirements.txt") diff --git a/examples/image-classification/run_image_classification.py b/examples/image-classification/run_image_classification.py index 7bd1d23c4d..4d2e229db1 100644 --- a/examples/image-classification/run_image_classification.py +++ b/examples/image-classification/run_image_classification.py @@ -64,7 +64,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/image-classification/requirements.txt") diff --git a/examples/language-modeling/run_clm.py b/examples/language-modeling/run_clm.py index 3e372d17a6..ec6b345d89 100644 --- a/examples/language-modeling/run_clm.py +++ b/examples/language-modeling/run_clm.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py index f1c39f6db7..4640bb0dc9 100644 --- a/examples/language-modeling/run_lora_clm.py +++ b/examples/language-modeling/run_lora_clm.py @@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.10.0") +check_optimum_habana_min_version("1.14.0.dev0") @dataclass diff --git a/examples/language-modeling/run_mlm.py b/examples/language-modeling/run_mlm.py index 7fb0ce8494..7a660447b8 100644 --- a/examples/language-modeling/run_mlm.py +++ b/examples/language-modeling/run_mlm.py @@ -62,7 +62,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_multitask_prompt_tuning.py b/examples/language-modeling/run_multitask_prompt_tuning.py index 9f7d10655c..48f9cefcb7 100644 --- a/examples/language-modeling/run_multitask_prompt_tuning.py +++ b/examples/language-modeling/run_multitask_prompt_tuning.py @@ -60,8 +60,8 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risk. -check_min_version("4.38.0") -check_optimum_habana_min_version("1.10.0") +check_min_version("4.43.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/language-modeling/run_prompt_tuning_clm.py b/examples/language-modeling/run_prompt_tuning_clm.py index 42798c0d5e..2d2b9c4c3e 100644 --- a/examples/language-modeling/run_prompt_tuning_clm.py +++ b/examples/language-modeling/run_prompt_tuning_clm.py @@ -62,8 +62,8 @@ def check_optimum_habana_min_version(*a, **b): logger = logging.getLogger(__name__) # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. -check_min_version("4.38.0") -check_optimum_habana_min_version("1.10.0") +check_min_version("4.43.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") diff --git a/examples/protein-folding/run_esmfold.py b/examples/protein-folding/run_esmfold.py index 489faea855..4fa8d7a441 100644 --- a/examples/protein-folding/run_esmfold.py +++ b/examples/protein-folding/run_esmfold.py @@ -40,7 +40,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") def convert_outputs_to_pdb(outputs): diff --git a/examples/protein-folding/run_sequence_classification.py b/examples/protein-folding/run_sequence_classification.py index 8590e4eaa9..dde75a2564 100644 --- a/examples/protein-folding/run_sequence_classification.py +++ b/examples/protein-folding/run_sequence_classification.py @@ -41,7 +41,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. 
-check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) diff --git a/examples/protein-folding/run_zero_shot_eval.py b/examples/protein-folding/run_zero_shot_eval.py index 348665c59d..3b475883e8 100644 --- a/examples/protein-folding/run_zero_shot_eval.py +++ b/examples/protein-folding/run_zero_shot_eval.py @@ -36,7 +36,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logging.basicConfig( diff --git a/examples/question-answering/run_qa.py b/examples/question-answering/run_qa.py index 6e0c35620f..b983055f31 100644 --- a/examples/question-answering/run_qa.py +++ b/examples/question-answering/run_qa.py @@ -61,7 +61,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/question-answering/run_seq2seq_qa.py b/examples/question-answering/run_seq2seq_qa.py index e9e789b440..8249e659a1 100644 --- a/examples/question-answering/run_seq2seq_qa.py +++ b/examples/question-answering/run_seq2seq_qa.py @@ -57,7 +57,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") diff --git a/examples/speech-recognition/run_speech_recognition_ctc.py b/examples/speech-recognition/run_speech_recognition_ctc.py index f494d5ea29..c1367e0668 100644 --- a/examples/speech-recognition/run_speech_recognition_ctc.py +++ b/examples/speech-recognition/run_speech_recognition_ctc.py @@ -60,7 +60,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/speech-recognition/run_speech_recognition_seq2seq.py b/examples/speech-recognition/run_speech_recognition_seq2seq.py index 66ed34f476..e9abca3b92 100755 --- a/examples/speech-recognition/run_speech_recognition_seq2seq.py +++ b/examples/speech-recognition/run_speech_recognition_seq2seq.py @@ -56,7 +56,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") diff --git a/examples/stable-diffusion/image_to_image_generation.py b/examples/stable-diffusion/image_to_image_generation.py index 1a11d6eef9..64fecf4c91 100755 --- a/examples/stable-diffusion/image_to_image_generation.py +++ b/examples/stable-diffusion/image_to_image_generation.py @@ -40,7 +40,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/image_to_video_generation.py b/examples/stable-diffusion/image_to_video_generation.py index c9142f0c0e..048f699ce2 100755 --- a/examples/stable-diffusion/image_to_video_generation.py +++ b/examples/stable-diffusion/image_to_video_generation.py @@ -34,7 +34,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py index 035e486061..1fa231f992 100755 --- a/examples/stable-diffusion/text_to_image_generation.py +++ b/examples/stable-diffusion/text_to_image_generation.py @@ -39,7 +39,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") logger = logging.getLogger(__name__) diff --git a/examples/stable-diffusion/training/textual_inversion.py b/examples/stable-diffusion/training/textual_inversion.py index f968ac808c..db488f8749 100644 --- a/examples/stable-diffusion/training/textual_inversion.py +++ b/examples/stable-diffusion/training/textual_inversion.py @@ -79,7 +79,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") +check_min_version("0.29.0") logger = get_logger(__name__) diff --git a/examples/stable-diffusion/training/train_controlnet.py b/examples/stable-diffusion/training/train_controlnet.py index 0dd6a0102b..696b54baca 100644 --- a/examples/stable-diffusion/training/train_controlnet.py +++ b/examples/stable-diffusion/training/train_controlnet.py @@ -68,12 +68,12 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. -check_optimum_habana_min_version("1.10.0") +check_optimum_habana_min_version("1.14.0.dev0") if is_wandb_available(): import wandb # Will error if the minimal version of diffusers is not installed. Remove at your own risks. 
-check_min_version("0.26.0") +check_min_version("0.29.0") logger = get_logger(__name__) diff --git a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py index ea34c50773..b177cf12e6 100644 --- a/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py +++ b/examples/stable-diffusion/training/train_dreambooth_lora_sdxl.py @@ -75,7 +75,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") +check_min_version("0.29.0") logger = get_logger(__name__) diff --git a/examples/stable-diffusion/training/train_text_to_image_sdxl.py b/examples/stable-diffusion/training/train_text_to_image_sdxl.py index 46a1f1c150..c9d84ae1b9 100644 --- a/examples/stable-diffusion/training/train_text_to_image_sdxl.py +++ b/examples/stable-diffusion/training/train_text_to_image_sdxl.py @@ -73,7 +73,7 @@ # Will error if the minimal version of diffusers is not installed. Remove at your own risks. -check_min_version("0.26.0") +check_min_version("0.29.0") logger = get_logger(__name__, log_level="INFO") diff --git a/examples/stable-diffusion/unconditional_image_generation.py b/examples/stable-diffusion/unconditional_image_generation.py index 93ebb59824..df0575c0a7 100644 --- a/examples/stable-diffusion/unconditional_image_generation.py +++ b/examples/stable-diffusion/unconditional_image_generation.py @@ -19,8 +19,8 @@ def check_optimum_habana_min_version(*a, **b): return () -check_min_version("4.37.0") -check_optimum_habana_min_version("1.10.4") +check_min_version("4.43.0") +check_optimum_habana_min_version("1.14.0.dev0") # Setup logging logging.basicConfig( diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py index 122477aed4..ea5e002450 100755 --- a/examples/summarization/run_summarization.py +++ b/examples/summarization/run_summarization.py @@ -66,7 +66,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") diff --git a/examples/text-classification/run_glue.py b/examples/text-classification/run_glue.py index 5f5cb45b1b..9dfd2adcfc 100755 --- a/examples/text-classification/run_glue.py +++ b/examples/text-classification/run_glue.py @@ -58,7 +58,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/text-classification/requirements.txt") diff --git a/examples/translation/run_translation.py b/examples/translation/run_translation.py index 0dec28ed39..8d13b39923 100644 --- a/examples/translation/run_translation.py +++ b/examples/translation/run_translation.py @@ -63,7 +63,7 @@ def check_optimum_habana_min_version(*a, **b): # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
check_min_version("4.43.0") -check_optimum_habana_min_version("1.13.0") +check_optimum_habana_min_version("1.14.0.dev0") require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") diff --git a/tests/example_diff/run_audio_classification.txt b/tests/example_diff/run_audio_classification.txt index 1314c4bebd..5e98ce8248 100644 --- a/tests/example_diff/run_audio_classification.txt +++ b/tests/example_diff/run_audio_classification.txt @@ -34,7 +34,7 @@ --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 174,176d175 < freeze_feature_extractor: Optional[bool] = field( < default=None, metadata={"help": "Whether to freeze the feature extractor layers of the model."} diff --git a/tests/example_diff/run_clip.txt b/tests/example_diff/run_clip.txt index 3999665da1..f57b3b3240 100644 --- a/tests/example_diff/run_clip.txt +++ b/tests/example_diff/run_clip.txt @@ -29,7 +29,7 @@ --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 181a190,192 > mediapipe_dataloader: bool = field( > default=False, metadata={"help": "Turn on MediaPipe hardware-based accelerated data loading."} diff --git a/tests/example_diff/run_clm.txt b/tests/example_diff/run_clm.txt index 00bb6f6097..580f3c9684 100644 --- a/tests/example_diff/run_clm.txt +++ b/tests/example_diff/run_clm.txt @@ -39,7 +39,7 @@ 63a64,69 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") > diff --git a/tests/example_diff/run_glue.txt b/tests/example_diff/run_glue.txt index 282e3cd6b2..26d2e245c0 100644 --- a/tests/example_diff/run_glue.txt +++ b/tests/example_diff/run_glue.txt @@ -28,7 +28,7 @@ > > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 67,68d76 < logger = logging.getLogger(__name__) < diff --git a/tests/example_diff/run_image_classification.txt b/tests/example_diff/run_image_classification.txt index 31b247a8ab..7a3e696fd6 100644 --- a/tests/example_diff/run_image_classification.txt +++ b/tests/example_diff/run_image_classification.txt @@ -29,7 +29,7 @@ --- > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 184c192 < parser = HfArgumentParser((ModelArguments, DataTrainingArguments, TrainingArguments)) --- diff --git a/tests/example_diff/run_mlm.txt b/tests/example_diff/run_mlm.txt index 698be685c5..a3e97b56c7 100644 --- a/tests/example_diff/run_mlm.txt +++ b/tests/example_diff/run_mlm.txt @@ -35,7 +35,7 @@ > > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
> check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=2.14.0", "To fix: pip install -r examples/pytorch/language-modeling/requirements.txt") > diff --git a/tests/example_diff/run_qa.txt b/tests/example_diff/run_qa.txt index 60c1e52e31..4d289c5faa 100644 --- a/tests/example_diff/run_qa.txt +++ b/tests/example_diff/run_qa.txt @@ -33,7 +33,7 @@ 58a62,67 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") > diff --git a/tests/example_diff/run_seq2seq_qa.txt b/tests/example_diff/run_seq2seq_qa.txt index 78033eeafa..96bcd84b82 100644 --- a/tests/example_diff/run_seq2seq_qa.txt +++ b/tests/example_diff/run_seq2seq_qa.txt @@ -25,7 +25,7 @@ 54a58,63 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/question-answering/requirements.txt") > diff --git a/tests/example_diff/run_speech_recognition_ctc.txt b/tests/example_diff/run_speech_recognition_ctc.txt index 3d366814c3..d9bb9d115e 100644 --- a/tests/example_diff/run_speech_recognition_ctc.txt +++ b/tests/example_diff/run_speech_recognition_ctc.txt @@ -26,7 +26,7 @@ 59a61,66 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.18.0", "To fix: pip install -r examples/pytorch/speech-recognition/requirements.txt") > diff --git a/tests/example_diff/run_speech_recognition_seq2seq.txt b/tests/example_diff/run_speech_recognition_seq2seq.txt index 847b742bb4..0fce8cc3e0 100644 --- a/tests/example_diff/run_speech_recognition_seq2seq.txt +++ b/tests/example_diff/run_speech_recognition_seq2seq.txt @@ -23,7 +23,7 @@ < check_min_version("4.45.0.dev0") --- > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") 230a239,242 > label_features_max_length: int = field( > default=None, diff --git a/tests/example_diff/run_summarization.txt b/tests/example_diff/run_summarization.txt index 6bf6dc6aba..aaa348da39 100644 --- a/tests/example_diff/run_summarization.txt +++ b/tests/example_diff/run_summarization.txt @@ -37,7 +37,7 @@ 60a67,72 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. 
> check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/summarization/requirements.txt") > diff --git a/tests/example_diff/run_translation.txt b/tests/example_diff/run_translation.txt index 8d17c3c087..95f2749242 100644 --- a/tests/example_diff/run_translation.txt +++ b/tests/example_diff/run_translation.txt @@ -29,7 +29,7 @@ 60a64,69 > # Will error if the minimal version of Transformers and Optimum Habana are not installed. Remove at your own risks. > check_min_version("4.43.0") -> check_optimum_habana_min_version("1.13.0") +> check_optimum_habana_min_version("1.14.0.dev0") > > require_version("datasets>=1.8.0", "To fix: pip install -r examples/pytorch/translation/requirements.txt") > From 6b2a5f843c963a9419ad246c77c3a39d671da198 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Thu, 29 Aug 2024 01:17:47 -0700 Subject: [PATCH 10/41] Update FusedSDPA calling method as Gaudi documentation (#1285) --- .../gpt_bigcode/modeling_gpt_bigcode.py | 66 +++++++++++-------- 1 file changed, 37 insertions(+), 29 deletions(-) diff --git a/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py b/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py index 8aee605480..7d2a065593 100644 --- a/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +++ b/optimum/habana/transformers/models/gpt_bigcode/modeling_gpt_bigcode.py @@ -21,7 +21,16 @@ def gaudi_flash_attn_v1( - query_layer, key_layer, value_layer, attention_mask, dropout_rate, is_causal, scale, softmax_mode, q_block_size + query_layer, + key_layer, + value_layer, + attention_mask, + dropout_rate, + is_causal, + scale, + softmax_mode, + enable_recompute, + q_block_size, ): """ Gaudi version of Flash Attention V1 to support long sequence at prompt phase @@ -42,7 +51,7 @@ def gaudi_flash_attn_v1( row_q = query_layer[:, :, s:e, :] row_mask = attention_mask[:, :, s:e, :] attn_output_partial = FusedSDPA.apply( - row_q, key_layer, value_layer, row_mask, dropout_rate, is_causal, scale, softmax_mode + row_q, key_layer, value_layer, row_mask, dropout_rate, is_causal, scale, softmax_mode, enable_recompute ) row_o_list.append(attn_output_partial) attn_output = torch.cat(row_o_list, dim=-2) @@ -106,33 +115,32 @@ def apply_FusedSDPA( else: use_causal_mask = self.is_causal and attention_mask is None and query_length > 1 - import habana_frameworks.torch.hpu as ht - - with ht.sdp_kernel(enable_recompute=enable_recompute): - if query_length > 8192: - sdpa_result = gaudi_flash_attn_v1( - query, - key, - value, - attention_mask, - self.attn_pdrop if self.training else 0.0, - use_causal_mask, - scale, - "fast" if flash_attention_fast_softmax else "None", - 4096, - ) - htcore.mark_step() - else: - sdpa_result = FusedSDPA.apply( - query, - key, - value, - attention_mask, - self.attn_pdrop if self.training else 0.0, - use_causal_mask, - scale, - "fast" if flash_attention_fast_softmax else "None", - ) + if query_length > 8192: + sdpa_result = gaudi_flash_attn_v1( + query, + key, + value, + attention_mask, + self.attn_pdrop if self.training else 0.0, + use_causal_mask, + scale, + "fast" if flash_attention_fast_softmax else "None", + enable_recompute, + 4096, + ) + htcore.mark_step() + else: + sdpa_result = FusedSDPA.apply( + query, + key, + value, + attention_mask, + self.attn_pdrop if self.training else 0.0, + use_causal_mask, + scale, + "fast" if flash_attention_fast_softmax 
else "None", + enable_recompute, + ) if self.multi_query: # (batch_size, num_heads, seq_len, head_dim) --> (batch_size, seq_len, num_heads, head_dim) From 0278196212e45ae8d54ad6a3b13bc31970e1f5af Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Thu, 29 Aug 2024 07:10:52 -0700 Subject: [PATCH 11/41] Mixtral fp8 tests (#1269) --- tests/test_text_generation_example.py | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index c2ac38e873..9c4e983576 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -57,7 +57,11 @@ ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 128, 2048, 6979.225194247115), ("mistralai/Mistral-7B-Instruct-v0.2", 1, 120, True, 2048, 128, 1681.4401450088983), ("mistralai/Mistral-7B-Instruct-v0.2", 1, 44, True, 2048, 2048, 3393.149396451692), - ("mistralai/Mixtral-8x7B-v0.1", 1, 1, True, 128, 128, 39.26845661768185), + ("mistralai/Mixtral-8x7B-v0.1", 1, 1, True, 128, 128, 40.94), + ("mistralai/Mixtral-8x7B-v0.1", 2, 768, True, 128, 128, 3428.65), + ("mistralai/Mixtral-8x7B-v0.1", 2, 96, True, 128, 2048, 2570.34), + ("mistralai/Mixtral-8x7B-v0.1", 2, 96, True, 2048, 128, 379.03), + ("mistralai/Mixtral-8x7B-v0.1", 2, 48, True, 2048, 2048, 1147.50), ("microsoft/phi-2", 1, 1, True, 128, 128, 254.08932787178165), ], "deepspeed": [ @@ -200,6 +204,9 @@ def _test_text_generation( command.insert(-2, "--flash_attention_recompute") command.insert(-2, "--attn_softmax_bf16") command.insert(-2, "--trim_logits") + if "Mixtral" in model_name: + command.insert(-2, "--bucket_size 128") + command.insert(-2, "--bucket_internal") elif "falcon-180b" in model_name.lower(): command.insert(-2, "--flash_attention_recompute") @@ -254,9 +261,14 @@ def _test_text_generation( e.args = (f"The following command failed:\n{' '.join(measure_command[:-2])}",) raise - env_variables["QUANT_CONFIG"] = os.path.join( - path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json" - ) + if "Mixtral" in model_name: + env_variables["QUANT_CONFIG"] = os.path.join( + path_to_example_dir, "text-generation/quantization_config/maxabs_quant_mixtral.json" + ) + else: + env_variables["QUANT_CONFIG"] = os.path.join( + path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json" + ) command = [x for y in command for x in re.split(pattern, y) if x] print(f"\n\nCommand to test: {' '.join(command[:-2])}\n") From d9c28e0c4b122648f0b8c0a480a5ad2dbb70e486 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 29 Aug 2024 18:52:47 +0200 Subject: [PATCH 12/41] Switch failed code quality check comment to `workflow_run` (#1297) --- .github/workflows/check_code_quality.yml | 16 +++++++----- .../failed_code_quality_check_comment.yml | 26 ++++++++++++++----- 2 files changed, 30 insertions(+), 12 deletions(-) diff --git a/.github/workflows/check_code_quality.yml b/.github/workflows/check_code_quality.yml index be453de234..d68db62369 100644 --- a/.github/workflows/check_code_quality.yml +++ b/.github/workflows/check_code_quality.yml @@ -35,9 +35,13 @@ jobs: source venv/bin/activate ruff check . setup.py ruff format --check . 
setup.py - post-comment: - if: failure() && github.event_name == 'pull_request' - needs: check - uses: ./.github/workflows/failed_code_quality_check_comment.yml - with: - pr-number: ${{ github.event.number }} + - name: Store PR number if failure to post comment + if: failure() && github.event_name == 'pull_request' + env: + PR_NUMBER: ${{ github.event.number }} + run: echo $PR_NUMBER > ./pr_number + - uses: actions/upload-artifact@v4 + if: failure() && github.event_name == 'pull_request' + with: + name: pr-number + path: ./pr_number diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index 94aa22c70d..c5dfce0e2f 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -1,18 +1,32 @@ name: Post comment in PR for failed code quality check on: - workflow_call: - inputs: - pr-number: - required: true - type: number + workflow_run: + workflows: ["Check code quality"] + types: + - completed jobs: post-comment: runs-on: ubuntu-latest + if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: + - uses: actions/download-artifact@v4 + with: + name: pr-number + - name: Get PR number + id: github-context + run: | + content_pr_number=$(cat ./pr_number) + if [[ $content_pr_number =~ ^[0-9]+$ ]]; then + echo "pr_number=$content_pr_number" >> $GITHUB_OUTPUT + rm -rf ./pr_number + else + echo "Encountered an invalid PR number" + exit 1 + fi - uses: peter-evans/create-or-update-comment@v4 with: - issue-number: ${{ inputs.pr-number }} + issue-number: ${{ steps.github-context.outputs.pr_number }} body: The code quality check failed, please run `make style`. 
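Side note on the artifact hand-off introduced above: the code-quality job runs on `pull_request`, where the token for fork PRs is read-only, so it can only upload the PR number as an artifact; the comment itself is posted by the separate `workflow_run` job, which runs with the base repository's permissions. The Python sketch below only illustrates what that final step amounts to once the artifact has been downloaded; it is not part of the patch series, and the repository slug, token variable, and file path are placeholders.

```python
# Illustration only (not part of these patches): what the workflow_run job does
# after downloading the "pr-number" artifact written by check_code_quality.yml.
# Placeholders/assumptions: GITHUB_TOKEN env var, repo slug, pr_number file path.
import os

import requests


def post_style_comment(repo: str, pr_number_file: str = "pr_number") -> None:
    # The artifact contains nothing but the PR number written by the failing job.
    with open(pr_number_file) as f:
        pr_number = f.read().strip()
    if not pr_number.isdigit():
        # Mirrors the validation done in the workflow before posting anything.
        raise ValueError("Encountered an invalid PR number")
    # PR comments are posted through the issues endpoint of the GitHub REST API.
    url = f"https://api.github.com/repos/{repo}/issues/{pr_number}/comments"
    headers = {"Authorization": f"Bearer {os.environ['GITHUB_TOKEN']}"}
    payload = {"body": "The code quality check failed, please run `make style`."}
    requests.post(url, headers=headers, json=payload).raise_for_status()


if __name__ == "__main__":
    post_style_comment("huggingface/optimum-habana")
```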
From 33d281f809d5e166e2dbe615e40590486e066343 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 10:57:50 +0200 Subject: [PATCH 13/41] Potential fix for the failed code quality check comment workflow (#1299) --- .../failed_code_quality_check_comment.yml | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index c5dfce0e2f..89d978f5e1 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -12,9 +12,29 @@ jobs: if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: - - uses: actions/download-artifact@v4 + - name: Download artifact + uses: actions/github-script@v3.1.0 with: - name: pr-number + script: | + var artifacts = await github.actions.listWorkflowRunArtifacts({ + owner: context.repo.owner, + repo: context.repo.repo, + run_id: ${{github.event.workflow_run.id }}, + }); + var matchArtifact = artifacts.data.artifacts.filter((artifact) => { + return artifact.name == "pr-number" + })[0]; + var download = await github.actions.downloadArtifact({ + owner: context.repo.owner, + repo: context.repo.repo, + artifact_id: matchArtifact.id, + archive_format: 'zip', + }); + var fs = require('fs'); + fs.writeFileSync('${{steps.setup-env.outputs.current_work_dir}}/pr-number.zip', Buffer.from(download.data)); + - run: unzip pr-number.zip + - name: Display structure of downloaded files + run: ls -l - name: Get PR number id: github-context run: | From bd9f6bc4edfdbf6504d87e8fa7c7b9061260d349 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 09:04:56 +0000 Subject: [PATCH 14/41] Potential fix 2 for failed code quality check comment workflow --- .../failed_code_quality_check_comment.yml | 25 +++---------------- 1 file changed, 3 insertions(+), 22 deletions(-) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index 89d978f5e1..11db76982c 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -12,29 +12,10 @@ jobs: if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: - - name: Download artifact - uses: actions/github-script@v3.1.0 + - uses: actions/checkout@v2 + - uses: actions/download-artifact@v4 with: - script: | - var artifacts = await github.actions.listWorkflowRunArtifacts({ - owner: context.repo.owner, - repo: context.repo.repo, - run_id: ${{github.event.workflow_run.id }}, - }); - var matchArtifact = artifacts.data.artifacts.filter((artifact) => { - return artifact.name == "pr-number" - })[0]; - var download = await github.actions.downloadArtifact({ - owner: context.repo.owner, - repo: context.repo.repo, - artifact_id: matchArtifact.id, - archive_format: 'zip', - }); - var fs = require('fs'); - fs.writeFileSync('${{steps.setup-env.outputs.current_work_dir}}/pr-number.zip', Buffer.from(download.data)); - - run: unzip pr-number.zip - - name: Display structure of downloaded files - run: ls -l + name: pr-number - name: Get PR number id: github-context run: | From 8af8ed21f4df61d9cf03a012f86975581a745dba Mon Sep 17 00:00:00 2001 
From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 09:08:12 +0000 Subject: [PATCH 15/41] Potential fix 3 for failed code quality check workflow --- .github/workflows/failed_code_quality_check_comment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index 11db76982c..f81f16e3fe 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -12,10 +12,10 @@ jobs: if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: - - uses: actions/checkout@v2 - uses: actions/download-artifact@v4 with: name: pr-number + run-id: ${{github.event.workflow_run.id }} - name: Get PR number id: github-context run: | From e9c73f6215489a6319891b5d3eb051078697739f Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 09:22:00 +0000 Subject: [PATCH 16/41] Other potentiel fix --- .github/workflows/failed_code_quality_check_comment.yml | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index f81f16e3fe..0e6b42fb27 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -12,10 +12,11 @@ jobs: if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style steps: - - uses: actions/download-artifact@v4 + - name: Download artifact + uses: dawidd6/action-download-artifact@v6 with: name: pr-number - run-id: ${{github.event.workflow_run.id }} + run_id: ${{github.event.workflow_run.id }} - name: Get PR number id: github-context run: | From 04effa5dacaf51b05a8a322350994d9ea626c1a7 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 30 Aug 2024 09:39:50 +0000 Subject: [PATCH 17/41] New potential fix --- .github/workflows/failed_code_quality_check_comment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/failed_code_quality_check_comment.yml b/.github/workflows/failed_code_quality_check_comment.yml index 0e6b42fb27..9d376fc62f 100644 --- a/.github/workflows/failed_code_quality_check_comment.yml +++ b/.github/workflows/failed_code_quality_check_comment.yml @@ -11,6 +11,8 @@ jobs: runs-on: ubuntu-latest if: github.event.workflow_run.event == 'pull_request' && github.event.workflow_run.conclusion == 'failure' name: Post comment to run make style + permissions: + pull-requests: write steps: - name: Download artifact uses: dawidd6/action-download-artifact@v6 From 35e01456b6833d68e070ade4f46d8c4f88091e53 Mon Sep 17 00:00:00 2001 From: Daniel Huang Date: Fri, 30 Aug 2024 04:59:04 -0700 Subject: [PATCH 18/41] Enabling Text to Video Diffusion Model Generation (#1109) --- README.md | 1 + docs/source/index.mdx | 1 + examples/text-to-video/README.md | 41 ++ examples/text-to-video/requirements.txt | 1 + .../text-to-video/text_to_video_generation.py | 216 ++++++++ optimum/habana/diffusers/__init__.py | 1 + .../pipeline_text_to_video_synth.py | 465 ++++++++++++++++++ tests/test_diffusers.py | 127 +++++ 8 files changed, 853 insertions(+) create mode 100644 examples/text-to-video/README.md create mode 100644 
examples/text-to-video/requirements.txt create mode 100755 examples/text-to-video/text_to_video_generation.py create mode 100644 optimum/habana/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py diff --git a/README.md b/README.md index b78a19080e..2decc5ec47 100644 --- a/README.md +++ b/README.md @@ -230,6 +230,7 @@ The following model architectures, tasks and device distributions have been vali | Stable Diffusion |
<li>[textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)</li><li>[ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| Stable Diffusion XL | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| LDM3D | | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
+| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li> |
diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 9b6de456c5..17f099f8ee 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -84,6 +84,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be | Stable Diffusion | <li>[textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)</li><li>[ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| Stable Diffusion XL | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| LDM3D | | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
+| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li>
  • | - PyTorch Image Models/TIMM: diff --git a/examples/text-to-video/README.md b/examples/text-to-video/README.md new file mode 100644 index 0000000000..1df4e44e59 --- /dev/null +++ b/examples/text-to-video/README.md @@ -0,0 +1,41 @@ + + +# Text to Video Examples + +This directory contains a script that showcases how to use the `GaudiTextToVideoSDPipeline` to run text-to-video generation tasks on HPUs. + +## Requirements + +First, you should install the requirements: + +```bash +pip install -r requirements.txt +``` + +## Single-HPU inference + +```bash +python3 text_to_video_generation.py \ + --model_name_or_path ali-vilab/text-to-video-ms-1.7b \ + --prompts "An astronaut riding a horse" \ + --use_habana \ + --use_hpu_graphs \ + --dtype bf16 +``` + +Models that have been validated: + - [ali-vilab/text-to-video-ms-1.7b](https://huggingface.co/ali-vilab/text-to-video-ms-1.7b) diff --git a/examples/text-to-video/requirements.txt b/examples/text-to-video/requirements.txt new file mode 100644 index 0000000000..6ab6d0d570 --- /dev/null +++ b/examples/text-to-video/requirements.txt @@ -0,0 +1 @@ +opencv-python-headless diff --git a/examples/text-to-video/text_to_video_generation.py b/examples/text-to-video/text_to_video_generation.py new file mode 100755 index 0000000000..4a91359617 --- /dev/null +++ b/examples/text-to-video/text_to_video_generation.py @@ -0,0 +1,216 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +# Adapted from ../stable-diffusion/text_to_image_generation.py + +import argparse +import logging +import sys +from pathlib import Path + +import torch +from diffusers.utils.export_utils import export_to_video + +from optimum.habana.diffusers import GaudiTextToVideoSDPipeline +from optimum.habana.transformers.gaudi_configuration import GaudiConfig +from optimum.habana.utils import set_seed + + +try: + from optimum.habana.utils import check_optimum_habana_min_version +except ImportError: + + def check_optimum_habana_min_version(*a, **b): + return () + + +# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. +check_optimum_habana_min_version("1.14.0.dev0") + + +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + "--model_name_or_path", + default="ali-vilab/text-to-video-ms-1.7b", + type=str, + help="Path to pre-trained model", + ) + # Pipeline arguments + parser.add_argument( + "--prompts", + type=str, + nargs="*", + default="Spiderman is surfing", + help="The prompt or prompts to guide the video generation.", + ) + parser.add_argument( + "--num_videos_per_prompt", type=int, default=1, help="The number of videos to generate per prompt." 
+ ) + parser.add_argument("--batch_size", type=int, default=1, help="The number of videos in a batch.") + parser.add_argument( + "--height", + type=int, + default=0, + help="The height in pixels of the generated videos (0=default from model config).", + ) + parser.add_argument( + "--width", + type=int, + default=0, + help="The width in pixels of the generated videos (0=default from model config).", + ) + parser.add_argument("--num_frames", type=int, default=20, help="The number of frames in the generated videos.") + parser.add_argument( + "--num_inference_steps", + type=int, + default=50, + help=( + "The number of denoising steps. More denoising steps usually lead to a higher quality videos at the expense" + " of slower inference." + ), + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=7.5, + help=( + "Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598)." + " Higher guidance scale encourages to generate videos that are closely linked to the text `prompt`," + " usually at the expense of lower video quality." + ), + ) + parser.add_argument( + "--negative_prompts", + type=str, + nargs="*", + default=None, + help="The prompt or prompts not to guide the video generation.", + ) + parser.add_argument( + "--eta", + type=float, + default=0.0, + help="Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502.", + ) + parser.add_argument( + "--output_type", + type=str, + choices=["mp4", "np"], + default="mp4", + help="Whether to return mp4 or Numpy arrays.", + ) + + parser.add_argument( + "--pipeline_save_dir", + type=str, + default=None, + help="The directory where the generation pipeline will be saved.", + ) + parser.add_argument( + "--video_save_dir", + type=str, + default="./generated-videos", + help="The directory where videos will be saved.", + ) + + parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization.") + + # HPU-specific arguments + parser.add_argument("--use_habana", action="store_true", help="Use HPU.") + parser.add_argument( + "--use_hpu_graphs", action="store_true", help="Use HPU graphs on HPU. This should lead to faster generations." 
+ ) + parser.add_argument( + "--dtype", + default="bf16", + choices=["bf16", "fp32", "autocast_bf16"], + help="Which runtime dtype to perform generation in.", + ) + args = parser.parse_args() + # Setup logging + logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + handlers=[logging.StreamHandler(sys.stdout)], + ) + logger.setLevel(logging.INFO) + logger.info(f"Arguments: {args}") + + # Set video resolution + kwargs_call = {} + if args.width > 0 and args.height > 0: + kwargs_call["width"] = args.width + kwargs_call["height"] = args.height + kwargs_call["num_frames"] = args.num_frames + + gaudi_config_kwargs = {"use_fused_adam": True, "use_fused_clip_norm": True} + if args.dtype == "autocast_bf16": + gaudi_config_kwargs["use_torch_autocast"] = True + + gaudi_config = GaudiConfig(**gaudi_config_kwargs) + logger.info(f"Gaudi Config: {gaudi_config}") + + kwargs = { + "use_habana": args.use_habana, + "use_hpu_graphs": args.use_hpu_graphs, + "gaudi_config": gaudi_config, + } + if args.dtype == "bf16": + kwargs["torch_dtype"] = torch.bfloat16 + elif args.dtype == "fp32": + kwargs["torch_dtype"] = torch.float32 + + # Generate images + pipeline: GaudiTextToVideoSDPipeline = GaudiTextToVideoSDPipeline.from_pretrained( + args.model_name_or_path, **kwargs + ) + set_seed(args.seed) + outputs = pipeline( + prompt=args.prompts, + num_videos_per_prompt=args.num_videos_per_prompt, + batch_size=args.batch_size, + num_inference_steps=args.num_inference_steps, + guidance_scale=args.guidance_scale, + negative_prompt=args.negative_prompts, + eta=args.eta, + output_type="pil" if args.output_type == "mp4" else args.output_type, # Naming inconsistency in base class + **kwargs_call, + ) + + # Save the pipeline in the specified directory if not None + if args.pipeline_save_dir is not None: + pipeline.save_pretrained(args.pipeline_save_dir) + + # Save images in the specified directory if not None and if they are in PIL format + if args.video_save_dir is not None: + if args.output_type == "mp4": + video_save_dir = Path(args.video_save_dir) + video_save_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Saving images in {video_save_dir.resolve()}...") + + for i, video in enumerate(outputs.videos): + filename = video_save_dir / f"video_{i + 1}.mp4" + export_to_video(video, str(filename.resolve())) + else: + logger.warning("--output_type should be equal to 'mp4' to save images in --video_save_dir.") + + +if __name__ == "__main__": + main() diff --git a/optimum/habana/diffusers/__init__.py b/optimum/habana/diffusers/__init__.py index 860a97e382..49ea1851bf 100644 --- a/optimum/habana/diffusers/__init__.py +++ b/optimum/habana/diffusers/__init__.py @@ -17,4 +17,5 @@ from .pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_img2img import GaudiStableDiffusionXLImg2ImgPipeline from .pipelines.stable_diffusion_xl.pipeline_stable_diffusion_xl_inpaint import GaudiStableDiffusionXLInpaintPipeline from .pipelines.stable_video_diffusion.pipeline_stable_video_diffusion import GaudiStableVideoDiffusionPipeline +from .pipelines.text_to_video_synthesis.pipeline_text_to_video_synth import GaudiTextToVideoSDPipeline from .schedulers import GaudiDDIMScheduler, GaudiEulerAncestralDiscreteScheduler, GaudiEulerDiscreteScheduler diff --git a/optimum/habana/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py b/optimum/habana/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py new file mode 100644 index 
0000000000..ffaf25df11 --- /dev/null +++ b/optimum/habana/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py @@ -0,0 +1,465 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from dataclasses import dataclass +from math import ceil +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from diffusers.models import AutoencoderKL, UNet3DConditionModel +from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_synth import TextToVideoSDPipeline +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import logging +from diffusers.utils.outputs import BaseOutput +from transformers import CLIPTextModel, CLIPTokenizer + +from ....transformers.gaudi_configuration import GaudiConfig +from ..pipeline_utils import GaudiDiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class GaudiTextToVideoSDPipelineOutput(BaseOutput): + videos: Union[List[PIL.Image.Image], np.ndarray] + + +class GaudiTextToVideoSDPipeline(GaudiDiffusionPipeline, TextToVideoSDPipeline): + r""" + Adapted from: https://github.com/huggingface/diffusers/blob/v0.26.3/src/diffusers/pipelines/text_to_video_synthesis/pipeline_text_to_video_synth.py#L84 + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet3DConditionModel, + scheduler: KarrasDiffusionSchedulers, + use_habana: bool = False, + use_hpu_graphs: bool = False, + gaudi_config: Union[str, GaudiConfig] = None, + bf16_full_eval: bool = False, + ): + GaudiDiffusionPipeline.__init__( + self, + use_habana, + use_hpu_graphs, + gaudi_config, + bf16_full_eval, + ) + TextToVideoSDPipeline.__init__( + self, + vae, + text_encoder, + tokenizer, + unet, + scheduler, + ) + self.to(self._device) + + def enable_model_cpu_offload(self, *args, **kwargs): + if self.use_habana: + raise NotImplementedError("enable_model_cpu_offload() is not implemented for HPU") + else: + return super().enable_model_cpu_offload(*args, **kwargs) + + def prepare_latents( + self, batch_size, num_channels_latents, num_frames, height, width, dtype, device, generator, latents=None + ): + shape = ( + batch_size, + num_channels_latents, + num_frames, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + if latents is None: + # torch.randn is broken on HPU so running it on CPU + rand_device = "cpu" if device.type == "hpu" else device + if isinstance(generator, list): + shape = (1,) + shape[1:] + latents = [ + torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype) + for i in range(batch_size) + ] + latents = torch.cat(latents, dim=0).to(device) + else: + latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device) + else: + latents = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = latents * self.scheduler.init_noise_sigma + return latents + + # Copied from optimum.habana.diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.GaudiStableDiffusionPipeline._split_inputs_into_batches + @classmethod + def _split_inputs_into_batches(cls, batch_size, latents, prompt_embeds, negative_prompt_embeds): + # Use torch.split to generate num_batches batches of size batch_size + latents_batches = list(torch.split(latents, batch_size)) + prompt_embeds_batches = list(torch.split(prompt_embeds, batch_size)) + if negative_prompt_embeds is not None: + negative_prompt_embeds_batches = list(torch.split(negative_prompt_embeds, batch_size)) + + # If the last batch has less samples than batch_size, pad it with dummy samples + num_dummy_samples = 0 + if latents_batches[-1].shape[0] < batch_size: + num_dummy_samples = batch_size - latents_batches[-1].shape[0] + # Pad latents_batches + sequence_to_stack = (latents_batches[-1],) + tuple( + torch.zeros_like(latents_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + latents_batches[-1] = torch.vstack(sequence_to_stack) + # Pad prompt_embeds_batches + sequence_to_stack = (prompt_embeds_batches[-1],) + tuple( + torch.zeros_like(prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) + # Pad negative_prompt_embeds_batches if necessary + if negative_prompt_embeds is not None: + sequence_to_stack = (negative_prompt_embeds_batches[-1],) + tuple( + torch.zeros_like(negative_prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + negative_prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) + + # Stack batches in the same tensor + latents_batches = torch.stack(latents_batches) + if negative_prompt_embeds is not None: + # For classifier free guidance, we need to do two forward passes. 
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + for i, (negative_prompt_embeds_batch, prompt_embeds_batch) in enumerate( + zip(negative_prompt_embeds_batches, prompt_embeds_batches[:]) + ): + prompt_embeds_batches[i] = torch.cat([negative_prompt_embeds_batch, prompt_embeds_batch]) + prompt_embeds_batches = torch.stack(prompt_embeds_batches) + + return latents_batches, prompt_embeds_batches, num_dummy_samples + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_frames: int = 16, + batch_size: int = 1, + num_inference_steps: int = 50, + guidance_scale: float = 9.0, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_videos_per_prompt: int = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "np", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide video generation. If not defined, you need to pass `prompt_embeds`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated video. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated video. + num_frames (`int`, *optional*, defaults to 16): + The number of video frames that are generated. Defaults to 16 frames which at 8 frames per seconds + amounts to 2 seconds of video. + batch_size (`int`, *optional*, defaults to 1): + The number of videos in a batch. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality videos at the + expense of slower inference. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate videos closely linked to the text + `prompt` at the expense of lower video quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in video generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_videos_per_prompt (`int`, defaults to 1): + The number of videos to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for video + generation. 
Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. Latents should be of shape + `(batch_size, num_channel, num_frames, height, width)`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"np"`): + The output format of the generated video. Choose between `torch.FloatTensor` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] instead + of a plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + Examples: + + Returns: + [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.text_to_video_synthesis.TextToVideoSDPipelineOutput`] is + returned, otherwise a `tuple` is returned where the first element is a list with the generated frames. + """ + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + height, + width, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + num_prompts = 1 + elif prompt is not None and isinstance(prompt, list): + num_prompts = len(prompt) + else: + num_prompts = prompt_embeds.shape[0] + num_videos = num_videos_per_prompt * num_prompts + num_batches = ceil((num_videos) / batch_size) + logger.info( + f"{num_prompts} prompt(s) received, {num_videos_per_prompt} generation(s) per prompt, " + f"{batch_size} sample(s) per batch, {num_batches} total batch(es)." 
+ ) + if num_batches < 3: + logger.warning("The first two iterations are slower so it is recommended to feed more batches.") + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) if cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_videos_per_prompt, + do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=clip_skip, + ) + + # 4. Prepare timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps = self.scheduler.timesteps + + # 5. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents = self.prepare_latents( + num_prompts * num_videos_per_prompt, + num_channels_latents, + num_frames, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7. Split into batches (HPU-specific step) + latents_batches, text_embeddings_batches, num_dummy_samples = self._split_inputs_into_batches( + batch_size, + latents, + prompt_embeds, + negative_prompt_embeds, + ) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + outputs = [] + for j in self.progress_bar(range(num_batches)): + latents_batch = latents_batches[0] + latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) + text_embeddings_batch = text_embeddings_batches[0] + text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0) + for i in self.progress_bar(range(len(timesteps))): + t = timesteps[0] + timesteps = torch.roll(timesteps, shifts=-1, dims=0) + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents_batch] * 2) if do_classifier_free_guidance else latents_batch + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet_hpu( + latent_model_input, + t, + text_embeddings_batch, + cross_attention_kwargs=cross_attention_kwargs, + ) + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond) + + # reshape latents + bsz, channel, frames, width, height = latents_batch.shape + latents_batch = latents_batch.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + noise_pred = noise_pred.permute(0, 2, 1, 3, 4).reshape(bsz * frames, channel, width, height) + + # compute the previous noisy sample x_t -> x_t-1 + latents_batch = self.scheduler.step(noise_pred, t, latents_batch, **extra_step_kwargs).prev_sample + + # reshape latents_batch back + latents_batch = ( + latents_batch[None, :].reshape(bsz, frames, channel, width, height).permute(0, 2, 1, 3, 4) + ) + + if not self.use_hpu_graphs: + self.htcore.mark_step() + + # call the callback, if provided + if i == 
len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents_batch) + if output_type == "latent": + video_tensor = latents_batch + else: + video_tensor = self.decode_latents(latents_batch) + outputs.append(video_tensor) + + if not self.use_hpu_graphs: + self.htcore.mark_step() + + # Remove dummy generations if needed + if num_dummy_samples > 0: + outputs[-1] = outputs[-1][:-num_dummy_samples] + + # 9. Post processing + videos = [] + for video_tensor in outputs: + if output_type == "latent": + videos.extend(list(video_tensor)) + continue + video_batch = self.video_processor.postprocess_video(video=video_tensor, output_type=output_type) + + if output_type == "pil" and isinstance(video_batch, list): + videos += video_batch + elif output_type in ["np", "numpy"] and isinstance(video_batch, np.ndarray): + if len(videos) == 0: + videos = video_batch + else: + videos = np.concatenate((videos, video_batch), axis=0) + else: # Torch Tensor + if len(videos) == 0: + videos = video_batch + else: + videos = torch.cat((videos, video_batch), 0) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (videos,) + + return GaudiTextToVideoSDPipelineOutput(videos=videos) + + @torch.no_grad() + def unet_hpu(self, latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs): + if self.use_hpu_graphs: + return self.capture_replay(latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs) + else: + return self.unet( + latent_model_input, + timestep, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + @torch.no_grad() + def capture_replay(self, latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs): + inputs = [latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs, False] + h = self.ht.hpu.graphs.input_hash(inputs) + cached = self.cache.get(h) + + if cached is None: + # Capture the graph and cache it + with self.ht.hpu.stream(self.hpu_stream): + graph = self.ht.hpu.HPUGraph() + graph.capture_begin() + outputs = self.unet( + inputs[0], inputs[1], inputs[2], cross_attention_kwargs=inputs[3], return_dict=inputs[4] + )[0] + graph.capture_end() + graph_inputs = inputs + graph_outputs = outputs + self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) + return outputs + + # Replay the cached graph with updated inputs + self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) + cached.graph.replay() + self.ht.core.hpu.default_stream().synchronize() + + return cached.graph_outputs diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index fd4d0fca08..98edfc2243 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -24,6 +24,7 @@ import re import subprocess import tempfile +import time from io import BytesIO, StringIO from pathlib import Path from typing import Callable, Union @@ -51,6 +52,7 @@ StableVideoDiffusionPipeline, UNet2DConditionModel, UNet2DModel, + UNet3DConditionModel, UNetSpatioTemporalConditionModel, UniPCMultistepScheduler, ) @@ -101,6 +103,7 @@ GaudiStableDiffusionXLInpaintPipeline, GaudiStableDiffusionXLPipeline, GaudiStableVideoDiffusionPipeline, + GaudiTextToVideoSDPipeline, ) from optimum.habana.utils import set_seed @@ -119,6 +122,7 @@ CONTROLNET_RUNTIME = 537.4276602957398 
INPAINT_THROUGHPUT_BASELINE_BF16 = 4.584 INPAINT_XL_THROUGHPUT_BASELINE_BF16 = 1.151 + TEXT_TO_VIDEO_SYNTHESIS_BF16_BASELINE = 70 DETERMINISTIC_IMAGE_GENERATION_THROUGHPUT = 0.946 THROUGHPUT_UNCONDITIONAL_IMAGE_BASELINE_BF16 = 7.671212047338486 else: @@ -132,6 +136,7 @@ INPAINT_XL_THROUGHPUT_BASELINE_BF16 = 0.271 DETERMINISTIC_IMAGE_GENERATION_THROUGHPUT = 0.302 THROUGHPUT_UNCONDITIONAL_IMAGE_BASELINE_BF16 = 3.095533166996529 + TEXT_TO_VIDEO_SYNTHESIS_BF16_BASELINE = 1000 # TODO: Get Gaudi 1 benchmark numbers _run_custom_bf16_ops_test_ = parse_flag_from_env("CUSTOM_BF16_OPS", default=False) @@ -3009,6 +3014,128 @@ def test_deterministic_image_generation_no_throughput_regression_bf16(self): self.assertGreaterEqual(outputs.throughput, 0.95 * DETERMINISTIC_IMAGE_GENERATION_THROUGHPUT) +class GaudiTextToVideoSDPipelineTester(TestCase): + """ + Tests the TextToVideoSDPipeline for Gaudi. + Adapted from https://github.com/huggingface/diffusers/blob/v0.24.0-release/tests/pipelines/text_to_video_synthesis/test_text_to_video.py + """ + + def get_dummy_components(self): + set_seed(0) + unet = UNet3DConditionModel( + block_out_channels=(4, 8), + layers_per_block=1, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("CrossAttnDownBlock3D", "DownBlock3D"), + up_block_types=("UpBlock3D", "CrossAttnUpBlock3D"), + cross_attention_dim=4, + attention_head_dim=4, + norm_num_groups=2, + ) + scheduler = GaudiEulerDiscreteScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + interpolation_type="linear", + num_train_timesteps=1000, + prediction_type="v_prediction", + sigma_max=700.0, + sigma_min=0.002, + steps_offset=1, + timestep_spacing="leading", + timestep_type="continuous", + trained_betas=None, + use_karras_sigmas=True, + ) + set_seed(0) + vae = AutoencoderKL( + block_out_channels=(8,), + in_channels=3, + out_channels=3, + down_block_types=("DownEncoderBlock2D",), + up_block_types=("UpDecoderBlock2D",), + latent_channels=4, + sample_size=32, + norm_num_groups=2, + ) + set_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=4, + intermediate_size=16, + layer_norm_eps=1e-05, + num_attention_heads=2, + num_hidden_layers=2, + pad_token_id=1, + vocab_size=1000, + hidden_act="gelu", + projection_dim=32, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + } + return components + + def get_dummy_inputs(self, device, seed=0): + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "numpy", + } + return inputs + + def test_text_to_video_default_case(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + gaudi_config = GaudiConfig(use_torch_autocast=False) + sd_pipe = GaudiTextToVideoSDPipeline(use_habana=True, gaudi_config=gaudi_config, **components) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["output_type"] = "np" + video = sd_pipe(**inputs).videos[0] + image_slice = video[0][-3:, -3:, -1] + + assert video[0].shape == (32, 32, 3) + expected_slice = np.array( + [0.32823694, 0.5277065, 0.5257378, 
0.51532686, 0.62792695, 0.5966803, 0.55225205, 0.6153607, 0.60387087] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + @slow + def test_stable_video_diffusion_no_latency_regression_bf16(self): + model_name = "ali-vilab/text-to-video-ms-1.7b" + pipeline = GaudiTextToVideoSDPipeline.from_pretrained( + model_name, + use_habana=True, + use_hpu_graphs=True, + gaudi_config=GaudiConfig.from_pretrained("Habana/stable-diffusion"), + torch_dtype=torch.bfloat16, + ) + set_seed(42) + start_time = time.time() + prompt = "Spiderman is surfing" + outputs = pipeline(prompt, num_inference_steps=50, output_type="pil") + latency = time.time() - start_time + assert len(outputs.videos[0]) == 16 + + assert latency < 1.05 * TEXT_TO_VIDEO_SYNTHESIS_BF16_BASELINE + + """ Copied from: https://github.com/huggingface/diffusers/blob/v0.26.3/tests/pipelines/test_pipelines_common.py - Remove PipelinePushToHubTester testcase. From a1a92c9082222f08d9828f3601f050f62311216b Mon Sep 17 00:00:00 2001 From: Pramod Kumar <144990617+pramodkumar-habanalabs@users.noreply.github.com> Date: Fri, 30 Aug 2024 17:37:08 +0530 Subject: [PATCH 19/41] Prevent Graph break in Llama when using flash attention (#1301) --- optimum/habana/transformers/models/llama/modeling_llama.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py index 7d41126390..1abbfab12d 100755 --- a/optimum/habana/transformers/models/llama/modeling_llama.py +++ b/optimum/habana/transformers/models/llama/modeling_llama.py @@ -617,7 +617,7 @@ def pre_attn_forward( else: past_key_value = None - if use_flash_attention and FusedSDPA: + if use_flash_attention and FusedSDPA is not None: import habana_frameworks.torch.hpu as ht softmax_mode = "fast" if flash_attention_fast_softmax else "None" From 3eae3a2e61642a854adccb33faa0e4e50fc10f80 Mon Sep 17 00:00:00 2001 From: yuanwu2017 Date: Wed, 4 Sep 2024 23:59:34 +0800 Subject: [PATCH 20/41] Add the deepspeed injection_policy of mistral (#1309) Signed-off-by: yuanwu --- optimum/habana/checkpoint_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/optimum/habana/checkpoint_utils.py b/optimum/habana/checkpoint_utils.py index aa88252868..6a6001d5e0 100644 --- a/optimum/habana/checkpoint_utils.py +++ b/optimum/habana/checkpoint_utils.py @@ -149,4 +149,9 @@ def get_ds_injection_policy(config): policy = {LlamaDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")} + if model_type == "mistral": + from transformers.models.mistral.modeling_mistral import MistralDecoderLayer + + policy = {MistralDecoderLayer: ("self_attn.o_proj", "mlp.down_proj")} + return policy From 9fd4baccd0c8384fa75af53df15bd72ca61157cc Mon Sep 17 00:00:00 2001 From: "Wang, Chang" Date: Thu, 5 Sep 2024 00:01:45 +0800 Subject: [PATCH 21/41] Fix text-generation example lm_eval evaluation (#1308) Signed-off-by: changwangss --- examples/text-generation/run_lm_eval.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py index 54b9625880..f375e2de60 100644 --- a/examples/text-generation/run_lm_eval.py +++ b/examples/text-generation/run_lm_eval.py @@ -199,7 +199,8 @@ def main(): lm = HabanaModelAdapter(tokenizer, model, args, generation_config) eval_start = time.perf_counter() - results = lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit_iters) + with torch.no_grad(): + results = 
lm_eval.evaluator.evaluate(lm, lm_tasks, limit=args.limit_iters) if args.device == "hpu": import habana_frameworks.torch.hpu as torch_hpu From 085b8e9c3832650a35c7a05b6104e286f0e01576 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Wed, 4 Sep 2024 23:34:34 +0200 Subject: [PATCH 22/41] Add section to README about Transformers development branch (#1307) --- README.md | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/README.md b/README.md index 2decc5ec47..89dccab62b 100644 --- a/README.md +++ b/README.md @@ -72,6 +72,18 @@ pip install git+https://github.com/huggingface/optimum-habana.git git clone https://github.com/huggingface/optimum-habana ``` +### Option 3: Use the `transformers_future` branch to have the latest changes from Transformers + +The `transformers_future` branch is regularly updated with the latest changes from the main branches of Optimum Habana and Transformers. This enables you to try out new Transformers features that have not been merged into the main branch yet. + +> [!WARNING] +> The `transformers_future` branch may have some regressions or bugs and may be less stable than the main branch. + +```bash +pip install git+https://github.com/huggingface/optimum-habana.git@transformers_future +git clone -b transformers_future https://github.com/huggingface/optimum-habana +``` + ## Install dependencies To use DeepSpeed on HPUs, you also need to run the following command: From 19cb6f7e6549fade1a893458b483141cbbdfeef1 Mon Sep 17 00:00:00 2001 From: Daniel Huang Date: Thu, 5 Sep 2024 02:42:54 -0700 Subject: [PATCH 23/41] Depth to Image Generation (#1175) Co-authored-by: Raymond Lau --- README.md | 1 + docs/source/index.mdx | 1 + examples/stable-diffusion/README.md | 15 + .../depth_to_image_generation.py | 286 +++++++++ optimum/habana/diffusers/__init__.py | 1 + .../pipeline_stable_diffusion_depth2img.py | 558 ++++++++++++++++++ .../diffusers/schedulers/scheduling_ddim.py | 5 +- tests/test_diffusers.py | 248 ++++++++ 8 files changed, 1114 insertions(+), 1 deletion(-) create mode 100755 examples/stable-diffusion/depth_to_image_generation.py create mode 100644 optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py diff --git a/README.md b/README.md index 89dccab62b..c52bd21225 100644 --- a/README.md +++ b/README.md @@ -241,6 +241,7 @@ The following model architectures, tasks and device distributions have been vali |------------------|:--------:|:--------------------:|:------| | Stable Diffusion |
<li>[textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)</li><li>[ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| Stable Diffusion XL | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
+| Stable Diffusion Depth2img | | <li>Single card</li> | <li>[depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| LDM3D | | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li> |
diff --git a/docs/source/index.mdx b/docs/source/index.mdx index 17f099f8ee..3131e0cfdc 100644 --- a/docs/source/index.mdx +++ b/docs/source/index.mdx @@ -83,6 +83,7 @@ In the tables below, ✅ means single-card, multi-card and DeepSpeed have all be |---------------------|:--------:|:---------:|:------| | Stable Diffusion | <li>[textual inversion](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#textual-inversion)</li><li>[ControlNet](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#controlnet-training)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| Stable Diffusion XL | <li>[fine-tuning](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion/training#fine-tuning-for-stable-diffusion-xl)</li> | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
+| Stable Diffusion Depth2img | | <li>Single card</li> | <li>[depth-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| LDM3D | | <li>Single card</li> | <li>[text-to-image generation](https://github.com/huggingface/optimum-habana/tree/main/examples/stable-diffusion)</li> |
| Text to Video | | <li>Single card</li> | <li>[text-to-video generation](https://github.com/huggingface/optimum-habana/tree/main/examples/text-to-video)</li>
  • | diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md index c4a87a1af9..392d79338a 100644 --- a/examples/stable-diffusion/README.md +++ b/examples/stable-diffusion/README.md @@ -520,6 +520,21 @@ python image_to_image_generation.py \ --bf16 ``` +### Depth to Image Generation + +Here is how to generate a depth2img-guided image generation using HPU graphs with BF16: + +```bash +python depth_to_image_generation.py \ + --model_name_or_path "stabilityai/stable-diffusion-2-depth" \ + --prompts "two tigers" \ + --base_image "http://images.cocodataset.org/val2017/000000039769.jpg" \ + --image_save_dir /tmp/stable_diffusion_images \ + --use_habana \ + --use_hpu_graphs \ + --bf16 +``` + ## Unconditional Image Generation Example Here is how to perform unconditional-image-generation on Gaudi/HPU. diff --git a/examples/stable-diffusion/depth_to_image_generation.py b/examples/stable-diffusion/depth_to_image_generation.py new file mode 100755 index 0000000000..570a39b2c3 --- /dev/null +++ b/examples/stable-diffusion/depth_to_image_generation.py @@ -0,0 +1,286 @@ +#!/usr/bin/env python +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and + +import argparse +import logging +import os +from pathlib import Path + +import requests +import torch +from diffusers.schedulers.scheduling_pndm import PNDMScheduler +from PIL import Image + +from optimum.habana.diffusers import ( + GaudiDDIMScheduler, + GaudiEulerAncestralDiscreteScheduler, + GaudiEulerDiscreteScheduler, + GaudiStableDiffusionDepth2ImgPipeline, +) +from optimum.habana.utils import set_seed + + +try: + from optimum.habana.utils import check_optimum_habana_min_version +except ImportError: + + def check_optimum_habana_min_version(*a, **b): + return () + + +# Will error if the minimal version of Optimum Habana is not installed. Remove at your own risks. +check_optimum_habana_min_version("1.14.0.dev0") + + +logger = logging.getLogger(__name__) + + +def main(): + parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parser.add_argument( + "--model_name_or_path", + default="stabilityai/stable-diffusion-2-depth", + type=str, + help="Path to pre-trained model", + ) + + parser.add_argument( + "--scheduler", + default="ddim", + choices=["euler_discrete", "euler_ancestral_discrete", "ddim", "pndm"], + type=str, + help="Name of scheduler", + ) + + parser.add_argument( + "--timestep_spacing", + default="linspace", + choices=["linspace", "leading", "trailing"], + type=str, + help="The way the timesteps should be scaled.", + ) + # Pipeline arguments + parser.add_argument( + "--prompts", + type=str, + nargs="*", + default="two tigers", + help="The prompt or prompts to guide the image generation.", + ) + parser.add_argument( + "--base_image", + type=str, + required=True, + help=("Path or URL to inpaint base image"), + ) + parser.add_argument( + "--num_images_per_prompt", type=int, default=1, help="The number of images to generate per prompt." 
+ ) + parser.add_argument("--batch_size", type=int, default=1, help="The number of images in a batch.") + parser.add_argument( + "--height", + type=int, + default=0, + help="The height in pixels of the generated images (0=default from model config).", + ) + parser.add_argument( + "--width", + type=int, + default=0, + help="The width in pixels of the generated images (0=default from model config).", + ) + parser.add_argument( + "--num_inference_steps", + type=int, + default=50, + help=( + "The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense" + " of slower inference." + ), + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=7.5, + help=( + "Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598)." + " Higher guidance scale encourages to generate images that are closely linked to the text `prompt`," + " usually at the expense of lower image quality." + ), + ) + parser.add_argument( + "--negative_prompts", + type=str, + nargs="*", + default=None, + help="The prompt or prompts not to guide the image generation.", + ) + parser.add_argument( + "--eta", + type=float, + default=0.0, + help="Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502.", + ) + parser.add_argument( + "--output_type", + type=str, + choices=["pil", "np"], + default="pil", + help="Whether to return PIL images or Numpy arrays.", + ) + + parser.add_argument( + "--pipeline_save_dir", + type=str, + default=None, + help="The directory where the generation pipeline will be saved.", + ) + parser.add_argument( + "--image_save_dir", + type=str, + default="./stable-diffusion-generated-images", + help="The directory where images will be saved.", + ) + + parser.add_argument("--seed", type=int, default=42, help="Random seed for initialization.") + + # HPU-specific arguments + parser.add_argument("--use_habana", action="store_true", help="Use HPU.") + parser.add_argument( + "--use_hpu_graphs", action="store_true", help="Use HPU graphs on HPU. This should lead to faster generations." + ) + parser.add_argument( + "--gaudi_config_name", + type=str, + default="Habana/stable-diffusion", + help=( + "Name or path of the Gaudi configuration. In particular, it enables to specify how to apply Habana Mixed" + " Precision." 
+ ), + ) + parser.add_argument("--bf16", action="store_true", help="Whether to perform generation in bf16 precision.") + parser.add_argument( + "--throughput_warmup_steps", + type=int, + default=None, + help="Number of steps to ignore for throughput calculation.", + ) + parser.add_argument( + "--profiling_warmup_steps", + type=int, + default=0, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + type=int, + default=0, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--use_cpu_rng", + action="store_true", + help="Enable deterministic generation using CPU Generator", + ) + args = parser.parse_args() + + # Set image resolution + kwargs_call = {} + if args.width > 0 and args.height > 0: + kwargs_call["width"] = args.width + kwargs_call["height"] = args.height + + # Initialize the scheduler and the generation pipeline + kwargs = {"timestep_spacing": args.timestep_spacing} + if args.scheduler == "euler_discrete": + scheduler = GaudiEulerDiscreteScheduler.from_pretrained( + args.model_name_or_path, subfolder="scheduler", **kwargs + ) + elif args.scheduler == "euler_ancestral_discrete": + scheduler = GaudiEulerAncestralDiscreteScheduler.from_pretrained( + args.model_name_or_path, subfolder="scheduler", **kwargs + ) + elif args.scheduler == "ddim": + scheduler = GaudiDDIMScheduler.from_pretrained(args.model_name_or_path, subfolder="scheduler", **kwargs) + else: + scheduler = PNDMScheduler.from_pretrained(args.model_name_or_path, subfolder="scheduler", **kwargs) + + kwargs = { + "scheduler": scheduler, + "use_habana": args.use_habana, + "use_hpu_graphs": args.use_hpu_graphs, + "gaudi_config": args.gaudi_config_name, + } + + if args.bf16: + kwargs["torch_dtype"] = torch.bfloat16 + + kwargs_common = { + "num_images_per_prompt": args.num_images_per_prompt, + "batch_size": args.batch_size, + "num_inference_steps": args.num_inference_steps, + "guidance_scale": args.guidance_scale, + "negative_prompt": args.negative_prompts, + "eta": args.eta, + "output_type": args.output_type, + "profiling_warmup_steps": args.profiling_warmup_steps, + "profiling_steps": args.profiling_steps, + } + + kwargs_call.update(kwargs_common) + if os.path.exists(args.base_image): + kwargs_call["image"] = Image.open(args.base_image) + else: + kwargs_call["image"] = Image.open(requests.get(args.base_image, stream=True).raw) + if args.throughput_warmup_steps is not None: + kwargs_call["throughput_warmup_steps"] = args.throughput_warmup_steps + + if args.use_cpu_rng: + # Patch for the deterministic generation - Need to specify CPU as the torch generator + generator = torch.Generator(device="cpu").manual_seed(args.seed) + else: + generator = None + kwargs_call["generator"] = generator + + # Generate images + pipeline: GaudiStableDiffusionDepth2ImgPipeline = GaudiStableDiffusionDepth2ImgPipeline.from_pretrained( # type: ignore + args.model_name_or_path, + **kwargs, + ) + set_seed(args.seed) + + outputs = pipeline(prompt=args.prompts, **kwargs_call) + + # Save the pipeline in the specified directory if not None + if args.pipeline_save_dir is not None: + save_dir = args.pipeline_save_dir + pipeline.save_pretrained(save_dir) + + # Save images in the specified directory if not None and if they are in PIL format + if args.image_save_dir is not None: + if args.output_type == "pil": + image_save_dir = Path(args.image_save_dir) + + image_save_dir.mkdir(parents=True, exist_ok=True) + logger.info(f"Saving images in {image_save_dir.resolve()}...") + for i, image in 
enumerate(outputs.images): + image.save(image_save_dir / f"image_{i + 1}.png") + else: + logger.warning("--output_type should be equal to 'pil' to save images in --image_save_dir.") + + +if __name__ == "__main__": + main() diff --git a/optimum/habana/diffusers/__init__.py b/optimum/habana/diffusers/__init__.py index 49ea1851bf..82f3bae0cb 100644 --- a/optimum/habana/diffusers/__init__.py +++ b/optimum/habana/diffusers/__init__.py @@ -3,6 +3,7 @@ from .pipelines.ddpm.pipeline_ddpm import GaudiDDPMPipeline from .pipelines.pipeline_utils import GaudiDiffusionPipeline from .pipelines.stable_diffusion.pipeline_stable_diffusion import GaudiStableDiffusionPipeline +from .pipelines.stable_diffusion.pipeline_stable_diffusion_depth2img import GaudiStableDiffusionDepth2ImgPipeline from .pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation import ( GaudiStableDiffusionImageVariationPipeline, ) diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py new file mode 100644 index 0000000000..5432388229 --- /dev/null +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py @@ -0,0 +1,558 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from diffusers import ImagePipelineOutput +from diffusers.image_processor import PipelineImageInput +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_depth2img import ( + StableDiffusionDepth2ImgPipeline, + retrieve_latents, +) +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import deprecate +from transformers import CLIPTextModel, CLIPTokenizer, DPTFeatureExtractor, DPTForDepthEstimation + +from optimum.utils import logging + +from ....transformers.gaudi_configuration import GaudiConfig +from ..pipeline_utils import GaudiDiffusionPipeline + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. 
+ timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device="cpu", **kwargs) + timesteps = scheduler.timesteps.to(device) + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device="cpu", **kwargs) + timesteps = scheduler.timesteps.to(device) + + # Handles the case where the scheduler cannot implement reset_timestep_dependent_params() + # Example: UniPCMultiStepScheduler used for inference in ControlNet training as it has non-linear accesses to timestep dependent parameter: sigma. + if hasattr(scheduler, "reset_timestep_dependent_params") and callable(scheduler.reset_timestep_dependent_params): + scheduler.reset_timestep_dependent_params() + return timesteps, num_inference_steps + + +class GaudiStableDiffusionDepth2ImgPipeline(GaudiDiffusionPipeline, StableDiffusionDepth2ImgPipeline): + r""" + Adapted from: https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_depth2img.py#L77 + + Changes: + - Add HPU Graphs + - Depth map is now generated by CPU + - Changed the logic of setting timestep + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + depth_estimator: DPTForDepthEstimation, + feature_extractor: DPTFeatureExtractor, + use_habana: bool = False, + use_hpu_graphs: bool = False, + gaudi_config: Union[str, GaudiConfig] = None, + bf16_full_eval: bool = False, + ): + GaudiDiffusionPipeline.__init__( + self, + use_habana, + use_hpu_graphs, + gaudi_config, + bf16_full_eval, + ) + + StableDiffusionDepth2ImgPipeline.__init__( + self, + vae, + text_encoder, + tokenizer, + unet, + scheduler, + depth_estimator, + feature_extractor, + ) + + self.to(self._device) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.prepare_latents + def prepare_latents(self, image, timestep, batch_size, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = batch_size * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." 
+ ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) # run this + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." + ) + else: + init_latents = torch.cat([init_latents], dim=0) + + shape = init_latents.shape + noise = torch.randn(shape, generator=generator, device="cpu", dtype=dtype) # HPU Patch + noise = noise.to(device) # HPU Patch + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + def prepare_depth_map(self, image, depth_map, batch_size, do_classifier_free_guidance, dtype, device): + if isinstance(image, PIL.Image.Image): + image = [image] + else: + image = list(image) + + if isinstance(image[0], PIL.Image.Image): + width, height = image[0].size + elif isinstance(image[0], np.ndarray): + width, height = image[0].shape[:-1] + else: + height, width = image[0].shape[-2:] + + if depth_map is None: + pixel_values = self.feature_extractor(images=image, return_tensors="pt").pixel_values # ok + pixel_values = pixel_values.to(device=device) + # The DPT-Hybrid model uses batch-norm layers which are not compatible with fp16. + # So we use `torch.autocast` here for half precision inference. 
+ + # --- HPU Patch --- # + with torch.autocast(device.type, dtype=dtype): # HPU Patch + if dtype == torch.bfloat16: # HPU Patch + pixel_values = pixel_values.to(torch.bfloat16) # HPU Patch + + self.depth_estimator = self.depth_estimator.to("cpu") # HPU Patch + pixel_values = pixel_values.to("cpu") # HPU + depth_map = self.depth_estimator(pixel_values).predicted_depth + depth_map = depth_map.to(device) + # --- HPU Patch --- # + + else: + depth_map = depth_map.to(device=device, dtype=dtype) + + depth_map = torch.nn.functional.interpolate( + depth_map.unsqueeze(1), + size=(height // self.vae_scale_factor, width // self.vae_scale_factor), + mode="bicubic", + align_corners=False, + ) + + depth_min = torch.amin(depth_map, dim=[1, 2, 3], keepdim=True) + depth_max = torch.amax(depth_map, dim=[1, 2, 3], keepdim=True) + depth_map = 2.0 * (depth_map - depth_min) / (depth_max - depth_min) - 1.0 + depth_map = depth_map.to(dtype) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if depth_map.shape[0] < batch_size: + repeat_by = batch_size // depth_map.shape[0] + depth_map = depth_map.repeat(repeat_by, 1, 1, 1) + + depth_map = torch.cat([depth_map] * 2) if do_classifier_free_guidance else depth_map + return depth_map + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + depth_map: Optional[torch.FloatTensor] = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image` or tensor representing an image batch to be used as the starting point. Can accept image + latents as `image` only if `depth_map` is not `None`. + depth_map (`torch.FloatTensor`, *optional*): + Depth prediction to be used as additional conditioning for the image generation process. If not + defined, it automatically predicts the depth with `self.depth_estimator`. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeline class. + Examples: + + ```py + >>> import torch + >>> import requests + >>> from PIL import Image + + >>> from diffusers import StableDiffusionDepth2ImgPipeline + + >>> pipe = StableDiffusionDepth2ImgPipeline.from_pretrained( + ... "stabilityai/stable-diffusion-2-depth", + ... torch_dtype=torch.float16, + ... 
) + >>> pipe.to("cuda") + + + >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" + >>> init_image = Image.open(requests.get(url, stream=True).raw) + >>> prompt = "two tigers" + >>> n_prompt = "bad, deformed, ugly, bad anotomy" + >>> image = pipe(prompt=prompt, image=init_image, negative_prompt=n_prompt, strength=0.7).images[0] + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): + # 1. Check inputs + self.check_inputs( + prompt, + strength, + callback_steps, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + # 4. Prepare depth mask + depth_mask = self.prepare_depth_map( + image, + depth_map, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + prompt_embeds.dtype, + device, + ) + + # 5. Preprocess image + image = self.image_processor.preprocess(image) + + # 6. Set timesteps + timesteps = None # HPU Patch + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps + ) # HPU Patch + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) # HPU Patch + + # 7. Prepare latent variables + generator = torch.Generator(device="cpu") + generator.manual_seed(1) + latents = self.prepare_latents( + image, latent_timestep, batch_size, num_images_per_prompt, prompt_embeds.dtype, device, generator + ) + + # 8. 
Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 9. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i in range(num_inference_steps): + # HPU Patch + t = timesteps[0] + timesteps = torch.roll(timesteps, shifts=-1, dims=0) + + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + latent_model_input = torch.cat([latent_model_input, depth_mask], dim=1) + + # predict the noise residual + noise_pred = self.unet_hpu( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=self.cross_attention_kwargs, + ) + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if not self.use_hpu_graphs: + self.htcore.mark_step() + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + depth_mask = callback_outputs.pop("depth_mask", depth_mask) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False)[0] + else: + image = latents + + image = self.image_processor.postprocess(image, output_type=output_type) + self.maybe_free_model_hooks() + + if not return_dict: + return (image,) + + return ImagePipelineOutput(images=image) + + @torch.no_grad() + def unet_hpu(self, latent_model_input, timestep, encoder_hidden_states, cross_attention_kwargs): + if self.use_hpu_graphs: + return self.capture_replay(latent_model_input, timestep, encoder_hidden_states) + else: + return self.unet( + latent_model_input, + timestep, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + @torch.no_grad() + def capture_replay(self, latent_model_input, timestep, encoder_hidden_states): + inputs = [latent_model_input, timestep, encoder_hidden_states, False] + h = self.ht.hpu.graphs.input_hash(inputs) + cached = self.cache.get(h) + + if cached is None: + # Capture the graph and cache it + with self.ht.hpu.stream(self.hpu_stream): + graph = self.ht.hpu.HPUGraph() + graph.capture_begin() + outputs = self.unet(inputs[0], inputs[1], inputs[2], inputs[3])[0] + graph.capture_end() + graph_inputs = inputs + graph_outputs = outputs + 
self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) + return outputs + + # Replay the cached graph with updated inputs + self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) + cached.graph.replay() + self.ht.core.hpu.default_stream().synchronize() + + return cached.graph_outputs diff --git a/optimum/habana/diffusers/schedulers/scheduling_ddim.py b/optimum/habana/diffusers/schedulers/scheduling_ddim.py index d15420853f..9577b876ad 100644 --- a/optimum/habana/diffusers/schedulers/scheduling_ddim.py +++ b/optimum/habana/diffusers/schedulers/scheduling_ddim.py @@ -179,7 +179,7 @@ def roll_params(self): # return sample def _get_variance(self, alpha_prod_t, alpha_prod_t_prev): - beta_prod_t = 1 - alpha_prod_t + beta_prod_t = 1 - alpha_prod_t + 1e-8 # For numerical stability beta_prod_t_prev = 1 - alpha_prod_t_prev variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) @@ -323,6 +323,9 @@ def add_noise( # Make sure alphas_cumprod has same device and dtype as original_samples # Make sure alphas_cumprod and timestep have same device and dtype as original_samples self.alphas_cumprod = self.alphas_cumprod.to(device=original_samples.device, dtype=original_samples.dtype) + self.final_alpha_cumprod = self.final_alpha_cumprod.to( + device=original_samples.device, dtype=original_samples.dtype + ) timesteps = timesteps.to(original_samples.device) sqrt_alpha_prod = self.alphas_cumprod[timesteps] ** 0.5 diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 98edfc2243..8650b0e651 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -81,6 +81,9 @@ CLIPTokenizer, CLIPVisionConfig, CLIPVisionModelWithProjection, + DPTConfig, + DPTFeatureExtractor, + DPTForDepthEstimation, ) from transformers.testing_utils import parse_flag_from_env, slow @@ -93,6 +96,7 @@ GaudiEulerDiscreteScheduler, GaudiStableDiffusion3Pipeline, GaudiStableDiffusionControlNetPipeline, + GaudiStableDiffusionDepth2ImgPipeline, GaudiStableDiffusionImageVariationPipeline, GaudiStableDiffusionInpaintPipeline, GaudiStableDiffusionInstructPix2PixPipeline, @@ -125,6 +129,7 @@ TEXT_TO_VIDEO_SYNTHESIS_BF16_BASELINE = 70 DETERMINISTIC_IMAGE_GENERATION_THROUGHPUT = 0.946 THROUGHPUT_UNCONDITIONAL_IMAGE_BASELINE_BF16 = 7.671212047338486 + DEPTH2IMG_GENERATION_LATENCY_BASELINE_BF16 = 28.13371205329895 else: THROUGHPUT_BASELINE_BF16 = 0.309 THROUGHPUT_BASELINE_AUTOCAST = 0.114 @@ -137,6 +142,7 @@ DETERMINISTIC_IMAGE_GENERATION_THROUGHPUT = 0.302 THROUGHPUT_UNCONDITIONAL_IMAGE_BASELINE_BF16 = 3.095533166996529 TEXT_TO_VIDEO_SYNTHESIS_BF16_BASELINE = 1000 # TODO: Get Gaudi 1 benchmark numbers + DEPTH2IMG_GENERATION_LATENCY_BASELINE_BF16 = 200 # TODO: Get Gaudi 1 Throughput _run_custom_bf16_ops_test_ = parse_flag_from_env("CUSTOM_BF16_OPS", default=False) @@ -1998,6 +2004,248 @@ def test_stable_diffusion_multicontrolnet_hpu_graphs(self): self.assertEqual(images[-1].shape, (64, 64, 3)) +class GaudiStableDiffusionDepth2ImgPipelineTester(TestCase): + """ + Tests for depth to image generation + """ + + def get_dummy_components(self): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + sample_size=32, + in_channels=5, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + attention_head_dim=(2, 4), + use_linear_projection=True, + ) + scheduler = PNDMScheduler(skip_prk_steps=True) + torch.manual_seed(0) + vae = 
AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + backbone_config = { + "global_padding": "same", + "layer_type": "bottleneck", + "depths": [3, 4, 9], + "out_features": ["stage1", "stage2", "stage3"], + "embedding_dynamic_padding": True, + "hidden_sizes": [96, 192, 384, 768], + "num_groups": 2, + } + depth_estimator_config = DPTConfig( + image_size=32, + patch_size=16, + num_channels=3, + hidden_size=32, + num_hidden_layers=4, + backbone_out_indices=(0, 1, 2, 3), + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + is_decoder=False, + initializer_range=0.02, + is_hybrid=True, + backbone_config=backbone_config, + backbone_featmap_shape=[1, 384, 24, 24], + ) + depth_estimator = DPTForDepthEstimation(depth_estimator_config).eval() + feature_extractor = DPTFeatureExtractor.from_pretrained( + "hf-internal-testing/tiny-random-DPTForDepthEstimation" + ) + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "depth_estimator": depth_estimator, + "feature_extractor": feature_extractor, + } + return components + + def get_dummy_inputs(self, device, seed=0): + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)) + image = image.cpu().permute(0, 2, 3, 1)[0] + image = Image.fromarray(np.uint8(image)).convert("RGB").resize((32, 32)) + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "np", + } + return inputs + + def get_dummy_image(self, shape=(1, 3, 32, 32), seed=0): + image = floats_tensor(shape, rng=random.Random(seed)) + image = image.cpu().permute(0, 2, 3, 1)[0] + image = Image.fromarray(np.uint8(image)).convert("RGB").resize(shape[-2:]) + return image + + def test_depth2img_pipeline_default(self): + components = self.get_dummy_components() + inputs = self.get_dummy_inputs("cpu") + gaudi_config = GaudiConfig(use_torch_autocast=False) + + pipe = GaudiStableDiffusionDepth2ImgPipeline( + use_habana=True, + gaudi_config=gaudi_config, + **components, + ) + pipe.set_progress_bar_config(disable=None) + + outputs = pipe(**inputs) + image = outputs.images[0] + image = np.array(image) + image_slice = image[-3:, -3:, -1] + expected_slice = np.array( + [0.42007083, 0.44642246, 0.44746736, 0.4038852, 0.560547, 0.5513845, 0.5325784, 0.5170926, 0.46997207] + ) + + assert image.shape == (32, 32, 3) + assert np.allclose(image_slice.flatten(), expected_slice) + + def test_depth2img_pipeline_batch(self): + components = self.get_dummy_components() + gaudi_config = GaudiConfig(use_torch_autocast=False) + + pipe = GaudiStableDiffusionDepth2ImgPipeline( + use_habana=True, + gaudi_config=gaudi_config, + **components, + 
) + pipe.set_progress_bar_config(disable=None) + + outputs = pipe( + prompt=["A painting of a squirrel eating a burger", "A painting of a squirrel eating a burger"], + image=self.get_dummy_image(), + generator=torch.Generator("cpu").manual_seed(0), + num_inference_steps=2, + output_type="np", + ) + images = outputs.images + + assert len(images) == 2 + assert images[-1].shape == (32, 32, 3) + + def test_depth2img_pipeline_bf16(self): + components = self.get_dummy_components() + gaudi_config = GaudiConfig(use_torch_autocast=True) + + pipe = GaudiStableDiffusionDepth2ImgPipeline( + use_habana=True, + gaudi_config=gaudi_config, + **components, + ) + pipe.set_progress_bar_config(disable=None) + + outputs = pipe( + prompt="A painting of a squirrel eating a burger", + image=self.get_dummy_image(), + generator=torch.Generator("cpu").manual_seed(0), + num_inference_steps=2, + output_type="np", + ) + images = outputs.images + + assert len(images) == 1 + assert images[0].shape == (32, 32, 3) + + def test_depth2img_pipeline_hpu_graphs(self): + components = self.get_dummy_components() + gaudi_config = GaudiConfig(use_torch_autocast=False) + + pipe = GaudiStableDiffusionDepth2ImgPipeline( + use_habana=True, + use_hpu_graphs=True, + gaudi_config=gaudi_config, + **components, + ) + pipe.set_progress_bar_config(disable=None) + + outputs = pipe( + prompt="A painting of a squirrel eating a burger", + image=self.get_dummy_image(), + generator=torch.Generator("cpu").manual_seed(0), + num_inference_steps=2, + output_type="np", + ) + images = outputs.images + + assert len(images) == 1 + assert images[0].shape == (32, 32, 3) + + @slow + def test_depth2img_pipeline_latency_bf16(self): + gaudi_config = GaudiConfig(use_torch_autocast=True) + model_name = "stabilityai/stable-diffusion-2-depth" + scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") + + pipe = GaudiStableDiffusionDepth2ImgPipeline.from_pretrained( + model_name, gaudi_config=gaudi_config, scheduler=scheduler, use_habana=True, use_hpu_graphs=True + ) + image = Image.open( + requests.get( + "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png", + stream=True, + ).raw + ) + prompt = "A fancy meal with soup and pancakes" + + start_time = time.time() + outputs = pipe( + prompt=prompt, + image=image, + generator=torch.Generator("cpu").manual_seed(0), + num_inference_steps=50, + output_type="np", + ) + end_time = time.time() + latency = end_time - start_time + images = outputs.images + clip_score = calculate_clip_score(np.expand_dims(image, axis=0), [prompt]) + target_score = 22.76 + + assert len(images) == 1 + assert images[0].shape == (512, 512, 3) + assert clip_score > target_score + + assert latency < 1.05 * DEPTH2IMG_GENERATION_LATENCY_BASELINE_BF16 + + class TrainTextToImage(TestCase): """ Tests the Stable Diffusion text_to_image Training for Gaudi. 
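For reference, the depth-to-image pipeline introduced by this patch can also be driven directly from Python rather than through `examples/stable-diffusion/depth_to_image_generation.py`. Below is a minimal usage sketch assembled from the classes and arguments shown above (`GaudiStableDiffusionDepth2ImgPipeline`, `GaudiDDIMScheduler`, the `Habana/stable-diffusion` Gaudi configuration); the prompt, image URL, step count, and output filename are illustrative only, not part of the patch.

```python
# Minimal sketch of the GaudiStableDiffusionDepth2ImgPipeline added in this patch.
# Assumes optimum-habana with this patch applied, a Gaudi/HPU device, and Hub access
# to stabilityai/stable-diffusion-2-depth.
import requests
import torch
from PIL import Image

from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionDepth2ImgPipeline

model_name = "stabilityai/stable-diffusion-2-depth"
scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")

pipeline = GaudiStableDiffusionDepth2ImgPipeline.from_pretrained(
    model_name,
    scheduler=scheduler,
    use_habana=True,
    use_hpu_graphs=True,
    gaudi_config="Habana/stable-diffusion",
    torch_dtype=torch.bfloat16,
)

# Base image whose estimated depth map conditions the generation
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
init_image = Image.open(requests.get(url, stream=True).raw)

outputs = pipeline(
    prompt="two tigers",
    image=init_image,
    num_inference_steps=50,
    guidance_scale=7.5,
)
outputs.images[0].save("depth2img_result.png")
```

With `use_hpu_graphs=True`, the UNet forward pass goes through `unet_hpu`/`capture_replay` as defined in the new pipeline: the HPU graph is captured and cached on the first call for a given input hash and replayed on subsequent denoising steps.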
From d2f013b859ef5f3a675cd876cd486a3709239112 Mon Sep 17 00:00:00 2001 From: Vasudha <106226674+Vasud-ha@users.noreply.github.com> Date: Thu, 5 Sep 2024 18:41:14 +0530 Subject: [PATCH 24/41] Fix eager mode in run_generation by removing graph logs (#1231) --- examples/text-generation/utils.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index c535acba0a..5898b26671 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -135,10 +135,6 @@ def setup_env(args): # TODO: SW-167588 - WA for memory issue in hqt prep_model os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") - if args.global_rank == 0 and not args.torch_compile: - os.environ.setdefault("GRAPH_VISUALIZATION", "true") - shutil.rmtree(".graph_dumps", ignore_errors=True) - if args.world_size > 0: os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") From 1cc46c56719ab6f3f212ae15d980d6f244dcc10a Mon Sep 17 00:00:00 2001 From: kaixuanliu Date: Thu, 5 Sep 2024 22:00:17 +0800 Subject: [PATCH 25/41] Fix bug when running `google/paligemma-3b-mix-224` (#1279) Signed-off-by: kaixuanliu --- .../transformers/models/gemma/modeling_gemma.py | 17 +++-------------- 1 file changed, 3 insertions(+), 14 deletions(-) diff --git a/optimum/habana/transformers/models/gemma/modeling_gemma.py b/optimum/habana/transformers/models/gemma/modeling_gemma.py index 6c537dfa31..6f40c65eaa 100644 --- a/optimum/habana/transformers/models/gemma/modeling_gemma.py +++ b/optimum/habana/transformers/models/gemma/modeling_gemma.py @@ -36,10 +36,6 @@ ) from transformers.utils import logging -from ...modeling_attn_mask_utils import ( - _gaudi_prepare_4d_causal_attention_mask, -) - logger = logging.get_logger(__name__) @@ -97,6 +93,7 @@ def gaudi_gemma_attention_forward( attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scaling if attention_mask is not None: # no matter the length, we just slice it + attention_mask = attention_mask[:, :, :, : key_states.shape[-2]] attn_weights = attn_weights + attention_mask # upcast attention to fp32 @@ -196,7 +193,6 @@ def gaudi_gemma_model_forward( Copied from GemmaModel.forward: https://github.com/huggingface/transformers/blob/v4.38.1/src/transformers/models/gemma/modeling_gemma.py The only differences are: - add new args token_idx - - replace _update_causal_mask with _gaudi_prepare_4d_causal_attention_mask """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions @@ -210,12 +206,6 @@ def gaudi_gemma_model_forward( raise ValueError( "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" ) - elif input_ids is not None: - batch_size, seq_length = input_ids.shape[:2] - elif inputs_embeds is not None: - batch_size, seq_length = inputs_embeds.shape[:2] - else: - raise ValueError("You have to specify either input_ids or inputs_embeds") if self.gradient_checkpointing and self.training and use_cache: logger.warning_once("`use_cache=True` is incompatible with gradient checkpointing. 
Setting `use_cache=False`.") @@ -239,9 +229,8 @@ def gaudi_gemma_model_forward( if position_ids is None: position_ids = cache_position.unsqueeze(0) - # 4d mask is passed through the layers, not use self._update_causal_mask - causal_mask = _gaudi_prepare_4d_causal_attention_mask( - attention_mask, (batch_size, seq_length), inputs_embeds, past_seen_tokens + causal_mask = self._update_causal_mask( + attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions ) # embed positions From 979f04ace72fc1baae30e41456536ecbfa5247e7 Mon Sep 17 00:00:00 2001 From: Xinyu Chen Date: Thu, 5 Sep 2024 22:26:02 +0800 Subject: [PATCH 26/41] Use native checkpointing under compile mode (#1313) --- optimum/habana/transformers/trainer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/optimum/habana/transformers/trainer.py b/optimum/habana/transformers/trainer.py index 5c418e66b7..d375b32df0 100644 --- a/optimum/habana/transformers/trainer.py +++ b/optimum/habana/transformers/trainer.py @@ -682,7 +682,7 @@ def _inner_training_loop( import transformers.modeling_utils - if args.deepspeed: + if args.deepspeed and args.use_lazy_mode: from deepspeed.runtime.activation_checkpointing.checkpointing import CheckpointFunction # HACK because outputs should always be tuples From dfeafa3aa80b131cebbc6f447b83e8fddd91c42a Mon Sep 17 00:00:00 2001 From: Ranjeet Kumar Date: Thu, 5 Sep 2024 23:05:38 +0530 Subject: [PATCH 27/41] Fixed fused_qkv object AttributeError due to 'LlamaConfig' (#1203) Co-authored-by: Vidya Galli From 65d044e77692f01a299938ec06df0e5a6c4cb232 Mon Sep 17 00:00:00 2001 From: Daniel Huang Date: Thu, 5 Sep 2024 13:39:01 -0700 Subject: [PATCH 28/41] Image to Image Generation Enabling (#1196) Co-authored-by: Raymond Lau --- .../image_to_image_generation.py | 4 +- optimum/habana/diffusers/__init__.py | 1 + .../pipeline_stable_diffusion_img2img.py | 716 ++++++++++++++++++ .../diffusers/schedulers/scheduling_ddim.py | 2 +- tests/test_diffusers.py | 139 ++++ 5 files changed, 860 insertions(+), 2 deletions(-) create mode 100644 optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py diff --git a/examples/stable-diffusion/image_to_image_generation.py b/examples/stable-diffusion/image_to_image_generation.py index 64fecf4c91..b1ffae2101 100755 --- a/examples/stable-diffusion/image_to_image_generation.py +++ b/examples/stable-diffusion/image_to_image_generation.py @@ -57,8 +57,8 @@ def main(): ) parser.add_argument( "--src_image_path", - default=None, type=str, + required=True, help="Path to source image", ) # Pipeline arguments @@ -230,6 +230,8 @@ def main(): from optimum.habana.diffusers import GaudiStableDiffusionImageVariationPipeline as Img2ImgPipeline kwargs["revision"] = "v2.0" + else: + from optimum.habana.diffusers import GaudiStableDiffusionImg2ImgPipeline as Img2ImgPipeline if "image-variations" in args.model_name_or_path: im = PIL.Image.open(requests.get(args.src_image_path, stream=True).raw) diff --git a/optimum/habana/diffusers/__init__.py b/optimum/habana/diffusers/__init__.py index 82f3bae0cb..d3ec347d07 100644 --- a/optimum/habana/diffusers/__init__.py +++ b/optimum/habana/diffusers/__init__.py @@ -7,6 +7,7 @@ from .pipelines.stable_diffusion.pipeline_stable_diffusion_image_variation import ( GaudiStableDiffusionImageVariationPipeline, ) +from .pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import GaudiStableDiffusionImg2ImgPipeline from .pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint import 
GaudiStableDiffusionInpaintPipeline from .pipelines.stable_diffusion.pipeline_stable_diffusion_instruct_pix2pix import ( GaudiStableDiffusionInstructPix2PixPipeline, diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py new file mode 100644 index 0000000000..d7e36e983a --- /dev/null +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py @@ -0,0 +1,716 @@ +# coding=utf-8 +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +import time +from math import ceil +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +from diffusers.image_processor import PipelineImageInput +from diffusers.models import AutoencoderKL, UNet2DConditionModel +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput, StableDiffusionSafetyChecker +from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img import ( + StableDiffusionImg2ImgPipeline, + retrieve_latents, +) +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import deprecate +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from optimum.habana.utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment +from optimum.utils import logging + +from ....transformers.gaudi_configuration import GaudiConfig +from ..pipeline_utils import GaudiDiffusionPipeline + + +logger = logging.get_logger(__name__) + + +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. 
+ """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device="cpu", **kwargs) + timesteps = scheduler.timesteps.to(device) + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device="cpu", **kwargs) + timesteps = scheduler.timesteps.to(device) + + # Handles the case where the scheduler cannot implement reset_timestep_dependent_params() + # Example: UniPCMultiStepScheduler used for inference in ControlNet training as it has non-linear accesses to timestep dependent parameter: sigma. + if hasattr(scheduler, "reset_timestep_dependent_params") and callable(scheduler.reset_timestep_dependent_params): + scheduler.reset_timestep_dependent_params() + return timesteps, num_inference_steps + + +class GaudiStableDiffusionImg2ImgPipeline(GaudiDiffusionPipeline, StableDiffusionImg2ImgPipeline): + """ + Adapted from: https://github.com/huggingface/diffusers/blob/v0.26.3/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_img2img.py#L161 + Changes: + 1. Use CPU to generate random tensor + """ + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + use_habana: bool = False, + use_hpu_graphs: bool = False, + gaudi_config: Union[str, GaudiConfig] = None, + bf16_full_eval: bool = False, + ): + GaudiDiffusionPipeline.__init__( + self, + use_habana, + use_hpu_graphs, + gaudi_config, + bf16_full_eval, + ) + + StableDiffusionImg2ImgPipeline.__init__( + self, + vae, + text_encoder, + tokenizer, + unet, + scheduler, + safety_checker, + feature_extractor, + image_encoder, + requires_safety_checker, + ) + self.to(self._device) + + # Copied from ./pipeline_stable_diffusion.py + @classmethod + def _split_inputs_into_batches(cls, batch_size, latents, prompt_embeds, negative_prompt_embeds): + # Use torch.split to generate num_batches batches of size batch_size + latents_batches = list(torch.split(latents, batch_size)) + prompt_embeds_batches = list(torch.split(prompt_embeds, batch_size)) + if negative_prompt_embeds is not None: + negative_prompt_embeds_batches = list(torch.split(negative_prompt_embeds, batch_size)) + + # If the last batch has less samples than batch_size, pad it with dummy samples + num_dummy_samples = 0 + if latents_batches[-1].shape[0] < batch_size: + num_dummy_samples = batch_size - latents_batches[-1].shape[0] + # Pad latents_batches + sequence_to_stack = (latents_batches[-1],) + tuple( + torch.zeros_like(latents_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + latents_batches[-1] = torch.vstack(sequence_to_stack) + # Pad prompt_embeds_batches + sequence_to_stack = (prompt_embeds_batches[-1],) + tuple( + torch.zeros_like(prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) + # Pad negative_prompt_embeds_batches if necessary + if negative_prompt_embeds is not None: + sequence_to_stack = 
(negative_prompt_embeds_batches[-1],) + tuple( + torch.zeros_like(negative_prompt_embeds_batches[-1][0][None, :]) for _ in range(num_dummy_samples) + ) + negative_prompt_embeds_batches[-1] = torch.vstack(sequence_to_stack) + + # Stack batches in the same tensor + latents_batches = torch.stack(latents_batches) + if negative_prompt_embeds is not None: + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + for i, (negative_prompt_embeds_batch, prompt_embeds_batch) in enumerate( + zip(negative_prompt_embeds_batches, prompt_embeds_batches[:]) + ): + prompt_embeds_batches[i] = torch.cat([negative_prompt_embeds_batch, prompt_embeds_batch]) + + prompt_embeds_batches = torch.stack(prompt_embeds_batches) + + return latents_batches, prompt_embeds_batches, num_dummy_samples + + def prepare_latents(self, image, timestep, num_prompts, num_images_per_prompt, dtype, device, generator=None): + if not isinstance(image, (torch.Tensor, PIL.Image.Image, list)): + raise ValueError( + f"`image` has to be of type `torch.Tensor`, `PIL.Image.Image` or list but is {type(image)}" + ) + + image = image.to(device=device, dtype=dtype) + + batch_size = num_prompts * num_images_per_prompt + + if image.shape[1] == 4: + init_latents = image + + else: + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + elif isinstance(generator, list): + init_latents = [ + retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i]) + for i in range(batch_size) + ] + init_latents = torch.cat(init_latents, dim=0) + else: + init_latents = retrieve_latents(self.vae.encode(image), generator=generator) + + init_latents = self.vae.config.scaling_factor * init_latents + + if batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] == 0: + # expand init_latents for batch_size + deprecation_message = ( + f"You have passed {batch_size} text prompts (`prompt`), but only {init_latents.shape[0]} initial" + " images (`image`). Initial images are now duplicating to match the number of text prompts. Note" + " that this behavior is deprecated and will be removed in a version 1.0.0. Please make sure to update" + " your script to pass as many initial images as text prompts to suppress this warning." + ) + deprecate("len(prompt) != len(image)", "1.0.0", deprecation_message, standard_warn=False) + additional_image_per_prompt = batch_size // init_latents.shape[0] + init_latents = torch.cat([init_latents] * additional_image_per_prompt, dim=0) + elif batch_size > init_latents.shape[0] and batch_size % init_latents.shape[0] != 0: + raise ValueError( + f"Cannot duplicate `image` of batch size {init_latents.shape[0]} to {batch_size} text prompts." 
+ ) + else: + init_latents = torch.cat([init_latents], dim=0) + + # Reuse first generator for noise + if isinstance(generator, list): + generator = generator[0] + + shape = init_latents.shape + rand_device = "cpu" if device.type == "hpu" else device + noise = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype) # HPU Patch + noise = noise.to(device) # HPU Patch + + # get latents + init_latents = self.scheduler.add_noise(init_latents, noise, timestep) + latents = init_latents + + return latents + + @torch.no_grad() + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + strength: float = 0.8, + num_inference_steps: Optional[int] = 50, + timesteps: List[int] = None, + guidance_scale: Optional[float] = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + batch_size: int = 1, + eta: Optional[float] = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + clip_skip: int = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + profiling_warmup_steps: Optional[int] = 0, + profiling_steps: Optional[int] = 0, + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, or `List[np.ndarray]`): + `Image`, numpy array or tensor representing an image batch to be used as the starting point. For both + numpy array and pytorch tensor, the expected value range is between `[0, 1]` If it's a tensor or a list + or tensors, the expected shape should be `(B, C, H, W)` or `(C, H, W)`. If it is a numpy array or a + list of arrays, the expected shape should be `(B, H, W, C)` or `(H, W, C)` It can also accept image + latents as `image`, but if passing latents directly it is not encoded again. + strength (`float`, *optional*, defaults to 0.8): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. 
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                A higher guidance scale value encourages the model to generate images closely linked to the text
+                `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts to guide what to not include in image generation. If not defined, you need to
+                pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            batch_size (`int`, *optional*, defaults to 1):
+                The number of images in a batch.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies
+                to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers.
+            generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
+                A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make
+                generation deterministic.
+            prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not
+                provided, text embeddings are generated from the `prompt` input argument.
+            negative_prompt_embeds (`torch.FloatTensor`, *optional*):
+                Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If
+                not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument.
+            ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that
+                the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+            profiling_warmup_steps (`int`, *optional*):
+                Number of steps to ignore for profiling.
+            profiling_steps (`int`, *optional*):
+                Number of steps to be captured when enabling profiling.
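[Editor's note] For readers skimming the diff, the sketch below shows how the new `GaudiStableDiffusionImg2ImgPipeline` is meant to be called with the arguments documented above. It is a minimal sketch, not part of the patch: the checkpoint, the DDIM scheduler, the `gaudi_config` name and the synthetic input image are illustrative assumptions.

```python
from PIL import Image

from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionImg2ImgPipeline

# Assumed checkpoint and Gaudi config; swap in your own.
model_name = "CompVis/stable-diffusion-v1-4"
scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
pipeline = GaudiStableDiffusionImg2ImgPipeline.from_pretrained(
    model_name,
    scheduler=scheduler,
    use_habana=True,
    use_hpu_graphs=True,
    gaudi_config="Habana/stable-diffusion",
)

# Any RGB image can serve as the starting point; a flat gray image keeps the sketch self-contained.
init_image = Image.new("RGB", (512, 512), color=(127, 127, 127))

outputs = pipeline(
    prompt="A fantasy landscape, trending on artstation",
    image=init_image,
    strength=0.75,
    num_inference_steps=30,
    guidance_scale=7.5,
    num_images_per_prompt=4,
    batch_size=2,
)
images = outputs.images
```

With `batch_size=2`, the four requested images are generated as two batches, which is the batching behavior implemented by `_split_inputs_into_batches` earlier in this file.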
+ Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider use `callback_on_step_end`", + ) + + with torch.autocast(device_type="hpu", dtype=torch.bfloat16, enabled=self.gaudi_config.use_torch_autocast): + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + strength, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + self._interrupt = False + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + num_prompts = 1 + elif prompt is not None and isinstance(prompt, list): + num_prompts = len(prompt) + else: + num_prompts = prompt_embeds.shape[0] + num_batches = ceil((num_images_per_prompt * num_prompts) / batch_size) + logger.info( + f"{num_prompts} prompt(s) received, {num_images_per_prompt} generation(s) per prompt," + f" {batch_size} sample(s) per batch, {num_batches} total batch(es)." + ) + if num_batches < 3: + logger.warning("The first two iterations are slower so it is recommended to feed more batches.") + device = self._execution_device + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + + if ip_adapter_image is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, device, batch_size * num_images_per_prompt + ) + + # 4. Preprocess image + image = self.image_processor.preprocess(image) + + # 5. set timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + latent_timestep = timesteps[:1].repeat(num_prompts * num_images_per_prompt) + + # 6. Prepare latent variables + latents = self.prepare_latents( + image, + latent_timestep, + num_prompts, + num_images_per_prompt, + prompt_embeds.dtype, + device, + generator, + ) + + # 7. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = {"image_embeds": image_embeds} if ip_adapter_image is not None else None + + # 7.2 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat( + batch_size * num_images_per_prompt + ) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 8. Split into batches (HPU-specific step) + latents_batches, text_embeddings_batches, num_dummy_samples = self._split_inputs_into_batches( + batch_size, + latents, + prompt_embeds, + negative_prompt_embeds, + ) + + outputs = { + "images": [], + "has_nsfw_concept": [], + } + + t0 = time.time() + t1 = t0 + + hb_profiler = HabanaProfile( + warmup=profiling_warmup_steps, + active=profiling_steps, + record_shapes=False, + ) + hb_profiler.start() + + # 9. Denoising loop + throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) + use_warmup_inference_steps = num_batches < throughput_warmup_steps < num_inference_steps + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + self._num_timesteps = len(timesteps) + for j in self.progress_bar(range(num_batches)): + # The throughput is calculated from the 3rd iteration + # because compilation occurs in the first two iterations + if j == throughput_warmup_steps: + t1 = time.time() + if use_warmup_inference_steps: + t0_inf = time.time() + + latents_batch = latents_batches[0] + latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) + text_embeddings_batch = text_embeddings_batches[0] + text_embeddings_batches = torch.roll(text_embeddings_batches, shifts=-1, dims=0) + + for i in range(num_inference_steps): # HPU Patch + if use_warmup_inference_steps and i == throughput_warmup_steps: + t1_inf = time.time() + t1 += t1_inf - t0_inf + + t = timesteps[0] # HPU Patch + timesteps = torch.roll(timesteps, shifts=-1, dims=0) # HPU Patch + + if self.interrupt: + continue + + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents_batch] * 2) if self.do_classifier_free_guidance else latents_batch + ) + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # predict the noise residual + noise_pred = self.unet_hpu( + latent_model_input, + t, + encoder_hidden_states=text_embeddings_batch, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + ) + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents_batch = self.scheduler.step( + noise_pred, t, latents_batch, **extra_step_kwargs, return_dict=False + )[0] + + # HPU Patch + if not self.use_hpu_graphs: + self.htcore.mark_step() + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents_batch = callback_outputs.pop("latents", latents_batch) + text_embeddings_batch = 
callback_outputs.pop("prompt_embeds", text_embeddings_batch) + # negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + hb_profiler.step() + + if use_warmup_inference_steps: + t1 = warmup_inference_steps_time_adjustment( + t1, t1_inf, num_inference_steps, throughput_warmup_steps + ) + + if not output_type == "latent": + image = self.vae.decode( + latents_batch / self.vae.config.scaling_factor, return_dict=False, generator=generator + )[0] + else: + image = latents_batch + + outputs["images"].append(image) + + hb_profiler.stop() + + speed_metrics_prefix = "generation" + speed_measures = speed_metrics( + split=speed_metrics_prefix, + start_time=t0, + num_samples=num_batches * batch_size + if t1 == t0 or use_warmup_inference_steps + else (num_batches - throughput_warmup_steps) * batch_size, + num_steps=num_batches, + start_time_after_warmup=t1, + ) + logger.info(f"Speed metrics: {speed_measures}") + + # Remove dummy generations if needed + if num_dummy_samples > 0: + outputs["images"][-1] = outputs["images"][-1][:-num_dummy_samples] + + # Process generated images + for i, image in enumerate(outputs["images"][:]): + if i == 0: + outputs["images"].clear() + + if output_type == "latent": + has_nsfw_concept = None + else: + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + if output_type == "pil" and isinstance(image, list): + outputs["images"] += image + elif output_type in ["np", "numpy"] and isinstance(image, np.ndarray): + if len(outputs["images"]) == 0: + outputs["images"] = image + else: + outputs["images"] = np.concatenate((outputs["images"], image), axis=0) + else: + if len(outputs["images"]) == 0: + outputs["images"] = image + else: + outputs["images"] = torch.cat((outputs["images"], image), 0) + + if has_nsfw_concept is not None: + outputs["has_nsfw_concept"] += has_nsfw_concept + else: + outputs["has_nsfw_concept"] = None + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (outputs["images"], outputs["has_nsfw_concept"]) + + return StableDiffusionPipelineOutput( + images=outputs["images"], nsfw_content_detected=outputs["has_nsfw_concept"] + ) + + @torch.no_grad() + def unet_hpu( + self, + latent_model_input, + timestep, + encoder_hidden_states, + timestep_cond, + cross_attention_kwargs, + added_cond_kwargs, + ): + if self.use_hpu_graphs: + return self.capture_replay( + latent_model_input, + timestep, + encoder_hidden_states, + timestep_cond, + cross_attention_kwargs, + added_cond_kwargs, + ) + else: + return self.unet( + latent_model_input, + timestep, + encoder_hidden_states=encoder_hidden_states, + timestep_cond=timestep_cond, + cross_attention_kwargs=cross_attention_kwargs, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + @torch.no_grad() + def capture_replay( + self, + latent_model_input, + timestep, + encoder_hidden_states, + timestep_cond, + cross_attention_kwargs, + added_cond_kwargs, + ): + inputs = [ + 
latent_model_input, + timestep, + encoder_hidden_states, + timestep_cond, + cross_attention_kwargs, + added_cond_kwargs, + False, + ] + + h = self.ht.hpu.graphs.input_hash(inputs) + cached = self.cache.get(h) + + if cached is None: + # Capture the graph and cache it + with self.ht.hpu.stream(self.hpu_stream): + graph = self.ht.hpu.HPUGraph() + graph.capture_begin() + outputs = self.unet( + inputs[0], + inputs[1], + encoder_hidden_states=inputs[2], + timestep_cond=inputs[3], + cross_attention_kwargs=inputs[4], + added_cond_kwargs=inputs[5], + return_dict=inputs[6], + )[0] + graph.capture_end() + graph_inputs = inputs + graph_outputs = outputs + self.cache[h] = self.ht.hpu.graphs.CachedParams(graph_inputs, graph_outputs, graph) + return outputs + + # Replay the cached graph with updated inputs + self.ht.hpu.graphs.copy_to(cached.graph_inputs, inputs) + cached.graph.replay() + self.ht.core.hpu.default_stream().synchronize() + + return cached.graph_outputs diff --git a/optimum/habana/diffusers/schedulers/scheduling_ddim.py b/optimum/habana/diffusers/schedulers/scheduling_ddim.py index 9577b876ad..c1d521fdda 100644 --- a/optimum/habana/diffusers/schedulers/scheduling_ddim.py +++ b/optimum/habana/diffusers/schedulers/scheduling_ddim.py @@ -184,7 +184,7 @@ def _get_variance(self, alpha_prod_t, alpha_prod_t_prev): variance = (beta_prod_t_prev / beta_prod_t) * (1 - alpha_prod_t / alpha_prod_t_prev) - return variance + return torch.relu(variance) # Negative variance bug fix def step( self, diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 8650b0e651..e6c814efe8 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -98,6 +98,7 @@ GaudiStableDiffusionControlNetPipeline, GaudiStableDiffusionDepth2ImgPipeline, GaudiStableDiffusionImageVariationPipeline, + GaudiStableDiffusionImg2ImgPipeline, GaudiStableDiffusionInpaintPipeline, GaudiStableDiffusionInstructPix2PixPipeline, GaudiStableDiffusionLDM3DPipeline, @@ -2924,6 +2925,144 @@ def test_stable_diffusion_pix2pix_euler(self): self.assertLess(np.abs(image_slice.flatten() - expected_slice).max(), 1e-3) +class GaudiStableDiffusionImg2ImgPipelineTests(TestCase): + """ + Tests the class StableDiffusionImg2ImgPipeline for Gaudi. 
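[Editor's note] The `capture_replay` method above follows a capture-once/replay-many pattern for HPU graphs: hash the call inputs, capture a graph on a cache miss, and on later calls copy the new tensor values into the captured inputs and replay. The sketch below isolates that pattern for an arbitrary module. It only reuses the `habana_frameworks.torch` helpers already called in the diff (`input_hash`, `HPUGraph`, `CachedParams`, `copy_to`, `default_stream`); the wrapper class and the stream construction are assumptions, not the patch's implementation.

```python
import habana_frameworks.torch as ht


class CachedGraphRunner:
    """Minimal HPU graph capture/replay cache keyed on the hash of the call inputs."""

    def __init__(self, module):
        self.module = module
        self.cache = {}
        self.hpu_stream = ht.hpu.Stream()  # assumed, mirroring the pipeline base class

    def __call__(self, *inputs):
        inputs = list(inputs)
        key = ht.hpu.graphs.input_hash(inputs)
        cached = self.cache.get(key)
        if cached is None:
            # First call for this input signature: record the graph while computing the outputs.
            with ht.hpu.stream(self.hpu_stream):
                graph = ht.hpu.HPUGraph()
                graph.capture_begin()
                outputs = self.module(*inputs)
                graph.capture_end()
            self.cache[key] = ht.hpu.graphs.CachedParams(inputs, outputs, graph)
            return outputs
        # Later calls: copy the new tensor values into the captured inputs and replay the graph.
        ht.hpu.graphs.copy_to(cached.graph_inputs, inputs)
        cached.graph.replay()
        ht.core.hpu.default_stream().synchronize()
        return cached.graph_outputs
```

The cache key is derived from the input structure, so a shape or configuration change triggers a fresh capture rather than replaying a stale graph, which matches how `unet_hpu` and `capture_replay` are used in the denoising loop above.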
+ Adapted from: https://github.com/huggingface/diffusers/blob/v0.26.3/tests/pipelines/stable_diffusion/test_stable_diffusion_img2img.py + """ + + def get_dummy_components(self, time_cond_proj_dim=None): + torch.manual_seed(0) + unet = UNet2DConditionModel( + block_out_channels=(32, 64), + layers_per_block=2, + time_cond_proj_dim=time_cond_proj_dim, + sample_size=32, + in_channels=4, + out_channels=4, + down_block_types=("DownBlock2D", "CrossAttnDownBlock2D"), + up_block_types=("CrossAttnUpBlock2D", "UpBlock2D"), + cross_attention_dim=32, + ) + scheduler = PNDMScheduler(skip_prk_steps=True, steps_offset=1) + torch.manual_seed(0) + vae = AutoencoderKL( + block_out_channels=[32, 64], + in_channels=3, + out_channels=3, + down_block_types=["DownEncoderBlock2D", "DownEncoderBlock2D"], + up_block_types=["UpDecoderBlock2D", "UpDecoderBlock2D"], + latent_channels=4, + ) + torch.manual_seed(0) + text_encoder_config = CLIPTextConfig( + bos_token_id=0, + eos_token_id=2, + hidden_size=32, + intermediate_size=37, + layer_norm_eps=1e-05, + num_attention_heads=4, + num_hidden_layers=5, + pad_token_id=1, + vocab_size=1000, + ) + text_encoder = CLIPTextModel(text_encoder_config) + tokenizer = CLIPTokenizer.from_pretrained("hf-internal-testing/tiny-random-clip") + + components = { + "unet": unet, + "scheduler": scheduler, + "vae": vae, + "text_encoder": text_encoder, + "tokenizer": tokenizer, + "safety_checker": None, + "feature_extractor": None, + "image_encoder": None, + "use_habana": True, + "use_hpu_graphs": True, + "gaudi_config": GaudiConfig(use_torch_autocast=False), + } + return components + + def get_dummy_tiny_autoencoder(self): + return AutoencoderTiny(in_channels=3, out_channels=3, latent_channels=4) + + def get_dummy_inputs(self, device, seed=0): + image = floats_tensor((1, 3, 32, 32), rng=random.Random(seed)).to(device) + image = image / 2 + 0.5 + if str(device).startswith("mps"): + generator = torch.manual_seed(seed) + else: + generator = torch.Generator(device=device).manual_seed(seed) + inputs = { + "prompt": "A painting of a squirrel eating a burger", + "image": image, + "generator": generator, + "num_inference_steps": 2, + "guidance_scale": 6.0, + "output_type": "np", + } + return inputs + + def test_stable_diffusion_img2img_default_case(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = GaudiStableDiffusionImg2ImgPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + image = sd_pipe(**inputs).images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array( + [0.50006074, 0.49048987, 0.51323986, 0.5654023, 0.5470734, 0.6720333, 0.6559875, 0.5050407, 0.5401596] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def test_stable_diffusion_img2img_negative_prompt(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = GaudiStableDiffusionImg2ImgPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + negative_prompt = "french fries" + output = sd_pipe(**inputs, negative_prompt=negative_prompt) + image = output.images + image_slice = image[0, -3:, -3:, -1] + + assert image.shape == (1, 32, 32, 3) + expected_slice = np.array( + [0.5165765, 0.49377573, 0.5040854, 
0.5882658, 0.574415, 0.67791325, 0.66678274, 0.51392066, 0.544225] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + def test_stable_diffusion_img2img_multiple_init_images(self): + device = "cpu" # ensure determinism for the device-dependent torch.Generator + components = self.get_dummy_components() + sd_pipe = GaudiStableDiffusionImg2ImgPipeline(**components) + sd_pipe = sd_pipe.to(device) + sd_pipe.set_progress_bar_config(disable=None) + + inputs = self.get_dummy_inputs(device) + inputs["prompt"] = [inputs["prompt"]] * 2 + inputs["image"] = inputs["image"].repeat(2, 1, 1, 1) + image = sd_pipe(**inputs).images + image_slice = image[-1, -3:, -3:, -1] + + assert image.shape == (2, 32, 32, 3) + expected_slice = np.array( + [0.3323526, 0.44501957, 0.51663095, 0.32356155, 0.40758416, 0.6448872, 0.44775, 0.5695873, 0.5541928] + ) + + assert np.abs(image_slice.flatten() - expected_slice).max() < 1e-3 + + class GaudiStableDiffusionImageVariationPipelineTests(TestCase): """ Tests the class StableDiffusionImageVariationPipeline for Gaudi. From 2270eef4252de14a16ce640afb505634eb24a96f Mon Sep 17 00:00:00 2001 From: Iman Gohari Date: Fri, 6 Sep 2024 05:45:04 -0700 Subject: [PATCH 29/41] Diffusers timing (#1277) --- .../controlnet/pipeline_controlnet.py | 21 ++++++++++++++++--- .../pipeline_stable_diffusion.py | 4 ++-- ...peline_stable_diffusion_image_variation.py | 4 ++-- .../pipeline_stable_diffusion_inpaint.py | 4 ++-- ...eline_stable_diffusion_instruct_pix2pix.py | 5 +++-- .../pipeline_stable_diffusion_ldm3d.py | 4 ++-- .../pipeline_stable_diffusion_upscale.py | 4 ++-- .../pipeline_stable_diffusion_xl.py | 4 ++-- .../pipeline_stable_diffusion_xl_img2img.py | 4 ++-- .../pipeline_stable_diffusion_xl_inpaint.py | 4 ++-- .../pipeline_stable_video_diffusion.py | 4 ++-- tests/test_diffusers.py | 4 ++++ 12 files changed, 43 insertions(+), 23 deletions(-) diff --git a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py index 121a752cae..bb1f6ba354 100644 --- a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -31,7 +31,7 @@ from optimum.utils import logging from ....transformers.gaudi_configuration import GaudiConfig -from ....utils import HabanaProfile, speed_metrics +from ....utils import HabanaProfile, speed_metrics, warmup_inference_steps_time_adjustment from ..pipeline_utils import GaudiDiffusionPipeline from ..stable_diffusion.pipeline_stable_diffusion import ( GaudiStableDiffusionPipeline, @@ -497,11 +497,17 @@ def __call__( # 8. 
Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) + use_warmup_inference_steps = ( + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + ) + for j in self.progress_bar(range(num_batches)): # The throughput is calculated from the 3rd iteration # because compilation occurs in the first two iterations if j == throughput_warmup_steps: t1 = time.time() + if use_warmup_inference_steps: + t0_inf = time.time() latents_batch = latents_batches[0] latents_batches = torch.roll(latents_batches, shifts=-1, dims=0) @@ -510,6 +516,10 @@ def __call__( num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order for i in range(num_inference_steps): + if use_warmup_inference_steps and i == throughput_warmup_steps: + t1_inf = time.time() + t1 += t1_inf - t0_inf + t = timesteps[0] timesteps = torch.roll(timesteps, shifts=-1, dims=0) @@ -598,6 +608,11 @@ def __call__( hb_profiler.step() + if use_warmup_inference_steps: + t1 = warmup_inference_steps_time_adjustment( + t1, t1_inf, num_inference_steps, throughput_warmup_steps + ) + if not output_type == "latent": # 8. Post-processing output_image = self.vae.decode( @@ -617,9 +632,9 @@ def __call__( split=speed_metrics_prefix, start_time=t0, num_samples=num_batches * batch_size - if t1 == t0 + if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 118ec641ff..49d221a7a4 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -496,7 +496,7 @@ def __call__( # 8. 
Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) for j in self.progress_bar(range(num_batches)): @@ -600,7 +600,7 @@ def __call__( num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index 1c5964b3f7..e681d3c0be 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -313,7 +313,7 @@ def __call__( t1 = t0 throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) for j in self.progress_bar(range(num_batches)): # The throughput is calculated from the 3rd iteration @@ -376,7 +376,7 @@ def __call__( num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index 6b4331c763..ffe80d4a40 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -553,7 +553,7 @@ def __call__( num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) self._num_timesteps = len(timesteps) @@ -715,7 +715,7 @@ def __call__( num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index f87c59ece4..c2600e4dfa 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -396,7 +396,7 @@ def 
__call__( t1 = t0 throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) for j in self.progress_bar(range(num_batches)): # The throughput is calculated from the 3rd iteration @@ -473,6 +473,7 @@ def __call__( step_idx = i // getattr(self.scheduler, "order", 1) callback(step_idx, t, latents_batch) hb_profiler.step() + if use_warmup_inference_steps: t1 = warmup_inference_steps_time_adjustment( t1, t1_inf, num_inference_steps, throughput_warmup_steps @@ -494,7 +495,7 @@ def __call__( num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py index b60b6d89fc..9de11b3ae8 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -341,7 +341,7 @@ def __call__( # 8. Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) for j in self.progress_bar(range(num_batches)): @@ -420,7 +420,7 @@ def __call__( num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 477871eb40..79f80e4b67 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -438,7 +438,7 @@ def __call__( # 10. 
Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) for j in self.progress_bar(range(num_batches)): @@ -541,7 +541,7 @@ def __call__( num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index c1a34e77d3..8785dbbd26 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -693,7 +693,7 @@ def __call__( # 8.3 Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) for j in self.progress_bar(range(num_batches)): @@ -823,7 +823,7 @@ def __call__( num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index f146edb4d5..79fad16655 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -540,7 +540,7 @@ def denoising_value_valid(dnv): # 8.3 Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) for j in self.progress_bar(range(num_batches)): # The throughput is calculated from the 3rd iteration @@ -672,7 +672,7 @@ def denoising_value_valid(dnv): num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index 52db884123..accb64fd6a 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -752,7 +752,7 @@ def 
denoising_value_valid(dnv): t1 = t0 throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) for j in self.progress_bar(range(num_batches)): @@ -920,7 +920,7 @@ def denoising_value_valid(dnv): num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/optimum/habana/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py b/optimum/habana/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py index 3a1c127a5f..25f122c960 100644 --- a/optimum/habana/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py +++ b/optimum/habana/diffusers/pipelines/stable_video_diffusion/pipeline_stable_video_diffusion.py @@ -472,7 +472,7 @@ def __call__( # 10. Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( - num_batches < throughput_warmup_steps and num_inference_steps > throughput_warmup_steps + num_batches <= throughput_warmup_steps and num_inference_steps > throughput_warmup_steps ) self._num_timesteps = len(timesteps) for j in self.progress_bar(range(num_batches)): @@ -552,7 +552,7 @@ def __call__( num_samples=num_batches * batch_size if t1 == t0 or use_warmup_inference_steps else (num_batches - throughput_warmup_steps) * batch_size, - num_steps=num_batches, + num_steps=num_batches * batch_size * num_inference_steps, start_time_after_warmup=t1, ) logger.info(f"Speed metrics: {speed_measures}") diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index e6c814efe8..35c6493697 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -31,7 +31,9 @@ from unittest import TestCase, skipIf, skipUnless import diffusers +import habana_frameworks.torch.hpu as hthpu import numpy as np +import pytest import requests import safetensors import torch @@ -810,6 +812,7 @@ def test_no_generation_regression_upscale(self): self.assertLess(np.abs(expected_slice - upscaled_image[-3:, -3:, -1].flatten()).max(), 5e-3) @slow + @pytest.mark.skipif(hthpu.is_available() and hthpu.device_count() != 8, reason="system does not have 8 cards") def test_textual_inversion(self): path_to_script = ( Path(os.path.dirname(__file__)).parent @@ -2347,6 +2350,7 @@ def test_train_controlnet_script(self): self.assertEqual(return_code, 0) @slow + @pytest.mark.skipif(hthpu.is_available() and hthpu.device_count() != 8, reason="system does not have 8 cards") def test_train_controlnet(self): with tempfile.TemporaryDirectory() as tmpdir: path_to_script = ( From 5d11ab177426cd11bbc1de1181b11d5991eee1f9 Mon Sep 17 00:00:00 2001 From: "Wang, Yi" Date: Fri, 6 Sep 2024 20:47:44 +0800 Subject: [PATCH 30/41] Fix eos issue in finetune/generation (#1253) Signed-off-by: Wang, Yi A Co-authored-by: Yaser Afshar --- examples/language-modeling/README.md | 29 ++++++++++++ examples/language-modeling/run_lora_clm.py | 53 ++++++++++++++++++---- 2 files changed, 72 insertions(+), 10 deletions(-) diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md index f15af55920..57cac19713 100644 --- 
a/examples/language-modeling/README.md +++ b/examples/language-modeling/README.md @@ -535,6 +535,35 @@ python ../gaudi_spawn.py \ --use_cache False ``` +- Multi-card finetuning of gemma2 using chat template: +```bash +python ../gaudi_spawn.py \ + --world_size 2 --use_mpi run_lora_clm.py \ + --model_name_or_path google/gemma-2b-it \ + --per_device_train_batch_size 16 \ + --per_device_eval_batch_size 16 \ + --do_train \ + --do_eval \ + --num_train_epochs 15 \ + --output_dir ./output/2b_2hpu_16bs_15ep \ + --save_total_limit 1 \ + --gaudi_config_name Habana/gpt2 \ + --use_habana \ + --gradient_checkpointing \ + --throughput_warmup_steps 3 \ + --use_lazy_mode \ + --pipelining_fwd_bwd \ + --bf16 \ + --logging_strategy epoch \ + --evaluation_strategy epoch \ + --lora_target_modules "q_proj" "o_proj" "k_proj" "v_proj" "gate_proj" "up_proj" "down_proj" \ + --lora_rank=8 \ + --lora_alpha=16 \ + --lora_dropout=0.05 \ + --dataset_name mamamiya405/finred \ + --chat_prompt True +``` + - Multi-card finetuning of Falcon-40B: ```bash LOWER_LIST=ops_bf16.txt python3 ../gaudi_spawn.py \ diff --git a/examples/language-modeling/run_lora_clm.py b/examples/language-modeling/run_lora_clm.py index 4640bb0dc9..9c70b33985 100644 --- a/examples/language-modeling/run_lora_clm.py +++ b/examples/language-modeling/run_lora_clm.py @@ -263,6 +263,10 @@ class DataArguments: default=False, metadata={"help": "Whether to have a SQL style prompt"}, ) + chat_prompt: bool = field( + default=False, + metadata={"help": "Whether to have a chat style prompt."}, + ) save_last_ckpt: bool = field( default=True, metadata={"help": "Whether to save checkpoint at the end of the training."} ) @@ -401,6 +405,25 @@ def create_prompts(examples): return prompts +def create_chat_prompts(examples, tokenizer): + prompts = {} + prompts["source"] = [] + prompts["target"] = [] + for example in examples: + prompt = [ + { + "role": "user", + "content": "Answer the below Query based on the Content given below. 
#### Query: {instruction} #### Content: {input}".format_map( + example + ), + }, + ] + source = tokenizer.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True) + prompts["source"].append(source) + prompts["target"].append(example["output"]) + return prompts + + def create_sql_prompts(examples): prompts = {} prompts["source"] = [] @@ -624,11 +647,12 @@ def main(): data_args.output_column_name, "answer" if data_args.sql_prompt else "output" ) - prompts = ( - create_prompts(raw_datasets[key]) - if not data_args.sql_prompt - else create_sql_prompts(raw_datasets[key]) - ) + if data_args.chat_prompt: + prompts = create_chat_prompts(raw_datasets[key], tokenizer) + elif data_args.sql_prompt: + prompts = create_sql_prompts(raw_datasets[key]) + else: + prompts = create_prompts(raw_datasets[key]) columns_to_be_removed = list(raw_datasets[key].features.keys()) raw_datasets[key] = raw_datasets[key].add_column("prompt_sources", prompts["source"]) raw_datasets[key] = raw_datasets[key].add_column("prompt_targets", prompts["target"]) @@ -676,12 +700,15 @@ def main(): if tokenizer.pad_token_id is None: tokenizer.pad_token_id = tokenizer.eos_token_id - def tokenize(prompt, add_eos_token=True): + def tokenize(prompt, add_eos_token=True, add_bos_token=True): + add_eos_token_o = tokenizer.add_eos_token + add_bos_token_o = tokenizer.add_bos_token if not data_args.dataset_concatenation: - add_eos_token = False + tokenizer.add_eos_token = add_eos_token padding = "max_length" else: padding = False + tokenizer.add_bos_token = add_bos_token results = tokenizer( prompt, truncation=True, @@ -689,6 +716,9 @@ def tokenize(prompt, add_eos_token=True): padding=padding, return_tensors=None, ) + # restore original value + tokenizer.add_eos_token = add_eos_token_o + tokenizer.add_bos_token = add_bos_token_o for i in range(len(results["input_ids"])): if ( results["input_ids"][i][-1] != tokenizer.eos_token_id @@ -708,12 +738,12 @@ def preprocess_function(examples): raise ValueError(f"Unsupported dataset format, number of keys {keys} !=2") st = [s + t for s, t in zip(examples[keys[0]], examples[keys[1]])] - - examples_tokenized = tokenize(st) + add_bos_token = False if data_args.chat_prompt else True + examples_tokenized = tokenize(st, add_bos_token=add_bos_token) input_ids = examples_tokenized["input_ids"] labels = examples_tokenized["labels"] if not finetune_args.train_on_inputs: - sources_tokenized = tokenize(examples[keys[0]], add_eos_token=False) + sources_tokenized = tokenize(examples[keys[0]], add_eos_token=False, add_bos_token=add_bos_token) for label, source_len in zip(labels, sources_tokenized["input_id_len"]): label[:source_len] = [IGNORE_INDEX] * source_len return { @@ -785,6 +815,9 @@ def compute_metrics(eval_preds): # by preprocess_logits_for_metrics but we need to shift the labels labels = labels[:, 1:].reshape(-1) preds = preds[:, :-1].reshape(-1) + mask = labels != -100 + labels = labels[mask] + preds = preds[mask] return metric.compute(predictions=preds, references=labels) # Data collator From 85704533518c6c1d54a144dc7a739f30a36ec6c2 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Fri, 6 Sep 2024 16:33:43 +0200 Subject: [PATCH 31/41] Update CI, tests and examples (#1315) --- README.md | 2 +- docs/source/quickstart.mdx | 2 +- docs/source/tutorials/stable_diffusion.mdx | 4 ++-- .../usage_guides/accelerate_inference.mdx | 2 +- examples/stable-diffusion/README.md | 18 ++++++++--------- .../image_to_image_generation.py | 2 +- .../text_to_image_generation.py 
| 4 ++-- examples/stable-diffusion/training/README.md | 20 ++++++++++--------- .../training/train_controlnet.py | 10 ++++++++++ examples/trl/README.md | 2 +- examples/trl/ddpo.py | 2 +- .../controlnet/pipeline_controlnet.py | 2 +- .../pipeline_stable_diffusion.py | 2 +- ...peline_stable_diffusion_image_variation.py | 2 +- .../pipeline_stable_diffusion_inpaint.py | 4 ++-- ...eline_stable_diffusion_instruct_pix2pix.py | 2 +- .../pipeline_stable_diffusion_ldm3d.py | 2 +- .../pipeline_stable_diffusion_upscale.py | 2 +- .../baselines/CodeLlama_13b_Instruct_hf.json | 6 +++--- tests/baselines/LlamaGuard_7b.json | 4 ++-- tests/baselines/Qwen2_7B.json | 4 ++-- ...bert_large_uncased_whole_word_masking.json | 8 ++++---- tests/baselines/clip_roberta.json | 4 ++-- tests/baselines/distilbert_base_uncased.json | 12 +++++------ tests/baselines/gpt2.json | 6 +++--- tests/baselines/gpt2_xl.json | 6 +++--- tests/baselines/roberta_large.json | 8 ++++---- .../swin_base_patch4_window7_224_in22k.json | 6 +++--- tests/baselines/t5_small.json | 16 +++++++-------- tests/baselines/wav2vec2_base.json | 10 +++++----- tests/baselines/whisper_small.json | 8 ++++---- tests/test_diffusers.py | 17 ++++++++-------- tests/test_fsdp_examples.py | 2 +- tests/test_text_generation_example.py | 4 ++-- 34 files changed, 109 insertions(+), 96 deletions(-) diff --git a/README.md b/README.md index c52bd21225..9d13f8bef3 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,7 @@ You can generate images from prompts using Stable Diffusion on Intel Gaudi using + from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline -model_name = "runwayml/stable-diffusion-v1-5" +model_name = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_name, subfolder="scheduler") + scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") diff --git a/docs/source/quickstart.mdx b/docs/source/quickstart.mdx index 0690cd99c6..178eef7336 100644 --- a/docs/source/quickstart.mdx +++ b/docs/source/quickstart.mdx @@ -62,7 +62,7 @@ Here is how to use it and the differences with the 🤗 Diffusers library: + from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline -model_name = "runwayml/stable-diffusion-v1-5" +model_name = "CompVis/stable-diffusion-v1-4" - scheduler = DDIMScheduler.from_pretrained(model_name, subfolder="scheduler") + scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") diff --git a/docs/source/tutorials/stable_diffusion.mdx b/docs/source/tutorials/stable_diffusion.mdx index c662005a5f..6354533394 100644 --- a/docs/source/tutorials/stable_diffusion.mdx +++ b/docs/source/tutorials/stable_diffusion.mdx @@ -33,7 +33,7 @@ Finally, you will need to specify a [Gaudi configuration](https://huggingface.co ```python from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline -model_name = "runwayml/stable-diffusion-v1-5" +model_name = "CompVis/stable-diffusion-v1-4" scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") @@ -166,7 +166,7 @@ Here is how to do it: import torch pipeline = GaudiStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", + "CompVis/stable-diffusion-v1-4", scheduler=scheduler, use_habana=True, use_hpu_graphs=True, diff --git a/docs/source/usage_guides/accelerate_inference.mdx b/docs/source/usage_guides/accelerate_inference.mdx index be113daf32..1858cf399d 100644 --- a/docs/source/usage_guides/accelerate_inference.mdx +++ 
b/docs/source/usage_guides/accelerate_inference.mdx @@ -76,7 +76,7 @@ trainer = GaudiTrainer( ```python from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline -model_name = "runwayml/stable-diffusion-v1-5" +model_name = "CompVis/stable-diffusion-v1-4" scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") diff --git a/examples/stable-diffusion/README.md b/examples/stable-diffusion/README.md index 392d79338a..38ca7ae9d7 100644 --- a/examples/stable-diffusion/README.md +++ b/examples/stable-diffusion/README.md @@ -28,7 +28,7 @@ Stable Diffusion was proposed in [Stable Diffusion Announcement](https://stabili Here is how to generate images with one prompt: ```bash python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ --prompts "An image of a squirrel in Picasso style" \ --num_images_per_prompt 28 \ --batch_size 7 \ @@ -49,7 +49,7 @@ python text_to_image_generation.py \ Here is how to generate images with several prompts: ```bash python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ --prompts "An image of a squirrel in Picasso style" "A shiny flying horse taking off" \ --num_images_per_prompt 32 \ --batch_size 8 \ @@ -65,7 +65,7 @@ Here is how to generate images with two prompts on two HPUs: ```bash python ../gaudi_spawn.py \ --world_size 2 text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ --prompts "An image of a squirrel in Picasso style" "A shiny flying horse taking off" \ --num_images_per_prompt 20 \ --batch_size 4 \ @@ -290,7 +290,7 @@ python text_to_image_generation.py \ > For improved performance of the SD3 pipeline on Gaudi, it is recommended to configure the environment > by setting PT_HPU_MAX_COMPOUND_OP_SIZE to 1. - + ## ControlNet ControlNet was introduced in [Adding Conditional Control to Text-to-Image Diffusion Models ](https://huggingface.co/papers/2302.05543) by Lvmin Zhang and Maneesh Agrawala. 
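[Editor's note] Taken together, the documentation hunks above switch the default checkpoint to `CompVis/stable-diffusion-v1-4`. The consolidated snippet below shows the resulting setup for orientation only; the `gaudi_config` name and the example prompt are reused from other parts of this patch series and the exact keyword arguments mirror the surrounding docs rather than being prescribed by these hunks.

```python
from optimum.habana.diffusers import GaudiDDIMScheduler, GaudiStableDiffusionPipeline

model_name = "CompVis/stable-diffusion-v1-4"
scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler")
pipeline = GaudiStableDiffusionPipeline.from_pretrained(
    model_name,
    scheduler=scheduler,
    use_habana=True,
    use_hpu_graphs=True,
    gaudi_config="Habana/stable-diffusion",
)

# Generate four images in two batches of two.
images = pipeline(
    prompt="An image of a squirrel in Picasso style",
    num_images_per_prompt=4,
    batch_size=2,
).images
```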
@@ -300,7 +300,7 @@ Here is how to generate images conditioned by canny edge model: ```bash pip install -r requirements.txt python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ --prompts "futuristic-looking woman" \ --control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \ @@ -317,7 +317,7 @@ Here is how to generate images conditioned by canny edge model and with multiple ```bash pip install -r requirements.txt python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ --prompts "futuristic-looking woman" "a rusty robot" \ --control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \ @@ -335,7 +335,7 @@ Here is how to generate images conditioned by canny edge model and with two prom pip install -r requirements.txt python ../gaudi_spawn.py \ --world_size 2 text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ --controlnet_model_name_or_path lllyasviel/sd-controlnet-canny \ --prompts "futuristic-looking woman" "a rusty robot" \ --control_image https://hf.co/datasets/huggingface/documentation-images/resolve/main/diffusers/input_image_vermeer.png \ @@ -353,7 +353,7 @@ Here is how to generate images conditioned by open pose model: ```bash pip install -r requirements.txt python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ --controlnet_model_name_or_path lllyasviel/sd-controlnet-openpose \ --prompts "Chef in the kitchen" \ --control_image https://huggingface.co/datasets/hf-internal-testing/diffusers-images/resolve/main/sd_controlnet/pose.png \ @@ -394,7 +394,7 @@ please refer to [Hugging Face Diffusers doc](https://huggingface.co/docs/diffuse ### Stable Diffusion Inpainting ```bash python text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-inpainting \ + --model_name_or_path stabilityai/stable-diffusion-2-inpainting \ --base_image https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png \ --mask_image https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint_mask.png \ --prompts "concept art digital painting of an elven castle, inspired by lord of the rings, highly detailed, 8k" \ diff --git a/examples/stable-diffusion/image_to_image_generation.py b/examples/stable-diffusion/image_to_image_generation.py index b1ffae2101..820af8010f 100755 --- a/examples/stable-diffusion/image_to_image_generation.py +++ b/examples/stable-diffusion/image_to_image_generation.py @@ -51,7 +51,7 @@ def main(): parser.add_argument( "--model_name_or_path", - default="runwayml/stable-diffusion-v1-5", + default="CompVis/stable-diffusion-v1-4", type=str, help="Path to pre-trained model", ) diff --git a/examples/stable-diffusion/text_to_image_generation.py b/examples/stable-diffusion/text_to_image_generation.py index 1fa231f992..8caa659ca6 100755 --- a/examples/stable-diffusion/text_to_image_generation.py +++ b/examples/stable-diffusion/text_to_image_generation.py @@ -50,7 +50,7 @@ def main(): 
parser.add_argument( "--model_name_or_path", - default="runwayml/stable-diffusion-v1-5", + default="CompVis/stable-diffusion-v1-4", type=str, help="Path to pre-trained model", ) @@ -481,7 +481,7 @@ def main(): # SD LDM3D use-case from optimum.habana.diffusers import GaudiStableDiffusionLDM3DPipeline as GaudiStableDiffusionPipeline - if args.model_name_or_path == "runwayml/stable-diffusion-v1-5": + if args.model_name_or_path == "CompVis/stable-diffusion-v1-4": args.model_name_or_path = "Intel/ldm3d-4c" pipeline = GaudiStableDiffusionPipeline.from_pretrained( args.model_name_or_path, diff --git a/examples/stable-diffusion/training/README.md b/examples/stable-diffusion/training/README.md index d686b30f40..28e2d4e8c0 100644 --- a/examples/stable-diffusion/training/README.md +++ b/examples/stable-diffusion/training/README.md @@ -43,7 +43,7 @@ Now we can launch the training using: ```bash python textual_inversion.py \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ + --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ --train_data_dir ./cat \ --learnable_property object \ --placeholder_token "" \ @@ -82,7 +82,7 @@ Then proceed to training with command: ```bash python train_controlnet.py \ - --pretrained_model_name_or_path=runwayml/stable-diffusion-v1-5\ + --pretrained_model_name_or_path=CompVis/stable-diffusion-v1-4\ --output_dir=/tmp/stable_diffusion1_5 \ --dataset_name=fusing/fill50k \ --resolution=512 \ @@ -92,7 +92,8 @@ python train_controlnet.py \ --train_batch_size=4 \ --throughput_warmup_steps=3 \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --trust_remote_code ``` ### Multi-card Run @@ -100,7 +101,7 @@ python train_controlnet.py \ You can run these fine-tuning scripts in a distributed fashion as follows: ```bash python ../../gaudi_spawn.py --use_mpi --world_size 8 train_controlnet.py \ - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 \ + --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 \ --output_dir=/tmp/stable_diffusion1_5 \ --dataset_name=fusing/fill50k \ --resolution=512 \ @@ -110,7 +111,8 @@ python ../../gaudi_spawn.py --use_mpi --world_size 8 train_controlnet.py \ --train_batch_size=4 \ --throughput_warmup_steps 3 \ --use_hpu_graphs \ - --bf16 + --bf16 \ + --trust_remote_code ``` @@ -124,7 +126,7 @@ from diffusers.utils import load_image import torch from optimum.habana.diffusers import GaudiStableDiffusionControlNetPipeline -base_model_path = "runwayml/stable-diffusion-v1-5" +base_model_path = "CompVis/stable-diffusion-v1-4" controlnet_path = "/tmp/stable_diffusion1_5" controlnet = ControlNetModel.from_pretrained(controlnet_path, torch_dtype=torch.bfloat16) @@ -285,7 +287,7 @@ snapshot_download( And launch the multi-card training using: ```bash -export MODEL_NAME="runwayml/stable-diffusion-v1-5" +export MODEL_NAME="CompVis/stable-diffusion-v1-4" export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="out" @@ -325,7 +327,7 @@ use *1e-4* instead of the usual *5e-6*.___** Launch the multi-card training using: ```bash -export MODEL_NAME="runwayml/stable-diffusion-v1-5" +export MODEL_NAME="CompVis/stable-diffusion-v1-4" export INSTANCE_DIR="dog" export CLASS_DIR="path-to-class-images" export OUTPUT_DIR="out" @@ -369,7 +371,7 @@ You could use text_to_image_generation.py to generate picture using the peft ada ```bash python ../text_to_image_generation.py \ - --model_name_or_path runwayml/stable-diffusion-v1-5 \ + --model_name_or_path CompVis/stable-diffusion-v1-4 \ --prompts "a sks dog" \ 
--num_images_per_prompt 5 \ --batch_size 1 \ diff --git a/examples/stable-diffusion/training/train_controlnet.py b/examples/stable-diffusion/training/train_controlnet.py index 696b54baca..e676ae6ddf 100644 --- a/examples/stable-diffusion/training/train_controlnet.py +++ b/examples/stable-diffusion/training/train_controlnet.py @@ -567,6 +567,15 @@ def parse_args(input_args=None): ), ) parser.add_argument("--use_hpu_graphs", action="store_true", help="Use HPU graphs on HPU.") + parser.add_argument( + "--trust_remote_code", + action="store_true", + help=( + "Whether to trust the execution of code from datasets defined on the Hub." + " This option should only be set to `True` for repositories you trust and in which you have read the" + " code, as it will execute code present on the Hub on your local machine." + ), + ) if input_args is not None: args = parser.parse_args(input_args) @@ -620,6 +629,7 @@ def make_train_dataset(args, tokenizer, accelerator): args.dataset_name, args.dataset_config_name, cache_dir=args.cache_dir, + trust_remote_code=args.trust_remote_code, ) else: if args.train_data_dir is not None: diff --git a/examples/trl/README.md b/examples/trl/README.md index 3649a81c13..ceaf1046d4 100644 --- a/examples/trl/README.md +++ b/examples/trl/README.md @@ -281,7 +281,7 @@ from optimum.habana import GaudiConfig from optimum.habana.trl import GaudiDefaultDDPOStableDiffusionPipeline gaudi_config = GaudiConfig.from_pretrained("Habana/stable-diffusion") -model_id = "runwayml/stable-diffusion-v1-5" +model_id = "CompVis/stable-diffusion-v1-4" lora_model_id = "ddpo-finetuned-stable-diffusion" pipeline = GaudiDefaultDDPOStableDiffusionPipeline( model_id, diff --git a/examples/trl/ddpo.py b/examples/trl/ddpo.py index c493c7165d..46caf64c49 100644 --- a/examples/trl/ddpo.py +++ b/examples/trl/ddpo.py @@ -57,7 +57,7 @@ class ScriptArguments: default=None, metadata={"help": "Hugging Face token. If None, token is retrieved from env or cache."} ) pretrained_model: str = field( - default="runwayml/stable-diffusion-v1-5", metadata={"help": "the pretrained model to use"} + default="CompVis/stable-diffusion-v1-4", metadata={"help": "the pretrained model to use"} ) pretrained_revision: str = field(default="main", metadata={"help": "the pretrained model revision to use"}) hf_hub_model_id: str = field( diff --git a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py index bb1f6ba354..b4566a0241 100644 --- a/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py +++ b/optimum/habana/diffusers/pipelines/controlnet/pipeline_controlnet.py @@ -68,7 +68,7 @@ class GaudiStableDiffusionControlNetPipeline(GaudiDiffusionPipeline, StableDiffu [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + Please refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for more details about a model's potential harms. feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py index 49d221a7a4..f0a7febc5f 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py @@ -117,7 +117,7 @@ class GaudiStableDiffusionPipeline(GaudiDiffusionPipeline, StableDiffusionPipeli [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + Please refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for more details about a model's potential harms. feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py index e681d3c0be..b2a419389b 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_image_variation.py @@ -57,7 +57,7 @@ class GaudiStableDiffusionImageVariationPipeline(GaudiDiffusionPipeline, StableD [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + Please refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for more details about a model's potential harms. feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py index ffe80d4a40..2884831732 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_inpaint.py @@ -68,7 +68,7 @@ class GaudiStableDiffusionInpaintPipeline(GaudiDiffusionPipeline, StableDiffusio [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + Please refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for more details about a model's potential harms. feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. 
@@ -335,7 +335,7 @@ def __call__( >>> mask_image = download_image(mask_url).resize((512, 512)) >>> pipe = StableDiffusionInpaintPipeline.from_pretrained( - ... "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16 + ... "stabilityai/stable-diffusion-2-inpainting", torch_dtype=torch.float16 ... ) >>> pipe = pipe.to("cuda") diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py index c2600e4dfa..0f8eb39f92 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_instruct_pix2pix.py @@ -58,7 +58,7 @@ class GaudiStableDiffusionInstructPix2PixPipeline(GaudiDiffusionPipeline, Stable [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + Please refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for more details about a model's potential harms. feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py index 9de11b3ae8..704c3c1cf1 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_ldm3d.py @@ -70,7 +70,7 @@ class GaudiStableDiffusionLDM3DPipeline(GaudiDiffusionPipeline, StableDiffusionL [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + Please refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for more details about a model's potential harms. feature_extractor ([`~transformers.CLIPImageProcessor`]): A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py index 79f80e4b67..58f2f977a9 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion_upscale.py @@ -77,7 +77,7 @@ class GaudiStableDiffusionUpscalePipeline(GaudiDiffusionPipeline, StableDiffusio [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. safety_checker ([`StableDiffusionSafetyChecker`]): Classification module that estimates whether generated images could be considered offensive or harmful. - Please, refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for details. 
+ Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details. feature_extractor ([`CLIPImageProcessor`]): Model that extracts features from generated images to be used as inputs for the `safety_checker`. use_habana (bool, defaults to `False`): diff --git a/tests/baselines/CodeLlama_13b_Instruct_hf.json b/tests/baselines/CodeLlama_13b_Instruct_hf.json index 93e77ee21c..c9a781ddea 100644 --- a/tests/baselines/CodeLlama_13b_Instruct_hf.json +++ b/tests/baselines/CodeLlama_13b_Instruct_hf.json @@ -7,9 +7,9 @@ "deepspeed": { "learning_rate": 5e-5, "train_batch_size": 48, - "train_runtime": 371.0852, - "train_samples_per_second": 19.243, - "perplexity": 6.982, + "train_runtime": 438.536, + "train_samples_per_second": 18.663, + "perplexity": 6.87936780659991, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", diff --git a/tests/baselines/LlamaGuard_7b.json b/tests/baselines/LlamaGuard_7b.json index a94b1988d1..948be4088c 100644 --- a/tests/baselines/LlamaGuard_7b.json +++ b/tests/baselines/LlamaGuard_7b.json @@ -7,9 +7,9 @@ "deepspeed": { "learning_rate": 3e-5, "train_batch_size": 32, - "eval_f1": 0.8726, + "eval_f1": 0.8873483535528596, "train_runtime": 55.8644, - "train_samples_per_second": 349.869, + "train_samples_per_second": 342.169, "extra_arguments": [ "--max_seq_length 128", "--add_pad_token True", diff --git a/tests/baselines/Qwen2_7B.json b/tests/baselines/Qwen2_7B.json index 844f57b729..d98abf5e7d 100644 --- a/tests/baselines/Qwen2_7B.json +++ b/tests/baselines/Qwen2_7B.json @@ -44,8 +44,8 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 2, - "train_runtime": 360, - "train_samples_per_second": 8.5, + "train_runtime": 423.995, + "train_samples_per_second": 7.342, "extra_arguments": [ "--bf16 True", "--subset ''", diff --git a/tests/baselines/bert_large_uncased_whole_word_masking.json b/tests/baselines/bert_large_uncased_whole_word_masking.json index 6b6c4e024e..37948b9746 100755 --- a/tests/baselines/bert_large_uncased_whole_word_masking.json +++ b/tests/baselines/bert_large_uncased_whole_word_masking.json @@ -104,9 +104,9 @@ "multi_card": { "learning_rate": 3e-5, "train_batch_size": 40, - "eval_f1": 0.8758, - "train_runtime": 41.4282, - "train_samples_per_second": 2771.405, + "eval_f1": 0.8452579034941764, + "train_runtime": 31.445, + "train_samples_per_second": 2845.068, "extra_arguments": [ "--max_seq_length 128", "--use_hpu_graphs_for_inference" @@ -115,4 +115,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/baselines/clip_roberta.json b/tests/baselines/clip_roberta.json index 18d80762cc..a712dfb792 100755 --- a/tests/baselines/clip_roberta.json +++ b/tests/baselines/clip_roberta.json @@ -35,8 +35,8 @@ "multi_card": { "learning_rate": 5e-5, "train_batch_size": 512, - "train_runtime": 63.36, - "train_samples_per_second": 18434.069, + "train_runtime": 62.3694, + "train_samples_per_second": 16572.31, "extra_arguments": [ "--data_dir $PWD/", "--dataset_config_name 2017", diff --git a/tests/baselines/distilbert_base_uncased.json b/tests/baselines/distilbert_base_uncased.json index 00482ebeea..8678342e7b 100644 --- a/tests/baselines/distilbert_base_uncased.json +++ b/tests/baselines/distilbert_base_uncased.json @@ -37,9 +37,9 @@ "single_card": { "learning_rate": 2e-4, "train_batch_size": 64, - "eval_f1": 84.5418, - "train_runtime": 117.8054, - "train_samples_per_second": 1547.185, + "eval_f1": 84.4002097183518, + "train_runtime": 136.3135, + "train_samples_per_second": 1329.313, 
"extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" @@ -48,9 +48,9 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 64, - "eval_f1": 83.2233, - "train_runtime": 24.0441, - "train_samples_per_second": 11144.651, + "eval_f1": 83.15565271833093, + "train_runtime": 25.9614, + "train_samples_per_second": 9259.038, "extra_arguments": [ "--max_seq_length 384", "--use_hpu_graphs_for_inference" diff --git a/tests/baselines/gpt2.json b/tests/baselines/gpt2.json index 889bdbd3d4..355d691492 100644 --- a/tests/baselines/gpt2.json +++ b/tests/baselines/gpt2.json @@ -50,9 +50,9 @@ "multi_card": { "learning_rate": 8e-4, "train_batch_size": 16, - "perplexity": 21.7858, - "train_runtime": 23.8993, - "train_samples_per_second": 939.24, + "perplexity": 21.786644821433327, + "train_runtime": 24.8822, + "train_samples_per_second": 886.689, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference" diff --git a/tests/baselines/gpt2_xl.json b/tests/baselines/gpt2_xl.json index a44e96f44a..68651d16e3 100644 --- a/tests/baselines/gpt2_xl.json +++ b/tests/baselines/gpt2_xl.json @@ -27,9 +27,9 @@ "deepspeed": { "learning_rate": 4e-4, "train_batch_size": 16, - "perplexity": 13.23775, - "train_runtime": 190.696, - "train_samples_per_second": 89.877, + "perplexity": 13.237754028004865, + "train_runtime": 206.5775, + "train_samples_per_second": 95.539, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing", diff --git a/tests/baselines/roberta_large.json b/tests/baselines/roberta_large.json index 0fc813ab9e..8b9037b32b 100755 --- a/tests/baselines/roberta_large.json +++ b/tests/baselines/roberta_large.json @@ -83,9 +83,9 @@ "multi_card": { "learning_rate": 7e-5, "train_batch_size": 16, - "perplexity": 2.829, - "train_runtime": 25.6323, - "train_samples_per_second": 1183.796, + "perplexity": 2.829522488584474, + "train_runtime": 22.7101, + "train_samples_per_second": 1056.875, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--use_hpu_graphs_for_inference", @@ -95,4 +95,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/baselines/swin_base_patch4_window7_224_in22k.json b/tests/baselines/swin_base_patch4_window7_224_in22k.json index b6c09b6dec..84c6d59c9f 100644 --- a/tests/baselines/swin_base_patch4_window7_224_in22k.json +++ b/tests/baselines/swin_base_patch4_window7_224_in22k.json @@ -49,9 +49,9 @@ "single_card": { "learning_rate": 6e-5, "train_batch_size": 160, - "eval_accuracy": 0.9852, - "train_runtime": 73.5918, - "train_samples_per_second": 957.491, + "eval_accuracy": 0.9850666666666666, + "train_runtime": 72.7832, + "train_samples_per_second": 891.862, "extra_arguments": [ "--remove_unused_columns False", "--image_column_name img", diff --git a/tests/baselines/t5_small.json b/tests/baselines/t5_small.json index 731be7e3f5..2b206718e3 100644 --- a/tests/baselines/t5_small.json +++ b/tests/baselines/t5_small.json @@ -57,10 +57,10 @@ "multi_card": { "learning_rate": 2e-4, "train_batch_size": 32, - "eval_rougeLsum": 38.5648, - "train_runtime": 164.962, - "train_samples_per_second": 1912.578, - "eval_samples_per_second": 116.48, + "eval_rougeLsum": 38.5977, + "train_runtime": 162.079, + "train_samples_per_second": 1922.144, + "eval_samples_per_second": 96.797, "extra_arguments": [ "--dataset_config \"3.0.0\"", "--source_prefix \"summarize: \"", @@ -80,9 +80,9 @@ "multi_card": { "learning_rate": 2e-3, "train_batch_size": 64, - "eval_f1": 65.7157, - "train_runtime": 49.5816, - 
"train_samples_per_second": 6353.351, + "eval_f1": 65.83485191703365, + "train_runtime": 53.8295, + "train_samples_per_second": 5686.229, "extra_arguments": [ "--context_column context", "--question_column question", @@ -143,4 +143,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/baselines/wav2vec2_base.json b/tests/baselines/wav2vec2_base.json index b920b27539..b187e02d51 100644 --- a/tests/baselines/wav2vec2_base.json +++ b/tests/baselines/wav2vec2_base.json @@ -36,10 +36,10 @@ "multi_card": { "learning_rate": 3e-4, "train_batch_size": 32, - "eval_accuracy": 0.7311, - "train_runtime": 149.8893, - "train_samples_per_second": 3048.207, - "eval_samples_per_second": 631.601, + "eval_accuracy": 0.7228, + "train_runtime": 63.4079, + "train_samples_per_second": 2975.844, + "eval_samples_per_second": 3640.021, "extra_arguments": [ "--audio_column_name audio", "--label_column_name language", @@ -57,4 +57,4 @@ } } } -} +} \ No newline at end of file diff --git a/tests/baselines/whisper_small.json b/tests/baselines/whisper_small.json index 5b44467f71..4566b48514 100644 --- a/tests/baselines/whisper_small.json +++ b/tests/baselines/whisper_small.json @@ -41,10 +41,10 @@ "multi_card": { "learning_rate": 8e-5, "train_batch_size": 32, - "eval_wer": 0.8477, - "train_runtime": 287.0947, - "train_samples_per_second": 307.526, - "eval_samples_per_second": 12.069, + "eval_wer": 1.3541597337770384, + "train_runtime": 315.288, + "train_samples_per_second": 276.962, + "eval_samples_per_second": 20.138, "extra_arguments": [ "--dataset_config_name hi", "--language hindi", diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py index 35c6493697..3015dc21db 100755 --- a/tests/test_diffusers.py +++ b/tests/test_diffusers.py @@ -634,7 +634,7 @@ def test_no_throughput_regression_bf16(self): ] num_images_per_prompt = 11 batch_size = 4 - model_name = "runwayml/stable-diffusion-v1-5" + model_name = "CompVis/stable-diffusion-v1-4" scheduler = GaudiDDIMScheduler.from_pretrained(model_name, subfolder="scheduler") pipeline = GaudiStableDiffusionPipeline.from_pretrained( @@ -834,7 +834,7 @@ def test_textual_inversion(self): "--world_size", "8", f"{path_to_script}", - "--pretrained_model_name_or_path runwayml/stable-diffusion-v1-5", + "--pretrained_model_name_or_path CompVis/stable-diffusion-v1-4", f"--train_data_dir {data_dir}", '--learnable_property "object"', '--placeholder_token ""', @@ -2375,7 +2375,7 @@ def test_train_controlnet(self): --use_mpi --world_size 8 {path_to_script} - --pretrained_model_name_or_path runwayml/stable-diffusion-v1-5 + --pretrained_model_name_or_path CompVis/stable-diffusion-v1-4 --dataset_name fusing/fill50k --resolution 512 --train_batch_size 4 @@ -2389,6 +2389,7 @@ def test_train_controlnet(self): --bf16 --num_train_epochs 1 --output_dir {tmpdir} + --trust_remote_code """.split() # Run train_controlnet.y @@ -2407,7 +2408,7 @@ def test_train_controlnet(self): # Assess generated image controlnet = ControlNetModel.from_pretrained(tmpdir, torch_dtype=torch.bfloat16) pipe = GaudiStableDiffusionControlNetPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", + "CompVis/stable-diffusion-v1-4", controlnet=controlnet, torch_dtype=torch.bfloat16, use_habana=True, @@ -3349,7 +3350,7 @@ def test_deterministic_image_generation(self): test_args = f""" python3 {path_to_script} - --model_name_or_path runwayml/stable-diffusion-v1-5 + --model_name_or_path CompVis/stable-diffusion-v1-4 --num_images_per_prompt 20 --batch_size 4 --image_save_dir /tmp/stable_diffusion_images @@ -3371,7 
+3372,7 @@ def test_deterministic_image_generation(self): def test_deterministic_image_generation_no_throughput_regression_bf16(self): kwargs = {"timestep_spacing": "linspace"} scheduler = GaudiDDIMScheduler.from_pretrained( - "runwayml/stable-diffusion-v1-5", **kwargs, subfolder="scheduler" + "CompVis/stable-diffusion-v1-4", **kwargs, subfolder="scheduler" ) kwargs = { @@ -3382,7 +3383,7 @@ def test_deterministic_image_generation_no_throughput_regression_bf16(self): } pipeline = GaudiStableDiffusionPipeline.from_pretrained( - "runwayml/stable-diffusion-v1-5", + "CompVis/stable-diffusion-v1-4", **kwargs, ) @@ -5040,7 +5041,7 @@ def test_stable_diffusion_inpaint_no_throughput_regression(self): ] num_images_per_prompt = 10 num_inference_steps = 10 - model_name = "runwayml/stable-diffusion-inpainting" + model_name = "stabilityai/stable-diffusion-2-inpainting" init_kwargs = { "use_habana": True, diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py index 7d8128b765..ea984a6374 100644 --- a/tests/test_fsdp_examples.py +++ b/tests/test_fsdp_examples.py @@ -17,7 +17,7 @@ ( "bert-base-uncased", "Habana/bert-base-uncased", - 3516.322, + 3253.917, 85.5503, "question-answering", 24, diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index 9c4e983576..d92eb05827 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -38,9 +38,9 @@ ("codellama/CodeLlama-34b-hf", 1, True, 32.644), ("bigcode/starcoder2-3b", 1, False, 261.07213776344133), ("adept/persimmon-8b-base", 4, False, 366.73968820698406), - ("Qwen/Qwen1.5-7B", 4, False, 518.894516133132), + ("Qwen/Qwen1.5-7B", 4, False, 490.8621617893209), ("google/gemma-7b", 1, False, 109.70751574382221), - ("state-spaces/mamba-130m-hf", 1536, False, 8600), + ("state-spaces/mamba-130m-hf", 1536, False, 5385.511100161605), ("Deci/DeciLM-7B", 1, False, 120), ], "fp8": [ From f70d5ad76947c2bb070e361efd371106e5fabe46 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 9 Sep 2024 10:18:44 +0000 Subject: [PATCH 32/41] Update Gaudi2 CI workflow --- .github/workflows/slow_tests_gaudi2.yml | 28 +++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml index 1a8ee5b909..a737cc7d3e 100644 --- a/.github/workflows/slow_tests_gaudi2.yml +++ b/.github/workflows/slow_tests_gaudi2.yml @@ -21,12 +21,15 @@ jobs: - name: Run tests run: | docker run \ + --rm \ -v $PWD:/root/workspace \ + -v /scratch-1:/data \ --workdir=/root/workspace \ --runtime=habana \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ -e GAUDI2_CI=1 \ + -e HF_HOME=/data \ --cap-add=sys_nice \ --net=host \ --ipc=host \ @@ -47,17 +50,20 @@ jobs: - name: Run tests run: | docker run \ + --rm \ -v $PWD:/root/workspace \ + -v /scratch-1:/data \ --workdir=/root/workspace \ --runtime=habana \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ -e GAUDI2_CI=1 \ + -e HF_HOME=/data \ --cap-add=sys_nice \ --net=host \ --ipc=host \ vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest \ - /bin/bash tests/ci/slow_tests_deepspeed.sh + /bin/bash pip install huggingface_hub && huggingface-cli --token ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} && tests/ci/slow_tests_deepspeed.sh fsdp: name: Test FSDP models if: ${{ !cancelled() && (success() || failure()) }} @@ -73,12 +79,15 @@ jobs: 
- name: Run tests run: | docker run \ + --rm \ -v $PWD:/root/workspace \ + -v /scratch-1:/data \ --workdir=/root/workspace \ --runtime=habana \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ -e GAUDI2_CI=1 \ + -e HF_HOME=/data \ --cap-add=sys_nice \ --net=host \ --ipc=host \ @@ -99,12 +108,15 @@ jobs: - name: Run tests run: | docker run \ + --rm \ -v $PWD:/root/workspace \ + -v /scratch-1:/data \ --workdir=/root/workspace \ --runtime=habana \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ -e GAUDI2_CI=1 \ + -e HF_HOME=/data \ --cap-add=sys_nice \ --net=host \ --ipc=host \ @@ -126,13 +138,16 @@ jobs: - name: Run tests run: | docker run \ + --rm \ -v $PWD:/root/workspace \ + -v /scratch-1:/data \ --workdir=/root/workspace \ --runtime=habana \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ -e GAUDI2_CI=1 \ -e RUN_ALBERT_XXL_1X=1 \ + -e HF_HOME=/data \ --cap-add=sys_nice \ --net=host \ --ipc=host \ @@ -155,12 +170,15 @@ jobs: - name: Run tests run: | docker run \ + --rm \ -v $PWD:/root/workspace \ + -v /scratch-1:/data \ --workdir=/root/workspace \ --runtime=habana \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ -e GAUDI2_CI=1 \ + -e HF_HOME=/data \ --cap-add=sys_nice \ --net=host \ --ipc=host \ @@ -181,12 +199,15 @@ jobs: - name: Run tests run: | docker run \ + --rm \ -v $PWD:/root/workspace \ + -v /scratch-1:/data \ --workdir=/root/workspace \ --runtime=habana \ -e HABANA_VISIBLE_DEVICES=all \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ -e GAUDI2_CI=1 \ + -e HF_HOME=/data \ --cap-add=sys_nice \ --net=host \ --ipc=host \ @@ -215,12 +236,15 @@ jobs: - name: Run tests run: | docker run \ + --rm \ -v $PWD:/root/workspace \ + -v /scratch-1:/data \ --workdir=/root/workspace \ --runtime=habana \ -e HABANA_VISIBLE_DEVICES=all \ - -e GAUDI2_CI=1 \ -e OMPI_MCA_btl_vader_single_copy_mechanism=none \ + -e GAUDI2_CI=1 \ + -e HF_HOME=/data \ --cap-add=sys_nice \ --net=host \ --ipc=host \ From 05683d32137e4d2b2517efa5d02fae8d57b3cc6b Mon Sep 17 00:00:00 2001 From: Neelesh Gokhale Date: Mon, 9 Sep 2024 18:39:02 +0530 Subject: [PATCH 33/41] Fix Sentence Transformer HPU graphs for training with PEFT model (#1320) --- optimum/habana/sentence_transformers/st_gaudi_trainer.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/optimum/habana/sentence_transformers/st_gaudi_trainer.py b/optimum/habana/sentence_transformers/st_gaudi_trainer.py index a443ad4f2c..3a17688b3b 100644 --- a/optimum/habana/sentence_transformers/st_gaudi_trainer.py +++ b/optimum/habana/sentence_transformers/st_gaudi_trainer.py @@ -46,6 +46,8 @@ from transformers.trainer_utils import EvalLoopOutput from transformers.training_args import ParallelMode +from optimum.habana.transformers.trainer import _is_peft_model + from ..transformers import GaudiConfig, GaudiTrainer from .st_gaudi_training_args import SentenceTransformerGaudiTrainingArguments @@ -224,7 +226,11 @@ def _wrap_model(self, model, training=True, dataloader=None): if self.args.use_hpu_graphs_for_training: import habana_frameworks.torch as ht - ht.hpu.ModuleCacher()(model=model, allow_unused_input=True, inplace=True) + if _is_peft_model(model): + base_model = model.get_base_model() + ht.hpu.ModuleCacher()(model=base_model, allow_unused_input=True, inplace=True) + else: + ht.hpu.ModuleCacher()(model=model, allow_unused_input=True, inplace=True) return model From 1cb67e409169c37a50b9705a5a7b6462b5db700e Mon Sep 17 
00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Mon, 9 Sep 2024 13:34:40 +0000 Subject: [PATCH 34/41] Fix DeepSpeed command in Gaudi2 CI workflow --- .github/workflows/slow_tests_gaudi2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml index a737cc7d3e..a08b8fe17b 100644 --- a/.github/workflows/slow_tests_gaudi2.yml +++ b/.github/workflows/slow_tests_gaudi2.yml @@ -63,7 +63,7 @@ jobs: --net=host \ --ipc=host \ vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest \ - /bin/bash pip install huggingface_hub && huggingface-cli --token ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} && tests/ci/slow_tests_deepspeed.sh + pip install huggingface_hub && huggingface-cli login --token ${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }} && /bin/bash tests/ci/slow_tests_deepspeed.sh fsdp: name: Test FSDP models if: ${{ !cancelled() && (success() || failure()) }} From 45c114f82d2eea10745f650a684a16075f6075f6 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 10 Sep 2024 07:53:16 +0000 Subject: [PATCH 35/41] Fix Gaudi2 CI workflow --- .github/workflows/slow_tests_gaudi2.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml index a08b8fe17b..7de58a9341 100644 --- a/.github/workflows/slow_tests_gaudi2.yml +++ b/.github/workflows/slow_tests_gaudi2.yml @@ -134,7 +134,7 @@ jobs: uses: actions/checkout@v2 - name: Pull image run: | - docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest:latest + docker pull vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest - name: Run tests run: | docker run \ From fa1fbc5f81ab1012263d36c4b73201db8ecd05e5 Mon Sep 17 00:00:00 2001 From: Soila Kavulya Date: Tue, 10 Sep 2024 02:06:11 -0700 Subject: [PATCH 36/41] Fix ZeroDivisionError in constrained beam search with static shapes (#1317) --- optimum/habana/transformers/generation/utils.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py index d333986679..d4baf44c06 100755 --- a/optimum/habana/transformers/generation/utils.py +++ b/optimum/habana/transformers/generation/utils.py @@ -3129,7 +3129,11 @@ def _constrained_beam_search( this_peer_finished = False - decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder + # record the prompt length of decoder + if token_idx is not None: + decoder_prompt_len = cur_len + else: + decoder_prompt_len = input_ids.shape[-1] hb_profer = HabanaProfile( warmup=profiling_warmup_steps, active=profiling_steps, record_shapes=profiling_record_shapes From feb6545202ef6812445bbca680eb3ad50ffe5aa8 Mon Sep 17 00:00:00 2001 From: Jimin Ha Date: Wed, 11 Sep 2024 06:01:24 -0700 Subject: [PATCH 37/41] Update esmfold model not to use param_buffer_assignment (#1324) --- examples/protein-folding/run_esmfold.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/examples/protein-folding/run_esmfold.py b/examples/protein-folding/run_esmfold.py index 4fa8d7a441..6941e6e5c1 100644 --- a/examples/protein-folding/run_esmfold.py +++ b/examples/protein-folding/run_esmfold.py @@ -82,6 +82,9 @@ def convert_outputs_to_pdb(outputs): test_protein = 
"MGAGASAEEKHSRELEKKLKEDAEKDARTVKLLLLGAGESGKSTIVKQMKIIHQDGYSLEECLEFIAIIYGNTLQSILAIVRAMTTLNIQYGDSARQDDARKLMHMADTIEEGTMPKEMSDIIQRLWKDSGIQACFERASEYQLNDSAGYYLSDLERLVTPGYVPTEQDVLRSRVKTTGIIETQFSFKDLNFRMFDVGGQRSERKKWIHCFEGVTCIIFIAALSAYDMVLVEDDEVNRMHESLHLFNSICNHRYFATTSIVLFLNKKDVFFEKIKKAHLSICFPDYDGPNTYEDAGNYIKVQFLELNMRRDVKEIYSHMTCATDTQNVKFVFDAVTDIIIKENLKDCGLF" # len = 350 tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1") +# Set _supports_param_buffer_assignment to False since facebook/esmfold_v1's encoder weights are float16. +# Without this fix, we will have the weights loaded with float16 on gaudi2,gaudi3 and runtime error on gaudi1 +EsmForProteinFolding._supports_param_buffer_assignment = False model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1", low_cpu_mem_usage=False) model = model.to(device) From b2c29b1e54de02f2b19f49851a661c6481fa70c9 Mon Sep 17 00:00:00 2001 From: Yeonsil Yoon Date: Wed, 11 Sep 2024 14:30:00 -0700 Subject: [PATCH 38/41] Falcon inference crash fix for falcon-40b model (#1161) --- .../models/falcon/modeling_falcon.py | 86 +++++++++++-------- 1 file changed, 49 insertions(+), 37 deletions(-) diff --git a/optimum/habana/transformers/models/falcon/modeling_falcon.py b/optimum/habana/transformers/models/falcon/modeling_falcon.py index a7a0c0e920..52fc649948 100644 --- a/optimum/habana/transformers/models/falcon/modeling_falcon.py +++ b/optimum/habana/transformers/models/falcon/modeling_falcon.py @@ -87,6 +87,40 @@ def gaudi_falcon_linear_forward(self, input: torch.Tensor) -> torch.Tensor: return hidden_states +def repeat_kv( + query_states: torch.Tensor, + key_states: torch.Tensor, + value_states: torch.Tensor, + attention_mask: torch.Tensor, + n_rep: int, +): + """ + Copied from repeat_kv: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py + The only differences are: + - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. + - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. 
+ The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) + The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) + """ + batch, num_key_value_heads, kv_len, head_dim = key_states.shape + if n_rep == 1 or num_key_value_heads == 1: + return query_states, key_states, value_states, attention_mask + + new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) + key_states = key_states.reshape(new_kv_shape) + value_states = value_states.reshape(new_kv_shape) + + batch, _, q_len, head_dim = query_states.shape + new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) + query_states = query_states.reshape(new_q_shape) + + if attention_mask is not None: + # Add groups dim and set to 1 + attention_mask = attention_mask.unsqueeze(1) + + return query_states, key_states, value_states, attention_mask + + # FusedScaledDotProductAttention class ModuleFusedSDPA(torch.nn.Module): def __init__(self, fusedSDPA): @@ -123,40 +157,6 @@ def __init__(self, config: FalconConfig): self.softmax = Softmax() self.num_key_value_groups = config.num_attention_heads // config.num_kv_heads - def repeat_kv( - self, - query_states: torch.Tensor, - key_states: torch.Tensor, - value_states: torch.Tensor, - attention_mask: torch.Tensor, - n_rep: int, - ): - """ - Copied from repeat_kv: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py - The only differences are: - - Append num_key_value_heads == 1 check as kv states can be broadcasted during matmuls so need to expand and reshape them. - - Add new args query_states, key_states, value_states and attention_mask and update the logic for expansion. - The query states go from (batch, num_heads, seqlen, head_dim) to (batch, num_key_value_heads, n_rep, seqlen, head_dim) - The key/value states go from (batch, num_key_value_heads, seqlen, head_dim) to (batch, num_key_value_heads, 1, seqlen, head_dim) - """ - batch, num_key_value_heads, kv_len, head_dim = key_states.shape - if n_rep == 1 or num_key_value_heads == 1: - return query_states, key_states, value_states, attention_mask - - new_kv_shape = (batch, num_key_value_heads, 1, kv_len, head_dim) - key_states = key_states.reshape(new_kv_shape) - value_states = value_states.reshape(new_kv_shape) - - batch, _, q_len, head_dim = query_states.shape - new_q_shape = (batch, num_key_value_heads, n_rep, q_len, head_dim) - query_states = query_states.reshape(new_q_shape) - - if attention_mask is not None: - # Add groups dim and set to 1 - attention_mask = attention_mask.unsqueeze(1) - - return query_states, key_states, value_states, attention_mask - def forward(self, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor: L, S = query.size(-2), key.size(-2) scale_factor = 1 / math.sqrt(self.head_dim) @@ -173,7 +173,7 @@ def forward(self, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=Fa if attn_mask.dtype == torch.bool: attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf")) - query, key, value, attn_mask = self.repeat_kv(query, key, value, attn_mask, self.num_key_value_groups) + query, key, value, attn_mask = repeat_kv(query, key, value, attn_mask, self.num_key_value_groups) attn_weight = self.bmm1(query, key.transpose(-2, -1)) attn_weight += attn_mask @@ -262,7 +262,7 @@ def __init__(self, config: FalconConfig): # TODO, Does this affect memory usage? 
if self.is_fp8: self.fused_scaled_dot_product_attention = ModuleFusedSDPA(FusedSDPA) - self.unfused_scaled_dot_product_attention = ScaledDotProductAttention(config) + self.unfused_scaled_dot_product_attention = ScaledDotProductAttention(config) self.k_cache = KVCache() self.v_cache = KVCache() @@ -353,7 +353,11 @@ def pre_attn_forward( train_with_flash_attention = self.training and self._use_sdpa and not output_attentions and head_mask is None (query_layer, key_layer, value_layer) = self._split_heads( - fused_qkv, not use_flash_attention and not self.is_fp8 and not train_with_flash_attention + fused_qkv, + not use_flash_attention + and not self.is_fp8 + and not train_with_flash_attention + and not (self.config.num_kv_heads == 8), ) batch_size, query_length, _, _ = query_layer.shape @@ -462,6 +466,14 @@ def pre_attn_forward( query_layer, key_layer, value_layer, attention_mask, 0.0, is_causal=False ) else: + if query_layer.shape != key_layer.shape: + query_layer, key_layer, value_layer, attention_mask = repeat_kv( + query_layer, + key_layer, + value_layer, + attention_mask, + self.config.num_attention_heads // self.config.num_kv_heads, + ) # Workaround util scaled_dot_product_attention support broadcast. if self.training is True and query_layer.shape != key_layer.shape: key_layer = torch.broadcast_to(key_layer, query_layer.shape) From 0027e320b3f6c3f8a98d733ae9a42901de8a1e9d Mon Sep 17 00:00:00 2001 From: Daniel Socek Date: Wed, 11 Sep 2024 23:36:40 +0200 Subject: [PATCH 39/41] Boost SDXL speed with initialized schedule step reset (#1284) Signed-off-by: Daniel Socek --- .../stable_diffusion_xl/pipeline_stable_diffusion_xl.py | 4 ++-- .../pipeline_stable_diffusion_xl_img2img.py | 2 ++ .../pipeline_stable_diffusion_xl_inpaint.py | 2 ++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py index 8785dbbd26..0cd0cd28dd 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl.py @@ -651,6 +651,8 @@ def __call__( t1 = t0 self._num_timesteps = len(timesteps) + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index() hb_profiler = HabanaProfile( warmup=profiling_warmup_steps, @@ -688,8 +690,6 @@ def __call__( guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim ).to(device=device, dtype=latents.dtype) - self._num_timesteps = len(timesteps) - # 8.3 Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) use_warmup_inference_steps = ( diff --git a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py index 79fad16655..7b6f25d920 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_img2img.py @@ -536,6 +536,8 @@ def denoising_value_valid(dnv): ).to(device=device, dtype=latents.dtype) self._num_timesteps = len(timesteps) + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index() # 8.3 Denoising loop throughput_warmup_steps = kwargs.get("throughput_warmup_steps", 3) diff --git 
a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py index accb64fd6a..8d94596e3b 100644 --- a/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py +++ b/optimum/habana/diffusers/pipelines/stable_diffusion_xl/pipeline_stable_diffusion_xl_inpaint.py @@ -744,6 +744,8 @@ def denoising_value_valid(dnv): ).to(device=device, dtype=latents.dtype) self._num_timesteps = len(timesteps) + if hasattr(self.scheduler, "set_begin_index"): + self.scheduler.set_begin_index() outputs = { "images": [], From f87f0fb4d9e965b540cfec88f802c2dcdfe1fcc8 Mon Sep 17 00:00:00 2001 From: Thanaji Rao Thakkalapelli Date: Sat, 14 Sep 2024 09:19:04 -0700 Subject: [PATCH 40/41] Enable INC for llava models and change softmax to use torch.nn.functional.softmax as its supported module by INC (#1325) --- examples/image-to-text/README.md | 3 +- examples/image-to-text/run_pipeline.py | 48 ++++++++++++++++--- .../transformers/models/clip/modeling_clip.py | 2 +- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 97494e6846..6cde1313bb 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -73,8 +73,7 @@ python3 run_pipeline.py \ ``` ### Inference with FP8 - -Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch. +Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`. More information on enabling FP8 in SynapseAI is available here: https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index 8ef45c4f61..e93982e33f 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -36,6 +36,46 @@ logger = logging.getLogger(__name__) +def setup_quantization(model, args): + if os.getenv("USE_INC", "1") != "0": + try: + from neural_compressor.torch.quantization import FP8Config, convert, prepare + except ImportError: + raise ImportError( + "Module neural_compressor is missing. 
Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" + ) + + config = FP8Config.from_json_file(args.quant_config) + if config.measure: + model = prepare(model, config) + elif config.quantize: + model = convert(model, config) + else: + import habana_frameworks.torch.core as htcore + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + htcore.hpu_initialize(model) + + return model + + +def finalize_quantization(model): + if os.getenv("USE_INC", "1") != "0": + try: + from neural_compressor.torch.quantization import finalize_calibration + except ImportError: + raise ImportError( + "Module neural_compressor is missing. Please use a newer Synapse version to use quantization, or set the environment variable to USE_INC=0" + ) + + finalize_calibration(model) + else: + import habana_quantization_toolkit + + habana_quantization_toolkit.finish_measurements(model) + + def main(): parser = argparse.ArgumentParser() @@ -169,18 +209,14 @@ def main(): generator.model = wrap_in_hpu_graph(generator.model) if args.quant_config: - import habana_quantization_toolkit - - habana_quantization_toolkit.prep_model(generator.model) - - htcore.hpu_initialize(generator.model) + generator.model = setup_quantization(generator.model, args) # warm up for i in range(args.warmup): generator(images, prompt=args.prompt, batch_size=args.batch_size, generate_kwargs=generate_kwargs) torch.hpu.synchronize() if args.quant_config: - habana_quantization_toolkit.finish_measurements(generator.model) + finalize_quantization(generator.model) start = time.perf_counter() for i in range(args.n_iterations): diff --git a/optimum/habana/transformers/models/clip/modeling_clip.py b/optimum/habana/transformers/models/clip/modeling_clip.py index b7fb3a222e..96b03ab32a 100644 --- a/optimum/habana/transformers/models/clip/modeling_clip.py +++ b/optimum/habana/transformers/models/clip/modeling_clip.py @@ -60,7 +60,7 @@ def __init__(self): super().__init__() def forward(self, x, dim=None, invAttnHead=None): - return torch.ops.hpu.softmax_fp8(x, dim, None, None, invAttnHead) + return torch.nn.functional.softmax(x, dim) class GaudiCLIPAttention(CLIPAttention): From 520c875807ef2a5b7d9bd96df335d4d0e2487f43 Mon Sep 17 00:00:00 2001 From: Kim Yann Date: Sun, 15 Sep 2024 00:23:19 +0800 Subject: [PATCH 41/41] Add `--use_kv_cache` to image-to-text pipeline (#1292) --- examples/image-to-text/README.md | 20 ++++++++++++++++ examples/image-to-text/run_pipeline.py | 32 +++++++++++++++++++------- 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/examples/image-to-text/README.md b/examples/image-to-text/README.md index 6cde1313bb..2ac99dc829 100644 --- a/examples/image-to-text/README.md +++ b/examples/image-to-text/README.md @@ -28,6 +28,8 @@ Models that have been validated: - [llava-hf/llava-v1.6-mistral-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-mistral-7b-hf) - [llava-hf/llava-v1.6-vicuna-7b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-7b-hf) - [llava-hf/llava-v1.6-vicuna-13b-hf](https://huggingface.co/llava-hf/llava-v1.6-vicuna-13b-hf) + - [llava-hf/llava-v1.6-34b-hf](https://huggingface.co/llava-hf/llava-v1.6-34b-hf) + - [llava-hf/llama3-llava-next-8b-hf](https://huggingface.co/llava-hf/llama3-llava-next-8b-hf) ### Inference with BF16 @@ -72,6 +74,24 @@ python3 run_pipeline.py \ --bf16 ``` +To run Llava-hf/llava-v1.6-34b-hf inference, use the following command: + +```bash +python3 run_pipeline.py \ + --model_name_or_path llava-hf/llava-v1.6-34b-hf \ + 
--use_hpu_graphs \ + --bf16 +``` + +To run Llava-hf/llama3-llava-next-8b-hf inference, use the following command: + +```bash +python3 run_pipeline.py \ + --model_name_or_path llava-hf/llama3-llava-next-8b-hf \ + --use_hpu_graphs \ + --bf16 +``` + ### Inference with FP8 Inference for Llava-1.5-7b, Llava-1.5-13b, Llava-v1.6-mistral-7b and Llava-v1.6-vicuna-13b in FP8 precision are enabled using [Intel Neural Compressor (INC)](https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html), which provides model measurement and quantization capabilities in PyTorch. INC is used by default for measuring and quantization. Habana Quantization Toolkit (HQT), which was used earlier, will be removed in future releases. To use HQT, disable INC by setting the following environment variable: `USE_INC=0`. diff --git a/examples/image-to-text/run_pipeline.py b/examples/image-to-text/run_pipeline.py index e93982e33f..9161285881 100644 --- a/examples/image-to-text/run_pipeline.py +++ b/examples/image-to-text/run_pipeline.py @@ -23,7 +23,7 @@ import PIL.Image import requests import torch -from transformers import AutoConfig, pipeline +from transformers import AutoConfig, LlavaNextProcessor, LlavaProcessor, pipeline from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi @@ -141,6 +141,11 @@ def main(): action="store_true", help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", ) + parser.add_argument( + "--use_kv_cache", + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) args = parser.parse_args() @@ -156,12 +161,21 @@ def main(): args.image_path = [ "https://github.com/haotian-liu/LLaVA/blob/1a91fc274d7c35a9b50b3cb29c4247ae5837ce39/images/llava_v1_5_radar.jpg?raw=true" ] - if args.prompt is None and model_type == "llava": - args.prompt = "\nUSER: What's the content of the image?\nASSISTANT:" - elif args.prompt is None and model_type == "llava_next": - args.prompt = "[INST] \nWhat is shown in this image? [/INST]" - if args.model_name_or_path in ["llava-hf/llava-v1.6-vicuna-13b-hf", "llava-hf/llava-v1.6-vicuna-7b-hf"]: - args.prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: \nWhat is shown in this image? 
ASSISTANT:" + if args.prompt is None: + if model_type == "llava": + processor = LlavaProcessor.from_pretrained(args.model_name_or_path) + elif model_type == "llava_next": + processor = LlavaNextProcessor.from_pretrained(args.model_name_or_path) + conversation = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "What is shown in this image?"}, + {"type": "image"}, + ], + } + ] + args.prompt = processor.apply_chat_template(conversation, add_generation_prompt=True) image_paths = args.image_path image_paths_len = len(image_paths) @@ -197,6 +211,7 @@ def main(): ) generate_kwargs = { "lazy_mode": True, + "use_cache": args.use_kv_cache, "hpu_graphs": args.use_hpu_graphs, "max_new_tokens": args.max_new_tokens, "ignore_eos": args.ignore_eos, @@ -233,8 +248,9 @@ def main(): total_new_tokens_generated = args.n_iterations * n_output_tokens throughput = total_new_tokens_generated / duration + logger.info(f"result = {result}") logger.info( - f"result = {result}, time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second" + f"time = {(end-start) * 1000 / args.n_iterations }ms, Throughput (including tokenization) = {throughput} tokens/second" ) # Store results if necessary