From fa8db5cf2f9cf724fd2703353d40e3b37a8e7310 Mon Sep 17 00:00:00 2001
From: Yejing-Lai
Date: Sat, 11 Jan 2025 06:18:01 +0800
Subject: [PATCH 1/3] Support pure meta model lm_head tp (#6812)

Add lm_head tp support when no checkpoint is provided to
deepspeed.init_inference().

---------

Co-authored-by: Logan Adams <114770087+loadams@users.noreply.github.com>
Co-authored-by: Ma, Guokai
---
 deepspeed/module_inject/replace_module.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/deepspeed/module_inject/replace_module.py b/deepspeed/module_inject/replace_module.py
index e59f84bc8453..00b22aac81d8 100644
--- a/deepspeed/module_inject/replace_module.py
+++ b/deepspeed/module_inject/replace_module.py
@@ -342,13 +342,11 @@ def set_lm_head(module):
             module.lm_head, "weight") and module.lm_head.weight.is_meta:
         module.lm_head.weight = embedding_weight
     # enable tensor parallel for the last linear
-    if hasattr(module, "lm_head") and hasattr(module.lm_head,
-                                              "weight") and not module.lm_head.weight.is_meta and isinstance(
-                                                  module.lm_head, torch.nn.Linear):
+    if hasattr(module, "lm_head") and hasattr(module.lm_head, "weight") and isinstance(
+            module.lm_head, torch.nn.Linear):
         module = replace_wo_policy(module, ("lm_head", ), 0, "lm_head")
-    elif hasattr(module, "embed_out") and hasattr(module.embed_out,
-                                                  "weight") and not module.embed_out.weight.is_meta and isinstance(
-                                                      module.embed_out, torch.nn.Linear):
+    elif hasattr(module, "embed_out") and hasattr(module.embed_out, "weight") and isinstance(
+            module.embed_out, torch.nn.Linear):
         module = replace_wo_policy(module, ("embed_out", ), 0, "embed_out")
     elif hasattr(module, "language_model") and hasattr(module.language_model, "lm_head"):
         module = replace_wo_policy(module.language_model, ("lm_head", ), 0, "lm_head")
@@ -389,7 +387,6 @@ def conv2d_parallel_shard_weights(model, rank, world_size):
                                                  checkpoint=checkpoint_file)
                 pbar.update(1)
             gc.collect()
-        replaced_module = set_lm_head(replaced_module)
     # conv2d tp module replace
     # Now is for yuan model. Add model list and conv policy to decide whether to replace conv.
     if 'Yuan' in str(replaced_module):
@@ -399,6 +396,9 @@
                                          orig_class=orig_layer_impl,
                                          replace_fn=replace_fn,
                                          _replace_policy=config.injection_policy_tuple)
+    # AutoTP default set lm_head tp
+    if not config.replace_with_kernel_inject:
+        replaced_module = set_lm_head(replaced_module)
 
     quantizer = GroupQuantizer(q_int8=quantize)
     world_size = dist.get_world_size() if dist.is_initialized() else 1
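Usage note: below is a minimal sketch of the AutoTP path this patch extends,
assuming a standard Hugging Face causal-LM and a multi-rank launch such as
`deepspeed --num_gpus 2 example.py`. The model name, tp_size, and dtype are
illustrative assumptions, not part of the patch; the relevant behavior is
that deepspeed.init_inference() is called without a checkpoint argument and
without kernel injection, so set_lm_head() now shards the last linear layer.

    import torch
    import deepspeed
    from transformers import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained("gpt2")

    # No checkpoint= argument here: with this patch, the AutoTP path
    # (replace_with_kernel_inject=False) still tensor-parallelizes the
    # final lm_head/embed_out linear via set_lm_head(), including when
    # the weights start out on the meta device.
    engine = deepspeed.init_inference(model,
                                      tensor_parallel={"tp_size": 2},
                                      dtype=torch.float16,
                                      replace_with_kernel_inject=False)
    # engine.module can now be used like the original model, e.g. .generate().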
From 396f8db793b37db9b11847df8245f85bc57eeaa3 Mon Sep 17 00:00:00 2001
From: Nir Sonnenschein
Date: Mon, 13 Jan 2025 18:50:22 +0200
Subject: [PATCH 2/3] Remove op compilation flags due to perf issue (#6944)

In some scenarios, some of the optimization flags passed to the ops
compiler for HPU can cause a significant performance degradation.
Remove the flags until the issue is resolved.

---
 op_builder/hpu/builder.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/op_builder/hpu/builder.py b/op_builder/hpu/builder.py
index c176a586ba49..11e710a8ee48 100644
--- a/op_builder/hpu/builder.py
+++ b/op_builder/hpu/builder.py
@@ -32,9 +32,6 @@ def builder(self):
 
     def cxx_args(self):
         args = ['-O3', '-g', '-Wno-reorder']
-        CPU_ARCH = self.cpu_arch()
-        SIMD_WIDTH = self.simd_width()
-        args += [CPU_ARCH, '-fopenmp', SIMD_WIDTH]
         return args
 
     def libraries_args(self):

From 66d3d3e94dbdfbbf6535cab66256c238983fc7c3 Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Mon, 13 Jan 2025 10:34:15 -0800
Subject: [PATCH 3/3] Pin nv-a6000 workflow (#6938)

The breaking change in transformers is
https://github.com/huggingface/transformers/pull/35235. Changes are
needed before the nv-a6000 workflow can be unpinned.

---
 .github/workflows/nv-a6000.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/nv-a6000.yml b/.github/workflows/nv-a6000.yml
index 639f27498dd9..0547431e3099 100644
--- a/.github/workflows/nv-a6000.yml
+++ b/.github/workflows/nv-a6000.yml
@@ -40,8 +40,9 @@ jobs:
           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
       - name: Install transformers
         run: |
-          git clone --depth=1 https://github.com/huggingface/transformers
+          git clone https://github.com/huggingface/transformers
           cd transformers
+          git checkout v4.47.1
           git rev-parse --short HEAD
           python -m pip install .
       - name: Install deepspeed
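Reproduction note: to build the same pinned environment locally, the
checkout sequence from the workflow step above can be run as-is; the
repository URL and tag are taken verbatim from the diff, nothing else
is assumed.

    git clone https://github.com/huggingface/transformers
    cd transformers
    git checkout v4.47.1
    python -m pip install .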