diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f4b3f4054a..e488916f53 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3431,6 +3431,7 @@ bool FFModel::need_to_add_allreduce(int layer_idx) const { return false; } +#ifdef DEADCODE bool FFModel::need_to_add_parallel_identity(int layer_idx) const { auto const &l = layers[layer_idx]; // add parallel identity (allreduce in the backward pass) before the lm head @@ -3457,6 +3458,26 @@ bool FFModel::need_to_add_parallel_identity(int layer_idx) const { } return false; } +#endif +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer that directly + // follows a residual rms norm / residual layer norm (the old softmax/ + // argmax check was dropped; see the DEADCODE version above) + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 1 following + // the norm + layer_idx >= 2 && layer_idx < layers.size() - 1 && + // norm is followed by linear layer (lm head) + layers[layer_idx + 1]->op_type == OP_LINEAR)) { + return true; + } + return false; +} void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index 360dcb0d38..5843ffa3d9 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -536,7 +536,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, 
output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0)