diff --git a/src/runtime/model.cc b/src/runtime/model.cc index f4b3f4054a..e488916f53 100644 --- a/src/runtime/model.cc +++ b/src/runtime/model.cc @@ -3431,6 +3431,7 @@ bool FFModel::need_to_add_allreduce(int layer_idx) const { return false; } +#ifdef DEADCODE bool FFModel::need_to_add_parallel_identity(int layer_idx) const { auto const &l = layers[layer_idx]; // add parallel identity (allreduce in the backward pass) before the lm head @@ -3457,6 +3458,26 @@ bool FFModel::need_to_add_parallel_identity(int layer_idx) const { } return false; } +#endif +bool FFModel::need_to_add_parallel_identity(int layer_idx) const { + auto const &l = layers[layer_idx]; + // add parallel identity (allreduce in the backward pass) before the lm head + // we find the lm head by looking for the linear layer that directly + // follows a residual rms norm / residual layer norm (the old softmax/ + // argmax check was dropped; see the DEADCODE version above) + if (config.computationMode == COMP_MODE_INFERENCE && + config.tensor_parallelism_degree > 1 && + ((l->op_type == OP_RESIDUAL_RMS_NORM || + l->op_type == OP_RESIDUAL_LAYERNORM) && + // there are at least 2 layers before the norm, and at least 1 following + // the norm + layer_idx >= 2 && layer_idx < layers.size() - 1 && + // norm is followed by linear layer (lm head) + layers[layer_idx + 1]->op_type == OP_LINEAR)) { + return true; + } + return false; +} void FFModel::create_operators_from_layers() { std::map tensors_to_parallel_tensors; diff --git a/tests/peft/peft_alignment_test.py b/tests/peft/peft_alignment_test.py index 360dcb0d38..5843ffa3d9 100644 --- a/tests/peft/peft_alignment_test.py +++ b/tests/peft/peft_alignment_test.py @@ -536,7 +536,7 @@ def compare(hf_tensor, ff_tensor, label="", additional_ff_tensor=None, tolerance ff_tensor_name = f"layers.{i}.layers.{i}.self_attn" output_comparison = TensorComparisonIdxs(hf_tensor_type="output_gradient", ff_tensor_type="output_gradient", hf_tensor_idx=0, ff_tensor_idx=0) hf_tensor = get_hf_tensor(hf_tensor_name, 
output_comparison) - ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.TO_REDUCE) + ff_tensor = get_ff_tensor(ff_tensor_name, output_comparison, hf_tensor.shape, tp_type=TPType.REPLICATE) compare(hf_tensor, ff_tensor, label=f"Attn O-proj {i} gradient output") ff_tensor_name = f"layers.{i}.layers.{i}.self_attn.o_proj" input_comparison = TensorComparisonIdxs(hf_tensor_type="input_gradient", ff_tensor_type="input_gradient", hf_tensor_idx=0, ff_tensor_idx=0)