diff --git a/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp b/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
index 81a5a3b83a..3ee827c2a4 100644
--- a/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
+++ b/lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
@@ -303,8 +303,7 @@ void LayoutPropagation::initAnchorLayout() {
       // back to mma further down to avoid generating reduction with MMA
       // layout that may have lower performance.
       // This can be improved with more aggressive backward propagation.
-      // FIXME: Change back NvidiaMmaEncodingAttr to MmaEncodingTrait.
-      if (isa<NvidiaMmaEncodingAttr>(tensorType.getEncoding()) &&
+      if (isa<MmaEncodingTrait>(tensorType.getEncoding()) &&
           v.getDefiningOp() &&
           !hasConvertToMMATransisitiveUse(v.getDefiningOp(),
                                           tensorType.getEncoding())) {
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
index 0238ca5916..b1b159e02e 100644
--- a/third_party/intel/backend/compiler.py
+++ b/third_party/intel/backend/compiler.py
@@ -177,16 +177,16 @@ def make_ttgir(mod, metadata, opt, device_arch):
         passes.ttir.add_convert_to_ttgpuir(pm, f"xpu:{device_arch}", opt.num_warps, opt.threads_per_warp, opt.num_ctas)
         # optimize TTGIR
         passes.ttgpuir.add_coalesce(pm)
-        passes.ttgpuir.add_remove_layout_conversions(pm)
+        intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_optimize_thread_locality(pm)
         intel.passes.ttgpuir.add_accelerate_matmul(pm, device_arch)
-        passes.ttgpuir.add_remove_layout_conversions(pm)
+        intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
         passes.common.add_cse(pm)
         passes.ttgpuir.add_prefetch(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
-        passes.ttgpuir.add_remove_layout_conversions(pm)
+        intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_reduce_data_duplication(pm)
         passes.ttgpuir.add_reorder_instructions(pm)
         passes.common.add_cse(pm)
         passes.common.add_symbol_dce(pm)
diff --git a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
index 6cff8ce891..7953ce821b 100644
--- a/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
+++ b/third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp
@@ -232,12 +232,16 @@ bool hasConvertToMMATransisitiveUse(Operation *op, Attribute encoding) {
         if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(dstEncoding))
           return (mmaLayout.getVersionMajor() > 1) ? true
                                                    : mmaLayout == encoding;
+        if (isa<DpasEncodingAttr>(dstEncoding))
+          return true;
         if (isa<triton::gpu::AMDMfmaEncodingAttr,
                 triton::gpu::AMDWmmaEncodingAttr>(dstEncoding))
           return true;
         if (isa<triton::gpu::DotOperandEncodingAttr>(dstEncoding)) {
           if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(encoding)) {
             return mmaLayout.getVersionMajor() > 1;
+          } else if (isa<DpasEncodingAttr>(encoding)) {
+            return true;
           } else {
             assert((mlir::isa<triton::gpu::AMDMfmaEncodingAttr,
                               triton::gpu::AMDWmmaEncodingAttr>(encoding)));
@@ -1093,6 +1097,12 @@ void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast() {
 void LayoutRematerialization::backwardRematerialization(
     ConvertLayoutOp convertOp) {
   RankedTensorType targetType = convertOp.getType();
+  // We don't backward propagate a dot layout whose parent is a blocked layout.
+  // It introduces a lot of duplicated values across threads.
+  if (auto dotLayout =
+          dyn_cast<DotOperandEncodingAttr>(targetType.getEncoding()))
+    if (dotLayout.getParent().isa<BlockedEncodingAttr>())
+      return;
   Value oldV = convertOp->getOperand(0);
   LDBG("check backward remat with source " << oldV << " encoding "
                                            << targetType.getEncoding());
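
Note for reviewers: the least obvious piece of this patch is the early-exit added at the top of LayoutRematerialization::backwardRematerialization. The sketch below distills that decision logic into a self-contained program; the types are hypothetical stand-ins (plain C++ with dynamic_cast standing in for MLIR's isa/dyn_cast), not Triton's real encoding attributes.

#include <cassert>

// Hypothetical stand-ins for the layout attributes named in the patch.
struct Layout { virtual ~Layout() = default; };
struct BlockedLayout : Layout {};   // ~ BlockedEncodingAttr
struct DpasLayout : Layout {};      // ~ DpasEncodingAttr (Intel's MMA layout)
struct DotOperandLayout : Layout {  // ~ DotOperandEncodingAttr
  const Layout *parent;             // the dot operand's parent layout
  explicit DotOperandLayout(const Layout *p) : parent(p) {}
};

// Mirrors the new guard: skip backward rematerialization when the convert's
// target is a dot-operand layout whose parent is a blocked layout, because
// pushing that layout backward duplicates values across threads.
bool shouldBackwardRematerialize(const Layout &target) {
  if (auto *dot = dynamic_cast<const DotOperandLayout *>(&target))
    if (dynamic_cast<const BlockedLayout *>(dot->parent))
      return false;
  return true;
}

int main() {
  BlockedLayout blocked;
  DpasLayout dpas;
  DotOperandLayout dotOfBlocked(&blocked), dotOfDpas(&dpas);
  assert(!shouldBackwardRematerialize(dotOfBlocked)); // guard fires: skip
  assert(shouldBackwardRematerialize(dotOfDpas));     // still rematerialized
  return 0;
}

The DPAS case staying eligible is consistent with the hasConvertToMMATransisitiveUse changes above, which treat DpasEncodingAttr like the other MMA layouts when deciding whether a value's uses eventually convert into an MMA encoding.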