[RemoveLayoutConversions] Switch to use intel specialized version (#1071)

Signed-off-by: Whitney Tsang <whitney.tsang@intel.com>
whitneywhtsang authored May 9, 2024
1 parent cb75978 commit 6ef229a
Showing 3 changed files with 14 additions and 5 deletions.
3 changes: 1 addition & 2 deletions lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp
@@ -303,8 +303,7 @@ void LayoutPropagation::initAnchorLayout() {
       // back to mma further down to avoid generating reduction with MMA
       // layout that may have lower performance.
       // This can be improved with more aggressive backward propagation.
-      // FIXME: Change back NvidiaMmaEncodingAttr to MmaEncodingTrait.
-      if (isa<NvidiaMmaEncodingAttr>(tensorType.getEncoding()) &&
+      if (isa<MmaEncodingTrait>(tensorType.getEncoding()) &&
           v.getDefiningOp() &&
           !hasConvertToMMATransisitiveUse(v.getDefiningOp(),
                                           tensorType.getEncoding())) {
6 changes: 3 additions & 3 deletions third_party/intel/backend/compiler.py
@@ -177,16 +177,16 @@ def make_ttgir(mod, metadata, opt, device_arch):
         passes.ttir.add_convert_to_ttgpuir(pm, f"xpu:{device_arch}", opt.num_warps, opt.threads_per_warp, opt.num_ctas)
         # optimize TTGIR
         passes.ttgpuir.add_coalesce(pm)
-        passes.ttgpuir.add_remove_layout_conversions(pm)
+        intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_optimize_thread_locality(pm)
 
         intel.passes.ttgpuir.add_accelerate_matmul(pm, device_arch)
-        passes.ttgpuir.add_remove_layout_conversions(pm)
+        intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
         passes.common.add_cse(pm)
         passes.ttgpuir.add_prefetch(pm)
         passes.ttgpuir.add_optimize_dot_operands(pm, True)
-        passes.ttgpuir.add_remove_layout_conversions(pm)
+        intel.passes.ttgpuir.add_remove_layout_conversions(pm)
         passes.ttgpuir.add_reduce_data_duplication(pm)
         passes.ttgpuir.add_reorder_instructions(pm)
         passes.common.add_cse(pm)
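
For context, this pipeline only exercises the Intel remove-layout-conversions logic when a kernel actually produces MMA/DPAS layouts, e.g. through tl.dot. Below is a minimal sketch of such a kernel, assuming a working Intel XPU Triton build; the kernel name, tile shapes, and the "xpu" device string are illustrative and not part of this change.

import torch
import triton
import triton.language as tl


@triton.jit
def dot_kernel(a_ptr, b_ptr, c_ptr, M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
    # Single-tile GEMM: tl.dot is what ends up in an MMA/DPAS layout,
    # which the remove-layout-conversions passes above then propagate and clean up.
    offs_m = tl.arange(0, M)
    offs_n = tl.arange(0, N)
    offs_k = tl.arange(0, K)
    a = tl.load(a_ptr + offs_m[:, None] * K + offs_k[None, :])
    b = tl.load(b_ptr + offs_k[:, None] * N + offs_n[None, :])
    c = tl.dot(a, b)
    tl.store(c_ptr + offs_m[:, None] * N + offs_n[None, :], c)


a = torch.randn((64, 64), device="xpu", dtype=torch.float16)
b = torch.randn((64, 64), device="xpu", dtype=torch.float16)
c = torch.empty((64, 64), device="xpu", dtype=torch.float32)
dot_kernel[(1,)](a, b, c, M=64, N=64, K=64)

Running such a kernel with MLIR_ENABLE_DUMP=1 should print the IR around each pass, which is a convenient way to see how many convert_layout ops each add_remove_layout_conversions call eliminates.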
10 changes: 10 additions & 0 deletions
@@ -232,12 +232,16 @@ bool hasConvertToMMATransisitiveUse(Operation *op, Attribute encoding) {
       if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(dstEncoding))
         return (mmaLayout.getVersionMajor() > 1) ? true
                                                  : mmaLayout == encoding;
+      if (isa<ttgi::DpasEncodingAttr>(dstEncoding))
+        return true;
       if (isa<triton::gpu::AMDMfmaEncodingAttr,
               triton::gpu::AMDWmmaEncodingAttr>(dstEncoding))
         return true;
       if (isa<triton::gpu::DotOperandEncodingAttr>(dstEncoding)) {
         if (auto mmaLayout = dyn_cast<NvidiaMmaEncodingAttr>(encoding)) {
           return mmaLayout.getVersionMajor() > 1;
+        } else if (isa<ttgi::DpasEncodingAttr>(encoding)) {
+          return true;
         } else {
           assert((mlir::isa<triton::gpu::AMDMfmaEncodingAttr,
                             triton::gpu::AMDWmmaEncodingAttr>(encoding)));
@@ -1093,6 +1097,12 @@ void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast() {
 void LayoutRematerialization::backwardRematerialization(
     ConvertLayoutOp convertOp) {
   RankedTensorType targetType = convertOp.getType();
+  // We don't backward propagate the dot layout with blocked layout as parent.
+  // It introduces a lot of duplicated values in multiple-threads.
+  if (auto dotLayout =
+          dyn_cast<DotOperandEncodingAttr>(targetType.getEncoding()))
+    if (dotLayout.getParent().isa<BlockedEncodingAttr>())
+      return;
   Value oldV = convertOp->getOperand(0);
   LDBG("check backward remat with source " << oldV << " encoding "
                                            << targetType.getEncoding());