Address review comment part 1
LiyangLingIntel committed Apr 24, 2024
1 parent 67491e5 commit 1186477
Showing 3 changed files with 16 additions and 25 deletions.
test/TritonIntelGPU/load-to-llvm-2dload.mlir: 4 changes (2 additions & 2 deletions)
@@ -1,11 +1,11 @@
- // RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
+ // RUN: triton-opt %s --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm

// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) -> vector<8xi32>
// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) -> vector<8xi16>
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 4], order = [1, 0]}>
#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], A = [8, 16], B = [16, 16], C = [8, 16]}>
module attributes {"triton_gpu.compute-capability" = 2 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
- tt.func public @matmul_no_scf_with_advance_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+ tt.func public @matmul_no_scf_with_advance_kernel(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f32>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #dpas>
%c32_i32 = arith.constant 32 : i32
%c-64_i32 = arith.constant -64 : i32
test/TritonIntelGPU/rewrite-tensor-pointer.mlir: 3 changes (0 additions & 3 deletions)
@@ -1,7 +1,4 @@
// RUN: triton-opt %s -tritonintelgpu-rewrite-tensor-pointer | FileCheck %s
- // FIXME
- // XFAIL: *
-
tt.func public @matmul_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) {
%c31_i32 = arith.constant 31 : i32
%c127_i32 = arith.constant 127 : i32
third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp: 34 changes (14 additions & 20 deletions)
@@ -169,8 +169,8 @@ struct LoadOpConversion
Value ptr = op.getPtr();
Value mask = op.getMask();
Value other = op.getOther();
- Type resultTy = op.getType();
- RankedTensorType tensorType = resultTy.cast<RankedTensorType>();
+ Type resultType = op.getType();
+ auto tensorType = cast<RankedTensorType>(resultType);

// Only lower loadOp with dpas layout encoding
Attribute layoutEncoding = tensorType.getEncoding();
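Background sketch (not part of the commit): the hunk above swaps the member-style cast for the free-function casting utilities that MLIR has been moving toward. A minimal standalone illustration of the two styles follows; the function name and value names are made up for this example and are not code from this repository.

    #include "mlir/IR/BuiltinTypes.h"   // mlir::RankedTensorType
    #include "llvm/Support/Casting.h"   // llvm::cast

    // Hypothetical helper, for illustration only.
    void castStyles(mlir::Type resultType) {
      // Older member-style cast (being phased out in MLIR):
      // auto t1 = resultType.cast<mlir::RankedTensorType>();

      // Free-function style used by the new code; asserts if resultType
      // does not actually hold a RankedTensorType.
      auto t2 = llvm::cast<mlir::RankedTensorType>(resultType);
      (void)t2;
    }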
@@ -188,7 +188,7 @@ struct LoadOpConversion
unsigned opIdx = dotLayout.getOpIdx();
Type eltTy = tensorType.getElementType();
const ArrayRef<int64_t> tensorShape = tensorType.getShape();
- unsigned numElems = getTotalElemsPerThread(resultTy);
+ unsigned numElems = getTotalElemsPerThread(resultType);
SmallVector<int64_t> numReps =
dpasLayout.getDPASRepetitions(tensorShape, dotLayout.getOpIdx());
const SmallVector<unsigned> warpsPerCTA = dpasLayout.getWarpsPerCTA();
@@ -201,8 +201,6 @@ struct LoadOpConversion
SmallVector<Value> multiDimWarpId =
delinearize(rewriter, loc, warpId, warpsPerCTA, order);

- Type load2DGenXType;
-
SmallVector<unsigned> operandShape =
opIdx == 0 ? dpasLayout.getShapeA() : dpasLayout.getShapeB();
SmallVector<int64_t> elemsPerInstr = {operandShape[0], operandShape[1]};
@@ -219,7 +217,7 @@ struct LoadOpConversion
} else {
elemsPerLane = elemsPerLane / opsPerChannel;
}
- load2DGenXType = LLVM::getFixedVectorType(elemType, elemsPerLane);
+ Type load2DGenXType = LLVM::getFixedVectorType(elemType, elemsPerLane);

// Outer dim, A is the M, B is the N. Inner dim, the K
int outerDimWarpNum = std::min<int>(
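Background sketch (not part of the commit): together with the removal of the earlier forward declaration, the hunk above now initializes load2DGenXType at its point of definition. A minimal sketch of what that type construction produces, assuming the usual MLIR LLVM-dialect header; the wrapper function name is hypothetical.

    #include "mlir/Dialect/LLVMIR/LLVMTypes.h"

    // makeLoad2DType is a made-up wrapper, not a function in this pass.
    static mlir::Type makeLoad2DType(mlir::Type elemType, unsigned elemsPerLane) {
      // For example, elemType = i16 and elemsPerLane = 8 yield vector<8xi16>,
      // matching the CHECK lines for the GenISA.LSC2DBlockRead intrinsics above.
      return mlir::LLVM::getFixedVectorType(elemType, elemsPerLane);
    }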
@@ -239,20 +237,16 @@ struct LoadOpConversion
SmallVector<Value> rets;
for (int outer = 0; outer < numRepOuter; ++outer) {
for (int k = 0; k < numRepK; ++k) {
- Value offsetX, offsetY;
- if (opIdx == 0) {
-   // A
-   offsetY =
-       add(mul(outerDimWarpId, i32_val(elemsPerInstr[opIdx])),
-           i32_val(outer * outerDimWarpNum * elemsPerInstr[opIdx]));
-   offsetX = i32_val(k * elemsPerInstr[1]);
- } else {
-   // B
-   offsetX =
-       add(mul(outerDimWarpId, i32_val(elemsPerInstr[opIdx])),
-           i32_val(outer * outerDimWarpNum * elemsPerInstr[opIdx]));
-   offsetY = i32_val(k * elemsPerInstr[0]);
- }
+ Value offsetX =
+     (opIdx == 0)
+         ? i32_val(k * elemsPerInstr[1])
+         : add(mul(outerDimWarpId, i32_val(elemsPerInstr[opIdx])),
+               i32_val(outer * outerDimWarpNum * elemsPerInstr[opIdx]));
+ Value offsetY =
+     (opIdx == 0)
+         ? add(mul(outerDimWarpId, i32_val(elemsPerInstr[opIdx])),
+               i32_val(outer * outerDimWarpNum * elemsPerInstr[opIdx]))
+         : i32_val(k * elemsPerInstr[0]);
offsetX = add(offsetX, offsetBaseX);
offsetY = add(offsetY, offsetBaseY);
width = rewriter.create<arith::TruncIOp>(loc, i32_ty, width);
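Background sketch (not part of the commit): the final hunk collapses the opIdx-dependent if/else into two conditional initializations. For operand A (opIdx == 0) the warp offset lands on the Y axis and the K step on X; for operand B the roles swap. A simplified standalone model of the same selection, using plain integers instead of the MLIR Values built by the pass; all names here are illustrative.

    #include <cstdint>
    #include <utility>

    // Returns {offsetX, offsetY} following the refactored selection logic:
    // opIdx == 0 -> operand A: warp offset on Y, K step on X.
    // opIdx != 0 -> operand B: warp offset on X, K step on Y.
    std::pair<int64_t, int64_t>
    computeOffsets(unsigned opIdx, int64_t outerDimWarpId, int64_t outerDimWarpNum,
                   int64_t outer, int64_t k, const int64_t elemsPerInstr[2]) {
      const int64_t warpOffset =
          outerDimWarpId * elemsPerInstr[opIdx] +
          outer * outerDimWarpNum * elemsPerInstr[opIdx];
      const int64_t offsetX = (opIdx == 0) ? k * elemsPerInstr[1] : warpOffset;
      const int64_t offsetY = (opIdx == 0) ? warpOffset : k * elemsPerInstr[0];
      return {offsetX, offsetY};
    }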
