Address review comment part 1
LiyangLingIntel committed Apr 24, 2024
1 parent 67491e5 commit 1186477
Showing 3 changed files with 16 additions and 25 deletions.
test/TritonIntelGPU/load-to-llvm-2dload.mlir: 4 changes (2 additions & 2 deletions)
@@ -1,11 +1,11 @@
- // RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm
+ // RUN: triton-opt %s --intel-allocate-shared-memory --convert-triton-intel-gpu-to-llvm | FileCheck %s --implicit-check-not=llvm.inline_asm

// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i32(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) -> vector<8xi32>
// CHECK: llvm.func spir_funccc @llvm.genx.GenISA.LSC2DBlockRead.v8i16(i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i1, i1, i32) -> vector<8xi16>
#blocked = #triton_gpu.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 4], order = [1, 0]}>
#dpas = #triton_intel_gpu.dpas<{repeatCount = 8, systolicDepth = 8, executionSize = 16, opsPerChan = 2, threadsPerWarp = 16, warpsPerCTA = [4, 2], A = [8, 16], B = [16, 16], C = [8, 16]}>
module attributes {"triton_gpu.compute-capability" = 2 : i32, "triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
- tt.func public @matmul_no_scf_with_advance_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f32> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) attributes {noinline = false} {
+ tt.func public @matmul_no_scf_with_advance_kernel(%arg0: !tt.ptr<f16>, %arg1: !tt.ptr<f16>, %arg2: !tt.ptr<f32>, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
%cst = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #dpas>
%c32_i32 = arith.constant 32 : i32
%c-64_i32 = arith.constant -64 : i32
test/TritonIntelGPU/rewrite-tensor-pointer.mlir: 3 changes (0 additions & 3 deletions)
@@ -1,7 +1,4 @@
// RUN: triton-opt %s -tritonintelgpu-rewrite-tensor-pointer | FileCheck %s
- // FIXME
- // XFAIL: *
-
tt.func public @matmul_kernel(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg2: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg3: i32 {tt.divisibility = 16 : i32}, %arg4: i32 {tt.divisibility = 16 : i32}, %arg5: i32 {tt.divisibility = 16 : i32}, %arg6: i32 {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}, %arg8: i32 {tt.divisibility = 16 : i32}) {
%c31_i32 = arith.constant 31 : i32
%c127_i32 = arith.constant 127 : i32
third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp: 34 changes (14 additions & 20 deletions)
@@ -169,8 +169,8 @@ struct LoadOpConversion
Value ptr = op.getPtr();
Value mask = op.getMask();
Value other = op.getOther();
- Type resultTy = op.getType();
- RankedTensorType tensorType = resultTy.cast<RankedTensorType>();
+ Type resultType = op.getType();
+ auto tensorType = cast<RankedTensorType>(resultType);

// Only lower loadOp with dpas layout encoding
Attribute layoutEncoding = tensorType.getEncoding();
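Background sketch (not part of the commit): the hunk above swaps the member-style cast for the free-function casting utilities that MLIR has been moving toward. A minimal standalone illustration of the two styles follows; the function name and value names are made up for this example and are not code from this repository.

    #include "mlir/IR/BuiltinTypes.h"   // mlir::RankedTensorType
    #include "llvm/Support/Casting.h"   // llvm::cast

    // Hypothetical helper, for illustration only.
    void castStyles(mlir::Type resultType) {
      // Older member-style cast (being phased out in MLIR):
      // auto t1 = resultType.cast<mlir::RankedTensorType>();

      // Free-function style used by the new code; asserts if resultType
      // does not actually hold a RankedTensorType.
      auto t2 = llvm::cast<mlir::RankedTensorType>(resultType);
      (void)t2;
    }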
@@ -188,7 +188,7 @@ struct LoadOpConversion
unsigned opIdx = dotLayout.getOpIdx();
Type eltTy = tensorType.getElementType();
const ArrayRef<int64_t> tensorShape = tensorType.getShape();
- unsigned numElems = getTotalElemsPerThread(resultTy);
+ unsigned numElems = getTotalElemsPerThread(resultType);
SmallVector<int64_t> numReps =
dpasLayout.getDPASRepetitions(tensorShape, dotLayout.getOpIdx());
const SmallVector<unsigned> warpsPerCTA = dpasLayout.getWarpsPerCTA();
@@ -201,8 +201,6 @@ struct LoadOpConversion
SmallVector<Value> multiDimWarpId =
delinearize(rewriter, loc, warpId, warpsPerCTA, order);

- Type load2DGenXType;
-
SmallVector<unsigned> operandShape =
opIdx == 0 ? dpasLayout.getShapeA() : dpasLayout.getShapeB();
SmallVector<int64_t> elemsPerInstr = {operandShape[0], operandShape[1]};
@@ -219,7 +217,7 @@ struct LoadOpConversion
} else {
elemsPerLane = elemsPerLane / opsPerChannel;
}
- load2DGenXType = LLVM::getFixedVectorType(elemType, elemsPerLane);
+ Type load2DGenXType = LLVM::getFixedVectorType(elemType, elemsPerLane);

// Outer dim, A is the M, B is the N. Inner dim, the K
int outerDimWarpNum = std::min<int>(
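Background sketch (not part of the commit): together with the removal of the earlier forward declaration, the hunk above now initializes load2DGenXType at its point of definition. A minimal sketch of what that type construction produces, assuming the usual MLIR LLVM-dialect header; the wrapper function name is hypothetical.

    #include "mlir/Dialect/LLVMIR/LLVMTypes.h"

    // makeLoad2DType is a made-up wrapper, not a function in this pass.
    static mlir::Type makeLoad2DType(mlir::Type elemType, unsigned elemsPerLane) {
      // For example, elemType = i16 and elemsPerLane = 8 yield vector<8xi16>,
      // matching the CHECK lines for the GenISA.LSC2DBlockRead intrinsics above.
      return mlir::LLVM::getFixedVectorType(elemType, elemsPerLane);
    }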
@@ -239,20 +237,16 @@ struct LoadOpConversion
SmallVector<Value> rets;
for (int outer = 0; outer < numRepOuter; ++outer) {
for (int k = 0; k < numRepK; ++k) {
- Value offsetX, offsetY;
- if (opIdx == 0) {
-   // A
-   offsetY =
-       add(mul(outerDimWarpId, i32_val(elemsPerInstr[opIdx])),
-           i32_val(outer * outerDimWarpNum * elemsPerInstr[opIdx]));
-   offsetX = i32_val(k * elemsPerInstr[1]);
- } else {
-   // B
-   offsetX =
-       add(mul(outerDimWarpId, i32_val(elemsPerInstr[opIdx])),
-           i32_val(outer * outerDimWarpNum * elemsPerInstr[opIdx]));
-   offsetY = i32_val(k * elemsPerInstr[0]);
- }
+ Value offsetX =
+     (opIdx == 0)
+         ? i32_val(k * elemsPerInstr[1])
+         : add(mul(outerDimWarpId, i32_val(elemsPerInstr[opIdx])),
+               i32_val(outer * outerDimWarpNum * elemsPerInstr[opIdx]));
+ Value offsetY =
+     (opIdx == 0)
+         ? add(mul(outerDimWarpId, i32_val(elemsPerInstr[opIdx])),
+               i32_val(outer * outerDimWarpNum * elemsPerInstr[opIdx]))
+         : i32_val(k * elemsPerInstr[0]);
offsetX = add(offsetX, offsetBaseX);
offsetY = add(offsetY, offsetBaseY);
width = rewriter.create<arith::TruncIOp>(loc, i32_ty, width);
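Background sketch (not part of the commit): the final hunk collapses the opIdx-dependent if/else into two conditional initializations. For operand A (opIdx == 0) the warp offset lands on the Y axis and the K step on X; for operand B the roles swap. A simplified standalone model of the same selection, using plain integers instead of the MLIR Values built by the pass; all names here are illustrative.

    #include <cstdint>
    #include <utility>

    // Returns {offsetX, offsetY} following the refactored selection logic:
    // opIdx == 0 -> operand A: warp offset on Y, K step on X.
    // opIdx != 0 -> operand B: warp offset on X, K step on Y.
    std::pair<int64_t, int64_t>
    computeOffsets(unsigned opIdx, int64_t outerDimWarpId, int64_t outerDimWarpNum,
                   int64_t outer, int64_t k, const int64_t elemsPerInstr[2]) {
      const int64_t warpOffset =
          outerDimWarpId * elemsPerInstr[opIdx] +
          outer * outerDimWarpNum * elemsPerInstr[opIdx];
      const int64_t offsetX = (opIdx == 0) ? k * elemsPerInstr[1] : warpOffset;
      const int64_t offsetY = (opIdx == 0) ? warpOffset : k * elemsPerInstr[0];
      return {offsetX, offsetY};
    }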
