From f0260207dd8524b31f7ea671ae232b9f5e1e4f0e Mon Sep 17 00:00:00 2001
From: Jade Cho
Date: Wed, 11 Sep 2024 15:16:32 +0900
Subject: [PATCH] [GPU] Optimize graph transformation for pytorch (#26410)

### Details:
- Transpose fusion into MatMul may have caused a perf drop even when the tensors are aligned by 16.
  - For small, 16-aligned tensors, fuse Transpose into MatMul.
  - For large tensors, do not fuse Transpose.
- Remove Pad in front of MaxPool.
  - MaxPool adds padding for the CEIL_PYTORCH rounding type.
  - The Pad should be removed if both pads_begin and pads_end are 0; otherwise it causes a perf drop.

### Tickets:
 - *150556*
---
 .../intel_gpu/src/plugin/ops/matmul.cpp        |  4 +-
 .../src/plugin/transformations_pipeline.cpp    |  4 +
 .../subgraph_tests/transpose_matmul_fusion.cpp | 90 ++++++++++++++++++-
 3 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
index 9cbbe179173915..e3a19bc7d08556 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
@@ -76,8 +76,6 @@ static void CreateMatMulOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::MatMul>& op)
         ... > 100000;
         bool needs_to_transpose_inputs = (in0_very_large || in1_very_large) && !is_u8_i8 && !p.get_engine().get_device_info().supports_immad;
-        return (in0_large && in1_large) || needs_to_transpose_inputs;
+        return !inputsAligned || (in0_large && in1_large) || needs_to_transpose_inputs;
     };
 
     auto transposeInput = [] (ProgramBuilder& p, const std::shared_ptr<ov::Node>& op, const ov::PartialShape& shape,
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 706689966cd058..a6f0f416a399a6 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -74,6 +74,7 @@
 #include "plugin/transformations/increase_position_ids_precision.hpp"
 #include "plugin/transformations/group_norm_composition.hpp"
 #include "plugin/transformations/dynamic_quantize_fully_connected.hpp"
+#include "transformations/common_optimizations/nop_elimination.hpp"
 #include "transformations/common_optimizations/rms_fusion.hpp"
 #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp"
 #include "transformations/common_optimizations/broadcast_transition.hpp"
@@ -879,6 +880,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<DynamicQuantizeFullyConnected>(dynamic_quantization_group_size);
     }
 
+    // Remove Pad in front of MaxPool if both the pads_begin and pads_end are zero.
+    manager.register_pass<ov::pass::EliminatePad>();
+
     // This is supposed to be the last pass to ensure that we don't have name collisions until
     // GPU plugin stops using friendly names for program creation
     manager.register_pass<ov::pass::ResolveNameCollisions>(true);
diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
index 6e95d1e29a15af..b55c9e00bdab64 100644
--- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
+++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
@@ -6,9 +6,93 @@
 
 using namespace ov::test;
 
+namespace ov {
+namespace test {
+
+using TransposeMatMulFusionParams = std::tuple<ov::PartialShape,   // input 0 shape
+                                               ov::PartialShape,   // input 1 shape
+                                               bool>;              // is transpose fused?
+
+class TransposeMatMulFusionOnGPU: public testing::WithParamInterface<TransposeMatMulFusionParams>,
+                                  virtual public ov::test::SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<TransposeMatMulFusionParams> obj) {
+        ov::PartialShape input0;
+        ov::PartialShape input1;
+        bool is_fused;
+
+        std::tie(input0, input1, is_fused) = obj.param;
+
+        std::ostringstream result;
+        result << "device=(" << std::string(utils::DEVICE_GPU) << ")_";
+        result << ov::test::utils::partialShape2str({input0}) << "_";
+        result << ov::test::utils::partialShape2str({input1}) << "_";
+        result << "is_fused(" << is_fused << ")";
+        return result.str();
+    }
+protected:
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_GPU;
+
+        ov::PartialShape shape1;
+        ov::PartialShape shape2;
+        bool is_fused;
+
+        std::tie(shape1, shape2, is_fused) = GetParam();
+
+        InputShape input_shape1 = {shape1, {shape1.get_shape()}};
+        InputShape input_shape2 = {shape2, {shape2.get_shape()}};
+        init_input_shapes({input_shape1, input_shape2});
+
+        const auto param1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape1);
+        const auto param2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape2);
+        const auto order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {0, 1, 3, 2});
+        const auto transpose1 = std::make_shared<ov::op::v1::Transpose>(param1, order);
+        const auto transpose2 = std::make_shared<ov::op::v1::Transpose>(param2, order);
+        const auto matmul = std::make_shared<ov::op::v0::MatMul>(transpose1, transpose2, false, false);
+        const auto constant = op::v0::Constant::create(element::f32, Shape{1}, {9});
+        const auto mul = std::make_shared<ov::op::v1::Multiply>(matmul, constant);
+        function = std::make_shared<ov::Model>(mul, ov::ParameterVector{param1, param2});
+    }
+
+    void TearDown() override {
+        bool is_fused;
+
+        std::tie(std::ignore, std::ignore, is_fused) = GetParam();
+
+        const auto model = compiledModel.get_runtime_model();
+        int num_ops = 0;
+        for (const auto& node : model->get_ordered_ops()) {
+            const auto& rt_info = node->get_rt_info();
+            const auto layer_type = rt_info.find("layerType")->second.as<std::string>();
+            if (layer_type != "Reorder" && layer_type != "Const") {
+                num_ops++;
+            }
+            if (is_fused) {
+                EXPECT_NE(layer_type, "Transpose");
+                EXPECT_NE(layer_type, "Permute");
+            }
+        }
+        if (is_fused) {
+            ASSERT_EQ(num_ops, 5);  // two Inputs, one Eltwise, one MatMul and one Output
+        } else {
+            ASSERT_EQ(num_ops, 7);  // two Inputs, two Transposes, one Eltwise, one MatMul and one Output
+        }
+    }
+};
+
+}  // namespace test
+}  // namespace ov
+
 namespace {
 
-INSTANTIATE_TEST_SUITE_P(smoke_TransposeMatMulFusion, TransposeMatMulFusion,
-                         ::testing::Values(ov::test::utils::DEVICE_GPU),
-                         TransposeMatMulFusion::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_TransposeMatMulFusion, TransposeMatMulFusionOnGPU,
+                         ::testing::Values(
+                             TransposeMatMulFusionParams({1, 3, 16, 16}, {1, 3, 16, 16}, true),
+                             TransposeMatMulFusionParams({1, 3, 128, 64}, {1, 3, 64, 128}, false)),
+                         TransposeMatMulFusionOnGPU::getTestCaseName);
+
+TEST_P(TransposeMatMulFusionOnGPU, CompareWithRefs){
+    run();
+};
 
 } // namespace
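
For reference, the transpose-fusion decision described in the Details can be sketched as a small standalone heuristic: keep the Transpose fused into MatMul only when both inputs are 16-aligned and small, and fall back to explicit Transpose ops otherwise. This is a minimal sketch under assumptions, not the plugin's code: the helper names, the `kLarge` threshold, and the alignment rule (innermost two dimensions) are illustrative; only the 100000-element "very large" cut-off appears in the diff context above, and the real heuristic also checks u8/i8 precision and immad support.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Illustrative sketch only: helper names, the alignment rule and the kLarge
// threshold are assumptions; the u8/i8 and immad checks of the real heuristic
// are omitted.
using Shape = std::vector<int64_t>;

// Assume "aligned by 16" means the two innermost dimensions are multiples of 16.
static bool aligned_by_16(const Shape& s) {
    return s.size() >= 2 && s[s.size() - 1] % 16 == 0 && s[s.size() - 2] % 16 == 0;
}

static int64_t num_elements(const Shape& s) {
    return std::accumulate(s.begin(), s.end(), int64_t{1}, std::multiplies<int64_t>());
}

// True -> keep an explicit Transpose in front of MatMul (do not fuse).
// Fusion is kept only for small, 16-aligned inputs.
static bool should_transpose_inputs(const Shape& a, const Shape& b) {
    const int64_t kLarge = 10000;       // placeholder "large tensor" threshold
    const int64_t kVeryLarge = 100000;  // the 100000 cut-off visible in the diff context
    const bool aligned = aligned_by_16(a) && aligned_by_16(b);
    const bool large = num_elements(a) > kLarge && num_elements(b) > kLarge;
    const bool very_large = num_elements(a) > kVeryLarge || num_elements(b) > kVeryLarge;
    return !aligned || large || very_large;
}

int main() {
    // Shapes from the new functional test cases.
    std::cout << std::boolalpha
              << should_transpose_inputs({1, 3, 16, 16}, {1, 3, 16, 16}) << "\n"     // false: fuse
              << should_transpose_inputs({1, 3, 128, 64}, {1, 3, 64, 128}) << "\n";  // true: keep Transpose
}
```

With these placeholder thresholds the two shapes used in the new functional test fall on the expected sides of the decision: the 16x16 case stays fused, the 128x64 case keeps its Transpose nodes.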