From f0260207dd8524b31f7ea671ae232b9f5e1e4f0e Mon Sep 17 00:00:00 2001
From: Jade Cho
Date: Wed, 11 Sep 2024 15:16:32 +0900
Subject: [PATCH] [GPU] Optimize graph transformation for pytorch (#26410)

### Details:
- Transpose fusion into MatMul may have caused a perf drop even when the tensors are aligned by 16.
  - For small, 16-aligned tensors, fuse Transpose into MatMul.
  - For large tensors, do not fuse Transpose.
- Remove Pad in front of MaxPool.
  - MaxPool adds padding for the CEIL_PYTORCH rounding type.
  - The Pad should be removed if both pads_begin and pads_end are 0; otherwise it causes a perf drop.

### Tickets:
 - *150556*
---
 .../intel_gpu/src/plugin/ops/matmul.cpp        |  4 +-
 .../src/plugin/transformations_pipeline.cpp    |  4 +
 .../subgraph_tests/transpose_matmul_fusion.cpp | 90 ++++++++++++++++++-
 3 files changed, 92 insertions(+), 6 deletions(-)

diff --git a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
index 9cbbe179173915..e3a19bc7d08556 100644
--- a/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
+++ b/src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
@@ -76,8 +76,6 @@ static void CreateMatMulOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::MatMul>& op)
         ... > 100000;
         bool needs_to_transpose_inputs = (in0_very_large || in1_very_large) && !is_u8_i8 && !p.get_engine().get_device_info().supports_immad;
-        return (in0_large && in1_large) || needs_to_transpose_inputs;
+        return !inputsAligned || (in0_large && in1_large) || needs_to_transpose_inputs;
     };
 
     auto transposeInput = [] (ProgramBuilder& p, const std::shared_ptr<ov::Node>& op, const ov::PartialShape& shape,
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 706689966cd058..a6f0f416a399a6 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -74,6 +74,7 @@
 #include "plugin/transformations/increase_position_ids_precision.hpp"
 #include "plugin/transformations/group_norm_composition.hpp"
 #include "plugin/transformations/dynamic_quantize_fully_connected.hpp"
+#include "transformations/common_optimizations/nop_elimination.hpp"
 #include "transformations/common_optimizations/rms_fusion.hpp"
 #include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp"
 #include "transformations/common_optimizations/broadcast_transition.hpp"
@@ -879,6 +880,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
         manager.register_pass<DynamicQuantizeFullyConnected>(dynamic_quantization_group_size);
     }
 
+    // Remove Pad in front of MaxPool if both the pads_begin and pads_end are zero.
+    manager.register_pass<ov::pass::EliminatePad>();
+
     // This is supposed to be the last pass to ensure that we don't have name collisions until
     // GPU plugin stops using friendly names for program creation
     manager.register_pass<ov::pass::ResolveNameCollisions>(true);
diff --git a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
index 6e95d1e29a15af..b55c9e00bdab64 100644
--- a/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
+++ b/src/plugins/intel_gpu/tests/functional/shared_tests_instances/subgraph_tests/transpose_matmul_fusion.cpp
@@ -6,9 +6,93 @@
 
 using namespace ov::test;
 
+namespace ov {
+namespace test {
+
+using TransposeMatMulFusionParams = std::tuple<ov::PartialShape,   // input 0 shape
+                                               ov::PartialShape,   // input 1 shape
+                                               bool>;              // is transpose fused?
+
+class TransposeMatMulFusionOnGPU: public testing::WithParamInterface<TransposeMatMulFusionParams>,
+                                  virtual public ov::test::SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<TransposeMatMulFusionParams> obj) {
+        ov::PartialShape input0;
+        ov::PartialShape input1;
+        bool is_fused;
+
+        std::tie(input0, input1, is_fused) = obj.param;
+
+        std::ostringstream result;
+        result << "device=(" << std::string(utils::DEVICE_GPU) << ")_";
+        result << ov::test::utils::partialShape2str({input0}) << "_";
+        result << ov::test::utils::partialShape2str({input1}) << "_";
+        result << "is_fused(" << is_fused << ")";
+        return result.str();
+    }
+protected:
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_GPU;
+
+        ov::PartialShape shape1;
+        ov::PartialShape shape2;
+        bool is_fused;
+
+        std::tie(shape1, shape2, is_fused) = GetParam();
+
+        InputShape input_shape1 = {shape1, {shape1.get_shape()}};
+        InputShape input_shape2 = {shape2, {shape2.get_shape()}};
+        init_input_shapes({input_shape1, input_shape2});
+
+        const auto param1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape1);
+        const auto param2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape2);
+        const auto order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {0, 1, 3, 2});
+        const auto transpose1 = std::make_shared<ov::op::v1::Transpose>(param1, order);
+        const auto transpose2 = std::make_shared<ov::op::v1::Transpose>(param2, order);
+        const auto matmul = std::make_shared<ov::op::v0::MatMul>(transpose1, transpose2, false, false);
+        const auto constant = op::v0::Constant::create(element::f32, Shape{1}, {9});
+        const auto mul = std::make_shared<ov::op::v1::Multiply>(matmul, constant);
+        function = std::make_shared<ov::Model>(mul, ov::ParameterVector{param1, param2});
+    }
+
+    void TearDown() override {
+        bool is_fused;
+
+        std::tie(std::ignore, std::ignore, is_fused) = GetParam();
+
+        const auto model = compiledModel.get_runtime_model();
+        int num_ops = 0;
+        for (const auto& node : model->get_ordered_ops()) {
+            const auto& rt_info = node->get_rt_info();
+            const auto layer_type = rt_info.find("layerType")->second.as<std::string>();
+            if (layer_type != "Reorder" && layer_type != "Const") {
+                num_ops++;
+            }
+            if (is_fused) {
+                EXPECT_NE(layer_type, "Transpose");
+                EXPECT_NE(layer_type, "Permute");
+            }
+        }
+        if (is_fused) {
+            ASSERT_EQ(num_ops, 5);  // two Inputs, one Eltwise, one MatMul and one Output
+        } else {
+            ASSERT_EQ(num_ops, 7);  // two Inputs, two Transposes, one Eltwise, one MatMul and one Output
+        }
+    }
+};
+
+}  // namespace test
+}  // namespace ov
+
 namespace {
 
-INSTANTIATE_TEST_SUITE_P(smoke_TransposeMatMulFusion, TransposeMatMulFusion,
-                         ::testing::Values(ov::test::utils::DEVICE_GPU),
-                         TransposeMatMulFusion::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_TransposeMatMulFusion, TransposeMatMulFusionOnGPU,
+                         ::testing::Values(
+                             TransposeMatMulFusionParams({1, 3, 16, 16}, {1, 3, 16, 16}, true),
+                             TransposeMatMulFusionParams({1, 3, 128, 64}, {1, 3, 64, 128}, false)),
+                         TransposeMatMulFusionOnGPU::getTestCaseName);
+
+TEST_P(TransposeMatMulFusionOnGPU, CompareWithRefs){
+    run();
+};
 
 } // namespace
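
For reference, the transpose-fusion decision described in the Details can be sketched as a small standalone heuristic: keep the Transpose fused into MatMul only when both inputs are 16-aligned and small, and fall back to explicit Transpose ops otherwise. This is a minimal sketch under assumptions, not the plugin's code: the helper names, the `kLarge` threshold, and the alignment rule (innermost two dimensions) are illustrative; only the 100000-element "very large" cut-off appears in the diff context above, and the real heuristic also checks u8/i8 precision and immad support.

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Illustrative sketch only: helper names, the alignment rule and the kLarge
// threshold are assumptions; the u8/i8 and immad checks of the real heuristic
// are omitted.
using Shape = std::vector<int64_t>;

// Assume "aligned by 16" means the two innermost dimensions are multiples of 16.
static bool aligned_by_16(const Shape& s) {
    return s.size() >= 2 && s[s.size() - 1] % 16 == 0 && s[s.size() - 2] % 16 == 0;
}

static int64_t num_elements(const Shape& s) {
    return std::accumulate(s.begin(), s.end(), int64_t{1}, std::multiplies<int64_t>());
}

// True -> keep an explicit Transpose in front of MatMul (do not fuse).
// Fusion is kept only for small, 16-aligned inputs.
static bool should_transpose_inputs(const Shape& a, const Shape& b) {
    const int64_t kLarge = 10000;       // placeholder "large tensor" threshold
    const int64_t kVeryLarge = 100000;  // the 100000 cut-off visible in the diff context
    const bool aligned = aligned_by_16(a) && aligned_by_16(b);
    const bool large = num_elements(a) > kLarge && num_elements(b) > kLarge;
    const bool very_large = num_elements(a) > kVeryLarge || num_elements(b) > kVeryLarge;
    return !aligned || large || very_large;
}

int main() {
    // Shapes from the new functional test cases.
    std::cout << std::boolalpha
              << should_transpose_inputs({1, 3, 16, 16}, {1, 3, 16, 16}) << "\n"     // false: fuse
              << should_transpose_inputs({1, 3, 128, 64}, {1, 3, 64, 128}) << "\n";  // true: keep Transpose
}
```

With these placeholder thresholds the two shapes used in the new functional test fall on the expected sides of the decision: the 16x16 case stays fused, the 128x64 case keeps its Transpose nodes.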