# [GPU] Optimize graph transformation for pytorch (#26410)
### Details:
- Fusing Transpose into MatMul could cause a performance drop even when the tensor is aligned by 16 (see the sketch after this list).
  - For small tensors aligned by 16, fuse Transpose into MatMul.
  - For large tensors, do not fuse Transpose.
- Remove Pad in front of MaxPool (a usage sketch follows the transformations_pipeline.cpp diff below).
  - MaxPool adds padding for the CEIL_PYTORCH rounding type.
  - The Pad should be removed if both pads_begin and pads_end are zero; otherwise it causes a performance drop.
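For reference, a minimal sketch of the fusion decision described above. The helper name and the size threshold are illustrative assumptions, not the plugin code; the actual logic lives in `CreateMatMulOp` in `matmul.cpp` (see the first diff below):

```cpp
#include <cstdint>
#include <vector>

// Illustrative sketch: decide whether to keep Transpose as a separate op
// (true) or fuse it into MatMul (false). The helper names and the "large"
// threshold are assumptions for illustration only.
bool keep_transpose_separate(const std::vector<int64_t>& shape_a,
                             const std::vector<int64_t>& shape_b) {
    // "Aligned" = the two innermost dimensions are divisible by 16.
    auto aligned_by_16 = [](const std::vector<int64_t>& s) {
        return s.size() >= 2 &&
               s[s.size() - 1] % 16 == 0 &&
               s[s.size() - 2] % 16 == 0;
    };
    // "Large" stands in for the plugin's size heuristic (threshold assumed).
    auto large = [](const std::vector<int64_t>& s) {
        return s[s.size() - 1] >= 64 && s[s.size() - 2] >= 64;
    };
    const bool inputs_aligned = aligned_by_16(shape_a) && aligned_by_16(shape_b);
    // Unaligned inputs always keep the explicit Transpose; aligned inputs
    // fuse it only when both are small.
    return !inputs_aligned || (large(shape_a) && large(shape_b));
}
```

With the shapes used by the new test below, `{1, 3, 16, 16}` inputs fuse the Transpose (small and aligned), while `{1, 3, 128, 64}` inputs keep it as a separate op.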

### Tickets:
 - *150556*
jade-cho authored Sep 11, 2024
1 parent efdbad6 commit f026020
Showing 3 changed files with 92 additions and 6 deletions.
4 changes: 1 addition & 3 deletions src/plugins/intel_gpu/src/plugin/ops/matmul.cpp
@@ -76,8 +76,6 @@ static void CreateMatMulOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::
[] (const ov::Dimension& dim) { return dim.is_static() && dim.get_length() % 16 == 0; }) &&
std::all_of(shapes[1].rbegin(), shapes[1].rbegin() + 2,
[] (const ov::Dimension& dim) { return dim.is_static() && dim.get_length() % 16 == 0; });
-        if (inputsAligned)
-            return false;

// Heuristic condition under which permute and the tiled_opt kernel perform better than the ref kernel.
bool in0_large = std::all_of(shapes[0].rbegin(), shapes[0].rbegin() + 2,
@@ -90,7 +88,7 @@ static void CreateMatMulOp(ProgramBuilder& p, const std::shared_ptr<ov::op::v0::
bool in1_very_large = tensor_from_dims(shapes[0].to_shape()).count() > 100000;
bool needs_to_transpose_inputs = (in0_very_large || in1_very_large) && !is_u8_i8 && !p.get_engine().get_device_info().supports_immad;

-        return (in0_large && in1_large) || needs_to_transpose_inputs;
+        return !inputsAligned || (in0_large && in1_large) || needs_to_transpose_inputs;
};

auto transposeInput = [] (ProgramBuilder& p, const std::shared_ptr<ov::Node>& op, const ov::PartialShape& shape,
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -74,6 +74,7 @@
#include "plugin/transformations/increase_position_ids_precision.hpp"
#include "plugin/transformations/group_norm_composition.hpp"
#include "plugin/transformations/dynamic_quantize_fully_connected.hpp"
#include "transformations/common_optimizations/nop_elimination.hpp"
#include "transformations/common_optimizations/rms_fusion.hpp"
#include "transformations/common_optimizations/broadcast_elementwise_fusion.hpp"
#include "transformations/common_optimizations/broadcast_transition.hpp"
@@ -879,6 +880,9 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
manager.register_pass<ov::intel_gpu::DynamicQuantizeFullyConnected>(dynamic_quantization_group_size);
}

+    // Remove Pad in front of MaxPool if both the pads_begin and pads_end are zero.
+    manager.register_pass<ov::pass::EliminatePad>();

// This is supposed to be the last pass to ensure that we don't have name collisions until
// GPU plugin stops using friendly names for program creation
manager.register_pass<ov::pass::ResolveNameCollisions>(true);
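For context, a minimal standalone sketch of what the newly registered pass does, using OpenVINO's public transformation API. The model below is a made-up example (shapes and kernel sizes are assumptions), not code from this PR:

```cpp
#include <memory>

#include "openvino/core/model.hpp"
#include "openvino/op/ops.hpp"
#include "openvino/pass/manager.hpp"
#include "transformations/common_optimizations/nop_elimination.hpp"

// Build Parameter -> Pad(all-zero pads) -> MaxPool. The Pad is a no-op,
// so ov::pass::EliminatePad should remove it and connect MaxPool
// directly to the Parameter.
int main() {
    auto data = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, ov::Shape{1, 3, 32, 32});
    auto begin = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 0, 0, 0});
    auto end = ov::op::v0::Constant::create(ov::element::i64, ov::Shape{4}, {0, 0, 0, 0});
    auto pad = std::make_shared<ov::op::v1::Pad>(data, begin, end, ov::op::PadMode::CONSTANT);
    auto pool = std::make_shared<ov::op::v1::MaxPool>(pad, ov::Strides{2, 2}, ov::Shape{0, 0},
                                                      ov::Shape{0, 0}, ov::Shape{2, 2});
    auto model = std::make_shared<ov::Model>(pool, ov::ParameterVector{data});

    ov::pass::Manager manager;
    manager.register_pass<ov::pass::EliminatePad>();
    manager.run_passes(model);  // the zero Pad is removed from the model here
    return 0;
}
```

In this PR's scenario the Pad originates from MaxPool's CEIL_PYTORCH rounding handling; when its pads_begin/pads_end turn out to be all zeros, the pass removes it rather than paying for a redundant copy on the GPU.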
90 changes: 87 additions & 3 deletions (GPU functional test file for TransposeMatMul fusion; path not shown)
@@ -6,9 +6,93 @@

using namespace ov::test;

+namespace ov {
+namespace test {
+
+using TransposeMatMulFusionParams = std::tuple<ov::PartialShape,  // input A shapes
+                                               ov::PartialShape,  // input B shapes
+                                               bool>;             // is transpose fused?
+
+class TransposeMatMulFusionOnGPU: public testing::WithParamInterface<TransposeMatMulFusionParams>,
+                                  virtual public ov::test::SubgraphBaseTest {
+public:
+    static std::string getTestCaseName(testing::TestParamInfo<TransposeMatMulFusionParams> obj) {
+        ov::PartialShape input0;
+        ov::PartialShape input1;
+        bool is_fused;
+
+        std::tie(input0, input1, is_fused) = obj.param;
+
+        std::ostringstream result;
+        result << "device=(" << std::string(utils::DEVICE_GPU) << ")_";
+        result << ov::test::utils::partialShape2str({input0}) << "_";
+        result << ov::test::utils::partialShape2str({input1}) << "_";
+        result << "is_fused(" << is_fused << ")";
+        return result.str();
+    }
+protected:
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_GPU;
+
+        ov::PartialShape shape1;
+        ov::PartialShape shape2;
+        bool is_fused;
+
+        std::tie(shape1, shape2, is_fused) = GetParam();
+
+        InputShape input_shape1 = {shape1, {shape1.get_shape()}};
+        InputShape input_shape2 = {shape2, {shape2.get_shape()}};
+        init_input_shapes({input_shape1, input_shape2});
+
+        const auto param1 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape1);
+        const auto param2 = std::make_shared<ov::op::v0::Parameter>(ov::element::f32, shape2);
+        const auto order = ov::op::v0::Constant::create(ov::element::i32, Shape{4}, {0, 1, 3, 2});
+        const auto transpose1 = std::make_shared<ov::op::v1::Transpose>(param1, order);
+        const auto transpose2 = std::make_shared<ov::op::v1::Transpose>(param2, order);
+        const auto matmul = std::make_shared<ov::op::v0::MatMul>(transpose1, transpose2, false, false);
+        const auto constant = op::v0::Constant::create(element::f32, Shape{1}, {9});
+        const auto mul = std::make_shared<ov::op::v1::Multiply>(matmul, constant);
+        function = std::make_shared<ov::Model>(mul, ov::ParameterVector{param1, param2});
+    }
+
+    void TearDown() override {
+        bool is_fused;
+
+        std::tie(std::ignore, std::ignore, is_fused) = GetParam();
+
+        const auto model = compiledModel.get_runtime_model();
+        int num_ops = 0;
+        for (const auto& node : model->get_ordered_ops()) {
+            const auto& rt_info = node->get_rt_info();
+            const auto layer_type = rt_info.find("layerType")->second.as<std::string>();
+            if (layer_type != "Reorder" && layer_type != "Const") {
+                num_ops++;
+            }
+            if (is_fused) {
+                EXPECT_NE(layer_type, "Transpose");
+                EXPECT_NE(layer_type, "Permute");
+            }
+        }
+        if (is_fused) {
+            ASSERT_EQ(num_ops, 5);  // two Inputs, one Eltwise, one MatMul and one Output
+        } else {
+            ASSERT_EQ(num_ops, 7);  // two Inputs, two Transposes, one Eltwise, one MatMul and one Output
+        }
+    }
+};
+
+} // namespace test
+} // namespace ov

namespace {
-INSTANTIATE_TEST_SUITE_P(smoke_TransposeMatMulFusion, TransposeMatMulFusion,
-                         ::testing::Values(ov::test::utils::DEVICE_GPU),
-                         TransposeMatMulFusion::getTestCaseName);
+INSTANTIATE_TEST_SUITE_P(smoke_TransposeMatMulFusion, TransposeMatMulFusionOnGPU,
+                         ::testing::Values(
+                             TransposeMatMulFusionParams({1, 3, 16, 16}, {1, 3, 16, 16}, true),
+                             TransposeMatMulFusionParams({1, 3, 128, 64}, {1, 3, 64, 128}, false)),
+                         TransposeMatMulFusionOnGPU::getTestCaseName);
+
+TEST_P(TransposeMatMulFusionOnGPU, CompareWithRefs) {
+    run();
+}

} // namespace
