Skip to content

Commit

Permalink
[GPU] Avoid optimizing out crop for faster gemm kernel selection. (openvinotoolkit#26556)
Browse files Browse the repository at this point in the history

### Details:
- Disable crop optimization when inner axis padding leads to GEMM ref
kernel selection due to lack of support in optimized kernels.

### Tickets:
 - *150556*
  • Loading branch information
jade-cho authored Sep 26, 2024
1 parent c864266 commit c20059a
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 0 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -478,6 +478,17 @@ bool crop_in_place_optimization::match(const program_node& node,
// TODO: Need to allow optimization for gemm user
// Dynamic-shape crops feeding a convolution or gemm user are never optimized out
// (in-place buffer fusing for those users is not yet supported in dynamic flow).
if (node.is_dynamic() && (user->is_type<convolution>() || user->is_type<gemm>()))
return false;
// For static shape, gemm ref kernel is selected if there is padding on the feature, x, or y axes.
// In such cases, do not optimize out this crop to use the opt kernel.
// TODO: Modify gemm_tiled_opt kernel to support padding even in static shape.
// Only the gemm's two matrix inputs (dependency index 0 or 1) are affected;
// a non-zero feature/x/y offset on the crop would become input padding on the gemm.
if ((!node.is_dynamic() || is_runtime) && user->is_type<gemm>() &&
(user->get_dependency_index(node) == 0 || user->get_dependency_index(node) == 1)) {
if (crop_params.input_offsets[0].feature[0] != 0 ||
crop_params.input_offsets[0].spatial[0] != 0 ||
crop_params.input_offsets[0].spatial[1] != 0) {
return false;
}
}
// Reshape users get special handling below.
if (user->is_type<reshape>()) {
// runtime buffer fusing is only handled when there is only one reshape user
if (node.is_dynamic() && node.get_users().size() != 1)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1464,3 +1464,33 @@ TEST(prepare_buffer_fusing, in_place_onednn_concat_static) {
}
}
#endif // ENABLE_ONEDNN_FOR_GPU

// Verifies that a crop with a non-zero offset on an inner (x) axis feeding a
// gemm input is NOT optimized out by prepare_buffer_fusing: fusing it would
// introduce input padding, which would force selection of the slow gemm ref
// kernel instead of gemm_tiled_opt.
TEST(prepare_buffer_fusing, inner_axis_data_offset_with_gemm_user) {
    auto& engine = get_test_engine();

    auto in_layout = layout{ ov::PartialShape{1, 6, 16, 16}, data_types::f16, format::bfyx };
    auto crop_layout = layout{ ov::PartialShape{1, 6, 8, 16}, data_types::f16, format::bfyx };

    // crop1 starts at the origin; crop2 starts at x=8, i.e. a non-zero offset
    // on an inner axis — the case the optimization must reject.
    auto offsets1 = tensor{0, 0, 0, 0};
    auto offsets2 = tensor{0, 0, 8, 0};

    topology topology;
    topology.add(input_layout("input", in_layout));
    topology.add(crop("crop1", input_info("input"), crop_layout.get_tensor(), offsets1));
    topology.add(permute("permute", input_info("crop1"), {0, 1, 3, 2}));
    topology.add(crop("crop2", input_info("input"), crop_layout.get_tensor(), offsets2));
    topology.add(gemm("gemm", {input_info("permute"), input_info("crop2")}, data_types::f16, false, false));

    // Only program build (graph optimization) is needed — no input data is set
    // and the network is never executed, so no input memory is allocated here.
    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::optimize_data(true));
    auto prog = program::build_program(engine, topology, config, false, false);
    ASSERT_NE(prog, nullptr);

    // crop2 has an inner-axis offset, so buffer fusing must leave it in place.
    auto& crop_node = prog->get_node("crop2").as<crop>();
    ASSERT_FALSE(crop_node.can_be_optimized());
}

0 comments on commit c20059a

Please sign in to comment.