Skip to content

Commit

Permalink
[GPU] Update gpu passes to keep valid layouts more often (#26426)
Browse files Browse the repository at this point in the history
### Details:
- Added explicit layout recalculation after certain graph modifications to
avoid full invalidation of layouts later, when `get_output_layout()` happens
to be called on a node with a stale layout
- Remove i64 data type handling in add_required_reorder pass as
ConvertPrecision is supposed to convert everything into i32.
- Move shape info tensor allocation into the `update_shape_info_tensor` call to
avoid allocating the buffer when it is not needed by the selected impl
- Fixed impl forcing in some unit tests, plus other minor test changes
  • Loading branch information
vladimir-paramuzov authored Sep 9, 2024
1 parent 556a28c commit 94a9675
Show file tree
Hide file tree
Showing 26 changed files with 181 additions and 190 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@

#include "pass_manager.h"
#include "program_node.h"
#include "mutable_data_inst.h"
#include "convert_color_inst.h"
#include "fully_connected_inst.h"
#include "assign_inst.h"
#include "mvn_inst.h"
#include "tensor_type.h"

#include <algorithm>
#include <memory>
Expand Down Expand Up @@ -64,6 +62,10 @@ void add_required_reorders::run(program& p) {
if (usr->is_type<data>())
continue;

if (!usr->is_all_valid_output_layouts()) {
usr->recalc_output_layouts(false);
}

// If usr is assign and input and output data types are different
// add reorder with usr's output data type between dep and usr
if (usr->is_type<assign>()) {
Expand All @@ -75,7 +77,7 @@ void add_required_reorders::run(program& p) {
auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, out_layout.data_type);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, dep);
new_reorder_node.recalc_output_layout(false);
new_reorder_node.recalc_output_layouts(false);
}
}

Expand All @@ -92,7 +94,7 @@ void add_required_reorders::run(program& p) {
auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, out_layout.data_type);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, dep);
new_reorder_node.recalc_output_layout(false);
new_reorder_node.recalc_output_layouts(false);
}
}
}
Expand Down Expand Up @@ -193,7 +195,7 @@ void add_required_reorders::run(program& p) {
auto new_reorder = std::make_shared<reorder>(input.id() + "_padding_reorder_" + usr->id(), input.id(), layout_wo_padding);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, idx);
new_reorder_node.recalc_output_layout(false);
new_reorder_node.recalc_output_layouts(false);
} else {
continue;
}
Expand Down Expand Up @@ -222,42 +224,6 @@ void add_required_reorders::run(program& p) {
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
break;
} else if (original_layout.data_type == data_types::i64) {
// goal of this section is to use int32 implementation
// if int64 is not available for usr primitive
current_layout = original_layout;
current_layout.data_type = data_types::i32;
usr->set_output_layout(current_layout, false);
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
} else {
current_layout = original_layout;
current_layout.data_type = data_types::i32;
current_layout.format = node.first->get_output_layout().format;
usr->set_output_layout(current_layout, false);
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
}
}

if (correct_layout_selected) {
// change output_data_type field in usr to i32
if ((static_cast<bool>(usr->get_primitive()->output_data_types[0]) == true) &&
(*(usr->get_primitive()->output_data_types[0]) == data_types::i64)) {
std::const_pointer_cast<primitive>(usr->get_primitive())->output_data_types[0] = data_types::i32;
}
// add reorders between usr int32 output and inputs of its users
auto next_usr_itr = usr->get_users().begin();
while (next_usr_itr != usr->get_users().end()) {
auto next_usr = *next_usr_itr++;
if (!next_usr->is_type<reorder>()) {
if ((next_usr->get_output_layout() != usr->get_output_layout())) {
add_reorder(p, usr, next_usr);
}
}
}
break;
}
}
}

Expand Down Expand Up @@ -310,54 +276,6 @@ void add_required_reorders::run(program& p) {
}
}
}

if (!correct_layout_selected) {
// goal of this section is to use int32 implementation
// if int64 is not available for usr primitive
if (original_layout.data_type == data_types::i64) {
layout original_layout_i32(original_layout.get_partial_shape(),
data_types::i32,
original_layout.format);
usr->set_output_layout(original_layout_i32, false);
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
}

if (!correct_layout_selected) {
for (auto new_layout_format : preferred_layout_formats) {
layout current_layout_i32(original_layout_i32.get_partial_shape(),
original_layout_i32.data_type,
new_layout_format);
usr->set_output_layout(current_layout_i32, false);
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
break;
}
}
}
if (!correct_layout_selected) {
throw std::runtime_error("Internal Error: no implementation for " + usr->id() +
" kernel which satisfies output format dependecies.");
}

// change output_data_type field in usr to i32
if ((static_cast<bool>(usr->get_primitive()->output_data_types[0]) == true) &&
(*(usr->get_primitive()->output_data_types[0]) == data_types::i64)) {
std::const_pointer_cast<primitive>(usr->get_primitive())->output_data_types[0] = data_types::i32;
}

// add reorders between usr int32 output and inputs of its users
auto next_usr_itr = usr->get_users().begin();
while (next_usr_itr != usr->get_users().end()) {
auto next_usr = *next_usr_itr++;
if (!next_usr->is_type<reorder>()) {
if ((next_usr->get_output_layout() != usr->get_output_layout())) {
add_reorder(p, usr, next_usr);
}
}
}
}
}
}

// layout is selected now add required reorders
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ void handle_reshape::run(program& p) {
}

reorder_reshape_nodes.push_back(&new_reshape_node);
new_reshape_node.recalc_output_layouts(false);
node->recalc_output_layouts(false);
}
}

Expand Down Expand Up @@ -208,7 +210,8 @@ void handle_reshape::run(program& p) {
0,
reshape_input_node.get_dependencies().empty());
reshape_reorder_id++;
reshape_input_node.recalc_output_layout();
reshape_input_node.recalc_output_layouts(false);
node->recalc_output_layouts(false);
}
}

Expand All @@ -233,7 +236,8 @@ void handle_reshape::run(program& p) {
<< " input_info : " << reshape_input->dependencies().front().to_string() << std::endl;
auto& reshape_input_node = p.get_or_create(reshape_input);
p.add_intermediate(reshape_input_node, *node, 0, reshape_input_node.get_dependencies().empty());
reshape_input_node.recalc_output_layout();
reshape_input_node.recalc_output_layouts(false);
node->recalc_output_layouts(false);
}

// Check whether output reorder is required for format change
Expand All @@ -251,9 +255,9 @@ void handle_reshape::run(program& p) {
*user,
*node,
reshape_output_node.get_dependencies().empty());
reshape_output_node.recalc_output_layout();
reshape_output_node.recalc_output_layouts(false);
}
node->recalc_output_layout();
node->recalc_output_layouts(false);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ void prepare_padding::run(program& p) {
auto new_reorder = std::make_shared<reorder>(node.id() + "_padding_reorder_for_" + input.id(), input.id(), input.get_output_layout());
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, node, input);
new_reorder_node.recalc_output_layouts(false);
}

p.apply_needed_padding(node, node.get_dependency(0), needed_padding);
Expand Down Expand Up @@ -209,6 +210,7 @@ void prepare_padding::run(program& p) {
auto new_reorder = std::make_shared<reorder>(node.id() + "_padding_reorder_for_" + input.id(), input.id(), input.get_output_layout());
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, node, input);
new_reorder_node.recalc_output_layouts(false);
}

p.apply_needed_padding(node, node.get_dependency(0), needed_padding);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ using namespace cldnn;
#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << __func__ << ":" << __LINE__ << ": remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << __func__ << ":" << __LINE__ << ": replace node: " << (id) << std::endl;

namespace {

// Checks whether at least one user node of `node` prefers the given
// implementation type (e.g. impl_types::onednn).
bool does_any_user_have_impl_type(program_node& node, impl_types impl) {
    bool found = false;
    for (auto& usr : node.get_users()) {
        if (usr->get_preferred_impl_type() == impl) {
            found = true;
            break;
        }
    }

    return found;
}

} // namespace

remove_redundant_reorders::remove_redundant_reorders(bool enable_reorder_fusing, bool update_implementations,
bool remove_output_reorders)
: base_pass("remove_redundant_reorders"), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations),
Expand Down Expand Up @@ -290,7 +303,7 @@ void remove_redundant_reorders::run(program& p) {
i_layout.data_padding._upper_size[3] == 0 && i_layout.data_padding._lower_size[3] == 0 &&
!o_layout.data_padding &&
i_layout.data_type == o_layout.data_type &&
!layout_optimizer::onednn_check_preferred_impl_type_of_users(r_node)) {
!does_any_user_have_impl_type(r_node, impl_types::onednn)) {
// If the newly aligned pad is merged into output layout during post_optimize_graph phase
// and then buffer is reinterpreted, user node cannot handle pad properly for kernel execution
if (!update_implementations || (i_layout.feature() % 16 == 0 &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,7 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {

if (new_input.first) {
p.add_intermediate(new_input.first, detection_output_node, i, !new_input.second);
detection_output_node.recalc_output_layouts();
}
}
}
Expand All @@ -770,6 +771,7 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {
layout{ input_layout.get_partial_shape(), input_layout.data_type, new_format });
if (reorder.first) {
p.add_intermediate(reorder.first, deconv_node, 0, !reorder.second);
deconv_node.recalc_output_layouts();
}
}

Expand Down Expand Up @@ -893,6 +895,7 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {
auto new_input = rf.get_reorder(input.id(), input_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, fc_node, 0, !new_input.second);
fc_node.recalc_output_layouts();
}
}

Expand All @@ -919,6 +922,7 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {
auto new_input = rf.get_reorder(input->id(), dep.second, input_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, pooling_node, 0);
pooling_node.recalc_output_layouts();
}
}
};
Expand Down
1 change: 0 additions & 1 deletion src/plugins/intel_gpu/src/graph/include/layout_optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ class layout_optimizer {
static bool onednn_check_data_types_for_convolution(data_types in_dt, data_types wei_dt, data_types out_dt);
static bool onednn_check_data_types_for_deconvolution(data_types in_dt, data_types wei_dt, data_types out_dt);
static bool onednn_check_data_types_for_fc_gemm(data_types in_dt, data_types wei_dt, data_types out_dt);
static bool onednn_check_preferred_impl_type_of_users(program_node& node);
bool is_primitive_implemented_for_onednn(program_node& node);
bool is_format_supported(program_node& node, format::type fmt);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,6 @@ class typed_primitive_inst<paged_attention> : public typed_primitive_inst_base<p

protected:
void on_execute() override;

void update_shape_info_tensor(const kernel_impl_params& params) override;
};

using paged_attention_inst = typed_primitive_inst<paged_attention>;
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ class primitive_inst {
bool reset_mem = true,
bool runtime_alloc = false);
memory::ptr allocate_internal_buffer(size_t idx, bool reset = true);
void allocate_shape_info_memory();
static std::vector<primitive_inst*> build_exec_deps(
std::vector<std::pair<primitive_inst*, int32_t>> const& mem_deps);
int32_t get_index_in_deps(memory::cptr arg) const;
Expand Down
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/kv_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ int32_t kv_cache_inst::get_prealloc_iter_num() {
}

void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) {
if (!_shape_info_memory) {
allocate_shape_info_memory();
}
mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
auto shape_info_ptr = lock.data();
size_t offset = 0;
Expand Down
12 changes: 0 additions & 12 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1434,18 +1434,6 @@ bool layout_optimizer::is_primitive_implemented_for_onednn(program_node& node) {
return false;
}

// Returns true if any user of `node` has oneDNN as its preferred
// implementation type, false otherwise (including when `node` has no users).
bool layout_optimizer::onednn_check_preferred_impl_type_of_users(program_node& node) {
    // NOTE: no explicit empty-users guard needed — an empty user list simply
    // skips the loop and falls through to `return false`.
    for (auto& user : node.get_users()) {
        if (user->get_preferred_impl_type() == impl_types::onednn)
            return true;
    }

    return false;
}

impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node) {
#ifdef GPU_DEBUG_CONFIG
GPU_DEBUG_GET_INSTANCE(debug_config);
Expand Down
4 changes: 0 additions & 4 deletions src/plugins/intel_gpu/src/graph/paged_attention.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,6 @@ void paged_attention_inst::on_execute() {
}
}

void paged_attention_inst::update_shape_info_tensor(const kernel_impl_params& params) {
parent::update_shape_info_tensor(params);
}

paged_attention_inst::typed_primitive_inst(network& network, const paged_attention_node& node)
: parent(network, node) {
const auto desc = node.get_primitive();
Expand Down
11 changes: 9 additions & 2 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -945,7 +945,16 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la
}
}

// Lazily allocates the buffer that carries runtime shape information for
// dynamic-shape kernels. Called on demand from update_shape_info_tensor()
// overrides, so the allocation is skipped entirely when the selected impl
// never needs shape info.
void primitive_inst::allocate_shape_info_memory() {
    // Total element count is a per-node property computed from the graph.
    int64_t shape_elements = _node->get_total_shape_info_size();
    // Buffer of i32 elements; trailing `false` presumably disables zero-fill
    // of the fresh allocation (the buffer is written before use) — TODO confirm
    // against engine::allocate_memory's signature.
    _shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false);
}

void primitive_inst::update_shape_info_tensor(const kernel_impl_params& params) {
if (!_shape_info_memory) {
allocate_shape_info_memory();
}

mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
auto shape_info_ptr = lock.data();
size_t offset = 0;
Expand Down Expand Up @@ -1858,8 +1867,6 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool
if (_impl->is_dynamic() && !_impl->is_cpu()) {
GPU_DEBUG_TRACE_DETAIL << id() << ": initialize impl with dynamic impl " << _impl->get_kernel_name() << std::endl;
_dynamic_impl = _impl->clone();
const int64_t shape_elements = node.get_total_shape_info_size();
_shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx});
}
}
_impl_params->strm = _network.get_stream_ptr();
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,7 @@ void program::apply_needed_padding(program_node& node, program_node& prev_node,

auto r_prim = std::make_shared<reorder>("reorder_input_" + node.id(), prev_node.id(), target_layout);
add_intermediate(r_prim, node, 0);
get_or_create(r_prim).recalc_output_layouts(false);
return;
}

Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/slice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ std::string slice_inst::to_string(slice_node const& node) {
}

void slice_inst::update_shape_info_tensor(const kernel_impl_params& params) {
if (!_shape_info_memory) {
allocate_shape_info_memory();
}

mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
auto shape_info_ptr = lock.data();
size_t offset = 0;
Expand Down
Loading

0 comments on commit 94a9675

Please sign in to comment.