Skip to content

Commit

Permalink
[GPU] Update gpu passes to keep valid layouts more often (#26426)
Browse files Browse the repository at this point in the history
### Details:
- Added explicit layout recalculation after certain graph modifications to
avoid full invalidation of layouts later, when `get_output_layout()` happens
to be called on a node with a stale layout
- Remove i64 data type handling in add_required_reorder pass as
ConvertPrecision is supposed to convert everything into i32.
- Move shape info tensor allocation into the `update_shape_info_tensor` call to
avoid allocating the buffer when it is not needed by the selected impl
- Fixed impl forcing in some unit tests, plus other minor test changes
  • Loading branch information
vladimir-paramuzov authored Sep 9, 2024
1 parent 556a28c commit 94a9675
Show file tree
Hide file tree
Showing 26 changed files with 181 additions and 190 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,10 @@

#include "pass_manager.h"
#include "program_node.h"
#include "mutable_data_inst.h"
#include "convert_color_inst.h"
#include "fully_connected_inst.h"
#include "assign_inst.h"
#include "mvn_inst.h"
#include "tensor_type.h"

#include <algorithm>
#include <memory>
Expand Down Expand Up @@ -64,6 +62,10 @@ void add_required_reorders::run(program& p) {
if (usr->is_type<data>())
continue;

if (!usr->is_all_valid_output_layouts()) {
usr->recalc_output_layouts(false);
}

// If usr is assign and input and output data types are different
// add reorder with usr's output data type between dep and usr
if (usr->is_type<assign>()) {
Expand All @@ -75,7 +77,7 @@ void add_required_reorders::run(program& p) {
auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, out_layout.data_type);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, dep);
new_reorder_node.recalc_output_layout(false);
new_reorder_node.recalc_output_layouts(false);
}
}

Expand All @@ -92,7 +94,7 @@ void add_required_reorders::run(program& p) {
auto new_reorder = std::make_shared<reorder>(dep.id() + "_reorder_" + usr->id(), dep.id(), out_layout.format, out_layout.data_type);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, dep);
new_reorder_node.recalc_output_layout(false);
new_reorder_node.recalc_output_layouts(false);
}
}
}
Expand Down Expand Up @@ -193,7 +195,7 @@ void add_required_reorders::run(program& p) {
auto new_reorder = std::make_shared<reorder>(input.id() + "_padding_reorder_" + usr->id(), input.id(), layout_wo_padding);
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, *usr, idx);
new_reorder_node.recalc_output_layout(false);
new_reorder_node.recalc_output_layouts(false);
} else {
continue;
}
Expand Down Expand Up @@ -222,42 +224,6 @@ void add_required_reorders::run(program& p) {
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
break;
} else if (original_layout.data_type == data_types::i64) {
// goal of this section is to use int32 implementation
// if int64 is not available for usr primitive
current_layout = original_layout;
current_layout.data_type = data_types::i32;
usr->set_output_layout(current_layout, false);
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
} else {
current_layout = original_layout;
current_layout.data_type = data_types::i32;
current_layout.format = node.first->get_output_layout().format;
usr->set_output_layout(current_layout, false);
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
}
}

if (correct_layout_selected) {
// change output_data_type field in usr to i32
if ((static_cast<bool>(usr->get_primitive()->output_data_types[0]) == true) &&
(*(usr->get_primitive()->output_data_types[0]) == data_types::i64)) {
std::const_pointer_cast<primitive>(usr->get_primitive())->output_data_types[0] = data_types::i32;
}
// add reorders between usr int32 output and inputs of its users
auto next_usr_itr = usr->get_users().begin();
while (next_usr_itr != usr->get_users().end()) {
auto next_usr = *next_usr_itr++;
if (!next_usr->is_type<reorder>()) {
if ((next_usr->get_output_layout() != usr->get_output_layout())) {
add_reorder(p, usr, next_usr);
}
}
}
break;
}
}
}

Expand Down Expand Up @@ -310,54 +276,6 @@ void add_required_reorders::run(program& p) {
}
}
}

if (!correct_layout_selected) {
// goal of this section is to use int32 implementation
// if int64 is not available for usr primitive
if (original_layout.data_type == data_types::i64) {
layout original_layout_i32(original_layout.get_partial_shape(),
data_types::i32,
original_layout.format);
usr->set_output_layout(original_layout_i32, false);
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
}

if (!correct_layout_selected) {
for (auto new_layout_format : preferred_layout_formats) {
layout current_layout_i32(original_layout_i32.get_partial_shape(),
original_layout_i32.data_type,
new_layout_format);
usr->set_output_layout(current_layout_i32, false);
if (usr->type()->does_possible_implementation_exist(*usr)) {
correct_layout_selected = true;
break;
}
}
}
if (!correct_layout_selected) {
throw std::runtime_error("Internal Error: no implementation for " + usr->id() +
" kernel which satisfies output format dependecies.");
}

// change output_data_type field in usr to i32
if ((static_cast<bool>(usr->get_primitive()->output_data_types[0]) == true) &&
(*(usr->get_primitive()->output_data_types[0]) == data_types::i64)) {
std::const_pointer_cast<primitive>(usr->get_primitive())->output_data_types[0] = data_types::i32;
}

// add reorders between usr int32 output and inputs of its users
auto next_usr_itr = usr->get_users().begin();
while (next_usr_itr != usr->get_users().end()) {
auto next_usr = *next_usr_itr++;
if (!next_usr->is_type<reorder>()) {
if ((next_usr->get_output_layout() != usr->get_output_layout())) {
add_reorder(p, usr, next_usr);
}
}
}
}
}
}

// layout is selected now add required reorders
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,8 @@ void handle_reshape::run(program& p) {
}

reorder_reshape_nodes.push_back(&new_reshape_node);
new_reshape_node.recalc_output_layouts(false);
node->recalc_output_layouts(false);
}
}

Expand Down Expand Up @@ -208,7 +210,8 @@ void handle_reshape::run(program& p) {
0,
reshape_input_node.get_dependencies().empty());
reshape_reorder_id++;
reshape_input_node.recalc_output_layout();
reshape_input_node.recalc_output_layouts(false);
node->recalc_output_layouts(false);
}
}

Expand All @@ -233,7 +236,8 @@ void handle_reshape::run(program& p) {
<< " input_info : " << reshape_input->dependencies().front().to_string() << std::endl;
auto& reshape_input_node = p.get_or_create(reshape_input);
p.add_intermediate(reshape_input_node, *node, 0, reshape_input_node.get_dependencies().empty());
reshape_input_node.recalc_output_layout();
reshape_input_node.recalc_output_layouts(false);
node->recalc_output_layouts(false);
}

// Check whether output reorder is required for format change
Expand All @@ -251,9 +255,9 @@ void handle_reshape::run(program& p) {
*user,
*node,
reshape_output_node.get_dependencies().empty());
reshape_output_node.recalc_output_layout();
reshape_output_node.recalc_output_layouts(false);
}
node->recalc_output_layout();
node->recalc_output_layouts(false);
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ void prepare_padding::run(program& p) {
auto new_reorder = std::make_shared<reorder>(node.id() + "_padding_reorder_for_" + input.id(), input.id(), input.get_output_layout());
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, node, input);
new_reorder_node.recalc_output_layouts(false);
}

p.apply_needed_padding(node, node.get_dependency(0), needed_padding);
Expand Down Expand Up @@ -209,6 +210,7 @@ void prepare_padding::run(program& p) {
auto new_reorder = std::make_shared<reorder>(node.id() + "_padding_reorder_for_" + input.id(), input.id(), input.get_output_layout());
auto& new_reorder_node = p.get_or_create(new_reorder);
p.add_intermediate(new_reorder_node, node, input);
new_reorder_node.recalc_output_layouts(false);
}

p.apply_needed_padding(node, node.get_dependency(0), needed_padding);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,19 @@ using namespace cldnn;
#define LOG_NODE_REMOVAL(id) GPU_DEBUG_LOG_PASS << __func__ << ":" << __LINE__ << ": remove node: " << (id) << std::endl;
#define LOG_NODE_REPLACEMENT(id) GPU_DEBUG_LOG_PASS << __func__ << ":" << __LINE__ << ": replace node: " << (id) << std::endl;

namespace {

// Checks whether at least one user node of `node` prefers the given
// implementation type (e.g. impl_types::onednn).
bool does_any_user_have_impl_type(program_node& node, impl_types impl) {
    bool found = false;
    for (auto& usr : node.get_users()) {
        if (usr->get_preferred_impl_type() == impl) {
            found = true;
            break;
        }
    }

    return found;
}

} // namespace

remove_redundant_reorders::remove_redundant_reorders(bool enable_reorder_fusing, bool update_implementations,
bool remove_output_reorders)
: base_pass("remove_redundant_reorders"), enable_reorder_fusing(enable_reorder_fusing), update_implementations(update_implementations),
Expand Down Expand Up @@ -290,7 +303,7 @@ void remove_redundant_reorders::run(program& p) {
i_layout.data_padding._upper_size[3] == 0 && i_layout.data_padding._lower_size[3] == 0 &&
!o_layout.data_padding &&
i_layout.data_type == o_layout.data_type &&
!layout_optimizer::onednn_check_preferred_impl_type_of_users(r_node)) {
!does_any_user_have_impl_type(r_node, impl_types::onednn)) {
// If the newly aligned pad is merged into output layout during post_optimize_graph phase
// and then buffer is reinterpreted, user node cannot handle pad properly for kernel execution
if (!update_implementations || (i_layout.feature() % 16 == 0 &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -756,6 +756,7 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {

if (new_input.first) {
p.add_intermediate(new_input.first, detection_output_node, i, !new_input.second);
detection_output_node.recalc_output_layouts();
}
}
}
Expand All @@ -770,6 +771,7 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {
layout{ input_layout.get_partial_shape(), input_layout.data_type, new_format });
if (reorder.first) {
p.add_intermediate(reorder.first, deconv_node, 0, !reorder.second);
deconv_node.recalc_output_layouts();
}
}

Expand Down Expand Up @@ -893,6 +895,7 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {
auto new_input = rf.get_reorder(input.id(), input_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, fc_node, 0, !new_input.second);
fc_node.recalc_output_layouts();
}
}

Expand All @@ -919,6 +922,7 @@ void reorder_inputs::run(program& p, reorder_factory& rf) {
auto new_input = rf.get_reorder(input->id(), dep.second, input_layout, new_layout);
if (new_input.first) {
p.add_intermediate(new_input.first, pooling_node, 0);
pooling_node.recalc_output_layouts();
}
}
};
Expand Down
1 change: 0 additions & 1 deletion src/plugins/intel_gpu/src/graph/include/layout_optimizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,6 @@ class layout_optimizer {
static bool onednn_check_data_types_for_convolution(data_types in_dt, data_types wei_dt, data_types out_dt);
static bool onednn_check_data_types_for_deconvolution(data_types in_dt, data_types wei_dt, data_types out_dt);
static bool onednn_check_data_types_for_fc_gemm(data_types in_dt, data_types wei_dt, data_types out_dt);
static bool onednn_check_preferred_impl_type_of_users(program_node& node);
bool is_primitive_implemented_for_onednn(program_node& node);
bool is_format_supported(program_node& node, format::type fmt);

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,6 @@ class typed_primitive_inst<paged_attention> : public typed_primitive_inst_base<p

protected:
void on_execute() override;

void update_shape_info_tensor(const kernel_impl_params& params) override;
};

using paged_attention_inst = typed_primitive_inst<paged_attention>;
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/include/primitive_inst.h
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,7 @@ class primitive_inst {
bool reset_mem = true,
bool runtime_alloc = false);
memory::ptr allocate_internal_buffer(size_t idx, bool reset = true);
void allocate_shape_info_memory();
static std::vector<primitive_inst*> build_exec_deps(
std::vector<std::pair<primitive_inst*, int32_t>> const& mem_deps);
int32_t get_index_in_deps(memory::cptr arg) const;
Expand Down
3 changes: 3 additions & 0 deletions src/plugins/intel_gpu/src/graph/kv_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,9 @@ int32_t kv_cache_inst::get_prealloc_iter_num() {
}

void kv_cache_inst::update_shape_info_tensor(const kernel_impl_params& params) {
if (!_shape_info_memory) {
allocate_shape_info_memory();
}
mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
auto shape_info_ptr = lock.data();
size_t offset = 0;
Expand Down
12 changes: 0 additions & 12 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1434,18 +1434,6 @@ bool layout_optimizer::is_primitive_implemented_for_onednn(program_node& node) {
return false;
}

// Returns true if any user of `node` has oneDNN as its preferred
// implementation type, false otherwise (including when `node` has no users).
bool layout_optimizer::onednn_check_preferred_impl_type_of_users(program_node& node) {
    // NOTE: no explicit empty-users guard needed — an empty user list simply
    // skips the loop and falls through to `return false`.
    for (auto& user : node.get_users()) {
        if (user->get_preferred_impl_type() == impl_types::onednn)
            return true;
    }

    return false;
}

impl_types layout_optimizer::get_forced_impl_type_by_config(program_node& node) {
#ifdef GPU_DEBUG_CONFIG
GPU_DEBUG_GET_INSTANCE(debug_config);
Expand Down
4 changes: 0 additions & 4 deletions src/plugins/intel_gpu/src/graph/paged_attention.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -143,10 +143,6 @@ void paged_attention_inst::on_execute() {
}
}

void paged_attention_inst::update_shape_info_tensor(const kernel_impl_params& params) {
parent::update_shape_info_tensor(params);
}

paged_attention_inst::typed_primitive_inst(network& network, const paged_attention_node& node)
: parent(network, node) {
const auto desc = node.get_primitive();
Expand Down
11 changes: 9 additions & 2 deletions src/plugins/intel_gpu/src/graph/primitive_inst.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -945,7 +945,16 @@ void primitive_inst::fill_shape_info_data(const layout& runtime_layout, const la
}
}

// Lazily allocates the buffer that carries runtime shape information for
// dynamic-shape kernels. Called on demand from update_shape_info_tensor()
// overrides, so the allocation is skipped entirely when the selected impl
// never needs shape info.
void primitive_inst::allocate_shape_info_memory() {
    // Total element count is a per-node property computed from the graph.
    int64_t shape_elements = _node->get_total_shape_info_size();
    // Buffer of i32 elements; trailing `false` presumably disables zero-fill
    // of the fresh allocation (the buffer is written before use) — TODO confirm
    // against engine::allocate_memory's signature.
    _shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx}, false);
}

void primitive_inst::update_shape_info_tensor(const kernel_impl_params& params) {
if (!_shape_info_memory) {
allocate_shape_info_memory();
}

mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
auto shape_info_ptr = lock.data();
size_t offset = 0;
Expand Down Expand Up @@ -1858,8 +1867,6 @@ primitive_inst::primitive_inst(network & network, program_node const& node, bool
if (_impl->is_dynamic() && !_impl->is_cpu()) {
GPU_DEBUG_TRACE_DETAIL << id() << ": initialize impl with dynamic impl " << _impl->get_kernel_name() << std::endl;
_dynamic_impl = _impl->clone();
const int64_t shape_elements = node.get_total_shape_info_size();
_shape_info_memory = _network.get_engine().allocate_memory(layout{{shape_elements}, data_types::i32, format::bfyx});
}
}
_impl_params->strm = _network.get_stream_ptr();
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/program.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -813,6 +813,7 @@ void program::apply_needed_padding(program_node& node, program_node& prev_node,

auto r_prim = std::make_shared<reorder>("reorder_input_" + node.id(), prev_node.id(), target_layout);
add_intermediate(r_prim, node, 0);
get_or_create(r_prim).recalc_output_layouts(false);
return;
}

Expand Down
4 changes: 4 additions & 0 deletions src/plugins/intel_gpu/src/graph/slice.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,10 @@ std::string slice_inst::to_string(slice_node const& node) {
}

void slice_inst::update_shape_info_tensor(const kernel_impl_params& params) {
if (!_shape_info_memory) {
allocate_shape_info_memory();
}

mem_lock<int32_t> lock(_shape_info_memory, _network.get_stream());
auto shape_info_ptr = lock.data();
size_t offset = 0;
Expand Down
Loading

0 comments on commit 94a9675

Please sign in to comment.