[Snippets][CPU] Implemented BrgemmCopyB specific buffers
a-sidorova committed Sep 5, 2024
1 parent 7b790a6 commit 4d607ea
Showing 10 changed files with 237 additions and 131 deletions.
14 changes: 14 additions & 0 deletions src/common/snippets/include/snippets/lowered/linear_ir.hpp
@@ -191,6 +191,20 @@ class LinearIR {
return std::make_pair(expr_it, node);
}

/**
* @brief Inserts a new Expression into LinearIR at `place`, sets `loop_ids` as its loop identifiers
*        and connects its output ports to `consumers`
* @param new_expr the target expression, previously created by ExpressionFactory
* @param loop_ids vector of loop ids that will be set for the expression
* @param update_loop_ports if true, the helper updates the corresponding loop ports after insertion; otherwise they are left as-is
* @param place the expression is inserted before this position
* @param consumers vector of expression port sets; these ports will consume the outputs of the expression.
*        The vector may be empty, otherwise its size must equal the output port count
* @return iterator to the new expression in LinearIR
*/
exprIt insert_expr(const ExpressionPtr& new_expr, const std::vector<size_t>& loop_ids,
bool update_loop_ports, const constExprIt& place, const std::vector<std::set<ExpressionPort>>& consumers);

/**
* @brief Replace the several existing expressions with the one new expression that contains `new_node`.
* Calls the helper `insert_node` and performs substitution: removes `old_exprs`.
12 changes: 12 additions & 0 deletions src/common/snippets/src/lowered/linear_ir.cpp
@@ -355,6 +355,18 @@ LinearIR::exprIt LinearIR::insert_node(const std::shared_ptr<ov::Node>& new_node
return insert_node(new_node, new_inputs, loop_ids, update_loop_ports, place, consumers);
}

LinearIR::exprIt LinearIR::insert_expr(const ExpressionPtr& new_expr, const std::vector<size_t>& loop_ids,
bool update_loop_ports, const constExprIt& place, const std::vector<std::set<ExpressionPort>>& consumers) {
update_consumers_and_regs(new_expr, consumers);
new_expr->set_loop_ids(loop_ids);

const auto expr_it = insert(place, new_expr);
if (update_loop_ports)
get_loop_manager()->update_loop_ports(new_expr);

return expr_it;
}

LinearIR::exprIt LinearIR::replace_with_node(const std::vector<ExpressionPtr>& old_exprs, const std::shared_ptr<ov::Node>& new_node,
const std::vector<size_t>& loop_ids, const constExprIt& place) {
OPENVINO_ASSERT(!old_exprs.empty(), "Failed to replace node: there are no old expressions for replacing");
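For reference, a minimal sketch of how the new helper can be called. This is a sketch under assumptions: `insert_with_consumers`, `buffer_expr`, `anchor_expr` and `pos` are hypothetical placeholders, not names from this commit; only the call shape follows the declaration added above.

#include "snippets/lowered/linear_ir.hpp"

using namespace ov::snippets::lowered;

// Insert `buffer_expr` at `pos`, reusing the anchor's loop ids and handing
// the consumers of the anchor's first output over to the new expression.
LinearIR::exprIt insert_with_consumers(LinearIR& linear_ir, const ExpressionPtr& buffer_expr,
                                       const ExpressionPtr& anchor_expr, LinearIR::constExprIt pos) {
    const auto consumers = anchor_expr->get_output_port_connector(0)->get_consumers();
    return linear_ir.insert_expr(buffer_expr, anchor_expr->get_loop_ids(),
                                 /*update_loop_ports=*/true, pos, {consumers});
}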
7 changes: 4 additions & 3 deletions src/plugins/intel_cpu/src/nodes/subgraph.cpp
@@ -21,6 +21,7 @@
#include "snippets/lowered/pass/optimize_domain.hpp"
#include "snippets/lowered/pass/insert_loops.hpp"
#include "snippets/lowered/pass/mark_loops.hpp"
#include "snippets/lowered/pass/insert_buffers.hpp"
#include "transformations/defs.hpp"
#include "transformations/cpu_opset/common/pass/convert_to_swish_cpu.hpp"
#include "transformations/snippets/common/pass/mul_add_to_fma.hpp"
@@ -32,7 +33,7 @@
#include "emitters/snippets/x64/cpu_generator.hpp"
#include "transformations/snippets/x64/pass/lowered/brgemm_cpu_blocking.hpp"
#include "transformations/snippets/x64/pass/lowered/fuse_load_store_and_convert.hpp"
#include "transformations/snippets/x64/pass/lowered/set_brgemm_copy_b_buffers_shape.hpp"
#include "transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp"
#include "transformations/snippets/x64/pass/remove_converts.hpp"
#include "transformations/snippets/x64/pass/brgemm_to_brgemm_cpu.hpp"
#include "transformations/snippets/x64/pass/enforce_precision.hpp"
@@ -681,8 +682,8 @@ Subgraph::ControlFlowPasses Subgraph::getControlFlowPasses() const {
ov::intel_cpu::pass::BrgemmCPUBlocking);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::snippets::lowered::pass::InsertLoops,
ov::intel_cpu::pass::FuseLoadStoreConvert);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::After, ov::intel_cpu::pass::FuseLoadStoreConvert,
ov::intel_cpu::pass::SetBrgemmCopyBBuffersShape);
SNIPPETS_REGISTER_PASS_RELATIVE(Place::Before, ov::snippets::lowered::pass::InsertBuffers,
ov::intel_cpu::pass::InsertBrgemmCopyBBuffers);

#ifdef SNIPPETS_LIBXSMM_TPP
SNIPPETS_REGISTER_PASS_RELATIVE(Place::Before, ov::intel_cpu::pass::BrgemmCPUBlocking,
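Assuming the default order of the common Snippets pipeline, the relative registrations above imply roughly this fragment of the lowered pass order (a sketch, not the exact schedule):

// Approximate ordering implied by the registrations (assumption):
//   ... -> InsertLoops -> FuseLoadStoreConvert -> ...
//   ... -> InsertBrgemmCopyBBuffers -> InsertBuffers -> ...
// The BrgemmCopyB-specific buffers therefore already exist when the common
// InsertBuffers pass runs, so it does not need to place generic buffers on
// those edges.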
@@ -77,46 +77,6 @@ size_t get_elems_in_vec(const ov::element::Type& precision) {
}

namespace repacking {
size_t get_repacking_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr) {
OPENVINO_ASSERT(ov::is_type<ov::intel_cpu::BrgemmCopyB>(copy_b_expr->get_node()));
const auto& in_desc = copy_b_expr->get_input_port_descriptor(0);
const auto& in_layout = in_desc->get_layout();
const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(copy_b_expr->get_input_port(0));

const size_t n_blk = *in_subtensor.rbegin();
const size_t k_blk = *++in_subtensor.rbegin();
OPENVINO_ASSERT(!is_dynamic_value(n_blk) && !is_dynamic_value(k_blk), "get_repacking_buffer_size must be called with static subtensor values");

const auto& precision = copy_b_expr->get_node()->get_input_element_type(0);
// Repacking buffer shape is set in accordance to OneDNN requirements
const size_t N_dim = std::max(n_blk, compute_inner_n_block(precision));
if (!in_layout.empty() && in_layout.back() != in_layout.size() - 1) {
// In case of transpose, K dimension must be rounded-up to number of elems in vector register
// For the details, please see 'transpose16x8' and 'fixup16x16' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp
const auto elems_in_vec = brgemm_utils::get_elems_in_vec(precision);
return N_dim * rnd_up(k_blk, elems_in_vec);
} else {
// Low precision repacking writes the result by m_brgemmVNNIFactor * m_inner_n_block blocks
// despite the actual size of the input data. Because of that we have to round-up the allocation shape to always have enough memory allocated.
// For the details, please see 'copy_4x64' and 'copy_2x32' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp
const auto brgemmVNNIFactor = brgemm_utils::compute_vnni_factor(precision);
OPENVINO_ASSERT(brgemmVNNIFactor > 0, "brgemmVNNIFactor value must be positive.");
return N_dim * rnd_up(k_blk, brgemmVNNIFactor);
}
}

size_t get_compensations_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr) {
OPENVINO_ASSERT(ov::is_type<ov::intel_cpu::BrgemmCopyB>(copy_b_expr->get_node()));
const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(copy_b_expr->get_input_port(0));
const size_t n_blk = *in_subtensor.rbegin();
OPENVINO_ASSERT(!is_dynamic_value(n_blk), "get_compensations_buffer_size must be called with static subtensor values");
const auto& precision = copy_b_expr->get_node()->get_input_element_type(0);
// Compensations are computed during repacking, so we need to round-up allocation shape according to m_inner_n_block
// because of OneDNN implementation nuances (as in get_repacking_buffer_size).
// However, the compensations are computed by N dimension, so K dimension doesn't affect the compensations buffer
return std::max(n_blk, compute_inner_n_block(precision));
}

size_t compute_out_leading_dim(const size_t n_block, const ov::element::Type& precision) {
return std::max(n_block, compute_inner_n_block(precision));
}
@@ -42,18 +42,6 @@ size_t compute_vnni_factor(const ov::element::Type& precision);
size_t get_elems_in_vec(const ov::element::Type& precision);

namespace repacking {
/**
* @brief Computes buffer size that OneDNN impl needs for repacked tensor
* @param copy_b_expr Repacking expression whose information (tensor precision, layout, subtensors) is used for
* buffer size computations
*/
size_t get_repacking_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr);
/**
* @brief Computes buffer size that OneDNN impl needs for compensations
* @param copy_b_expr Repacking expression whose information (tensor precision, subtensors) is used for
* buffer size computations
*/
size_t get_compensations_buffer_size(const ov::snippets::lowered::ExpressionPtr& copy_b_expr);
/**
* @brief Computes leading dimension (LDB) which must be used in brgemm and brgemm_copy_b emitters
* @param n_block N block size shared between BrgemmCPU and BrgemmCopyB node
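The surviving `compute_out_leading_dim` helper keeps the same max-with-inner-block contract; a small illustration with an assumed inner N block of 32 (the real value comes from `compute_inner_n_block(precision)`):

// compute_out_leading_dim(n_block, precision) == max(n_block, inner_n_block)
//   n_block = 64, assumed inner block 32  ->  LDB = 64
//   n_block = 24, assumed inner block 32  ->  LDB = 32 (padded up to the OneDNN inner block)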
140 changes: 140 additions & 0 deletions src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.cpp
@@ -0,0 +1,140 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "insert_brgemm_copy_b_buffers.hpp"

#include "snippets/lowered/loop_manager.hpp"
#include "snippets/snippets_isa.hpp"
#include "snippets/utils/utils.hpp"
#include "snippets/itt.hpp"

#include "transformations/snippets/x64/op/brgemm_copy_b.hpp"
#include "utils/general_utils.h"


using namespace ov::intel_cpu::brgemm_utils::repacking;
using namespace ov::snippets::lowered;

namespace ov {
namespace intel_cpu {
namespace pass {

bool InsertBrgemmCopyBBuffers::run(LinearIR& linear_ir, LinearIR::constExprIt begin, LinearIR::constExprIt end) {
OV_ITT_SCOPED_TASK(ov::pass::itt::domains::SnippetsTransform, "Snippets::InsertBrgemmCopyBBuffers")

const auto& factory = linear_ir.get_expr_factory();

auto insert_buffer = [&](const ExpressionPtr& copy_b_expr, size_t out_port, LinearIR::constExprIt insertion_pos) {
const auto& copy_b = ov::as_type_ptr<ov::intel_cpu::BrgemmCopyB>(copy_b_expr->get_node());
const auto& copy_b_out = copy_b_expr->get_output_port_connector(out_port);
const auto copy_b_consumers = copy_b_out->get_consumers();
OPENVINO_ASSERT(copy_b_consumers.size() == 1, "BrgemmCopyB must have exactly one consumer on each output port - Brgemm");
const auto& buffer_op = std::make_shared<ov::snippets::op::Buffer>(copy_b->output(out_port));
BufferExpressionPtr buffer_expr = nullptr;
if (out_port == 0) {
buffer_expr = factory->build<RepackedWeightsBufferExpression>(buffer_op, {copy_b_out});
} else if (out_port == 1 && with_compensations(copy_b->get_type())) {
buffer_expr = factory->build<CompensationsBufferExpression>(buffer_op, {copy_b_out});
} else {
OPENVINO_THROW("BrgemmCopyB has incorrect output ports");
}
return linear_ir.insert_expr(buffer_expr, LoopManager::get_common_outer_loops(copy_b_expr, copy_b_consumers.begin()->get_expr()),
true, insertion_pos, {copy_b_consumers});
};

bool modified = false;
for (auto expr_it = begin; expr_it != end; ++expr_it) {
const auto expr = *expr_it;
if (auto copy_b = ov::as_type_ptr<ov::intel_cpu::BrgemmCopyB>(expr->get_node())) {
for (size_t i = 0; i < expr->get_output_count(); ++i) {
expr_it = insert_buffer(expr, i, std::next(expr_it));  // the buffer is inserted right after the expression
}
modified = true;
}
}
return modified;
}

InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::RepackedWeightsBufferExpression(const snippets::lowered::BufferExpression& other)
: BufferExpression(other) {}

InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::RepackedWeightsBufferExpression(const std::shared_ptr<ov::Node>& n,
const std::shared_ptr<snippets::IShapeInferSnippetsFactory>& factory) : BufferExpression(n, factory) {}

snippets::lowered::ExpressionPtr InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::clone() const {
return std::shared_ptr<RepackedWeightsBufferExpression>(new RepackedWeightsBufferExpression(*this));
}

void InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::validate() const {
BufferExpression::validate();
OPENVINO_ASSERT(get_input_count() == 1, "RepackedWeightsBufferExpression must have only one input");
const auto& parent_out = get_input_port_connector(0)->get_source();
OPENVINO_ASSERT(ov::is_type<ov::intel_cpu::BrgemmCopyB>(parent_out.get_expr()->get_node()) && parent_out.get_index() == 0,
"RepackedWeightsBufferExpression expects BrgemmCopyB as parent expression");
}

void InsertBrgemmCopyBBuffers::RepackedWeightsBufferExpression::init_allocation_size(const std::shared_ptr<snippets::lowered::LoopManager>& loop_manager,
size_t allocation_rank) {
const auto& parent_expr = get_input_port_connector(0)->get_source().get_expr();
const auto& in_layout = parent_expr->get_input_port_descriptor(0)->get_layout();
const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(parent_expr->get_input_port(0));

const size_t n_blk = *in_subtensor.rbegin();
const size_t k_blk = *++in_subtensor.rbegin();
OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(n_blk) && !ov::snippets::utils::is_dynamic_value(k_blk),
"RepackedWeightsBufferExpression supports only static subtensor values");

const auto& precision = get_node()->get_input_element_type(0);
// Repacking buffer shape is set in accordance to OneDNN requirements
const size_t N_dim = std::max(n_blk, compute_inner_n_block(precision));
if (!in_layout.empty() && in_layout.back() != in_layout.size() - 1) {
// In case of transpose, the K dimension must be rounded up to the number of elements in a vector register
// For details, please see the 'transpose16x8' and 'fixup16x16' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp
const auto elems_in_vec = brgemm_utils::get_elems_in_vec(precision);
m_allocation_size = N_dim * rnd_up(k_blk, elems_in_vec);
} else {
// Low precision repacking writes the result in m_brgemmVNNIFactor * m_inner_n_block blocks
// regardless of the actual size of the input data, so the allocation shape has to be rounded up to always have enough memory allocated.
// For details, please see the 'copy_4x64' and 'copy_2x32' implementations and usage in onednn/src/cpu/x64/matmul/brgemm_matmul_copy_utils.cpp
const auto brgemmVNNIFactor = brgemm_utils::compute_vnni_factor(precision);
OPENVINO_ASSERT(brgemmVNNIFactor > 0, "brgemmVNNIFactor value must be positive.");
m_allocation_size = N_dim * rnd_up(k_blk, brgemmVNNIFactor);
}
}

InsertBrgemmCopyBBuffers::CompensationsBufferExpression::CompensationsBufferExpression(const snippets::lowered::BufferExpression& other)
: BufferExpression(other) {}

InsertBrgemmCopyBBuffers::CompensationsBufferExpression::CompensationsBufferExpression(const std::shared_ptr<ov::Node>& n,
const std::shared_ptr<snippets::IShapeInferSnippetsFactory>& factory) : BufferExpression(n, factory) {}

snippets::lowered::ExpressionPtr InsertBrgemmCopyBBuffers::CompensationsBufferExpression::clone() const {
return std::shared_ptr<CompensationsBufferExpression>(new CompensationsBufferExpression(*this));
}

void InsertBrgemmCopyBBuffers::CompensationsBufferExpression::validate() const {
BufferExpression::validate();
OPENVINO_ASSERT(get_input_count() == 1, "CompensationsBufferExpression must have only one input");
const auto& parent_out = get_input_port_connector(0)->get_source();
OPENVINO_ASSERT(ov::is_type<ov::intel_cpu::BrgemmCopyB>(parent_out.get_expr()->get_node()) && parent_out.get_index() == 1,
"CompensationsBufferExpression expects BrgemmCopyB as parent expression");
}

void InsertBrgemmCopyBBuffers::CompensationsBufferExpression::init_allocation_size(const std::shared_ptr<snippets::lowered::LoopManager>& loop_manager,
size_t allocation_rank) {
const auto& parent_expr = get_input_port_connector(0)->get_source().get_expr();
const auto& in_subtensor = ov::snippets::utils::get_projected_subtensor(parent_expr->get_input_port(0));
const size_t n_blk = *in_subtensor.rbegin();
OPENVINO_ASSERT(!ov::snippets::utils::is_dynamic_value(n_blk), "CompensationsBufferExpression supports only static subtensor values");
const auto& precision = parent_expr->get_node()->get_input_element_type(0);
// Compensations are computed during repacking, so the allocation shape must be rounded up to m_inner_n_block
// because of OneDNN implementation nuances (as in RepackedWeightsBufferExpression::init_allocation_size).
// However, compensations are computed along the N dimension, so the K dimension doesn't affect the compensations buffer
m_allocation_size = std::max(n_blk, compute_inner_n_block(precision));
}

} // namespace pass
} // namespace intel_cpu
} // namespace ov
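To make the rounding in `init_allocation_size` concrete, here is a self-contained sketch of the non-transposed branch with assumed bf16-like parameters (the inner N block of 32 and VNNI factor of 2 are illustrative assumptions, not values read from this commit):

#include <algorithm>
#include <cstddef>
#include <cstdio>

// rnd_up as used above: round x up to the nearest multiple of factor
static size_t rnd_up(size_t x, size_t factor) {
    return (x + factor - 1) / factor * factor;
}

int main() {
    const size_t n_blk = 24, k_blk = 17;  // hypothetical projected subtensor values
    const size_t inner_n_block = 32;      // assumed result of compute_inner_n_block(bf16)
    const size_t vnni_factor = 2;         // assumed result of compute_vnni_factor(bf16)
    const size_t N_dim = std::max(n_blk, inner_n_block);           // 32
    const size_t allocation = N_dim * rnd_up(k_blk, vnni_factor);  // 32 * 18 = 576
    std::printf("repacked weights buffer: %zu elements\n", allocation);
    return 0;
}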

65 changes: 65 additions & 0 deletions src/plugins/intel_cpu/src/transformations/snippets/x64/pass/lowered/insert_brgemm_copy_b_buffers.hpp
@@ -0,0 +1,65 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "snippets/lowered/pass/pass.hpp"

#include "snippets/lowered/expressions/buffer_expression.hpp"

namespace ov {
namespace intel_cpu {
namespace pass {

/**
* @interface InsertBrgemmCopyBBuffers
* @brief Inserts Buffers after BrgemmCopyB with an allocation size calculation algorithm
*        that differs from the common one
* @ingroup snippets
*/
class InsertBrgemmCopyBBuffers: public snippets::lowered::pass::RangedPass {
public:
InsertBrgemmCopyBBuffers() = default;
OPENVINO_RTTI("InsertBrgemmCopyBBuffers", "Pass");
bool run(snippets::lowered::LinearIR& linear_ir, snippets::lowered::LinearIR::constExprIt begin, snippets::lowered::LinearIR::constExprIt end) override;

private:
class RepackedWeightsBufferExpression : public snippets::lowered::BufferExpression {
friend class snippets::lowered::ExpressionFactory;
public:
OPENVINO_RTTI("RepackedWeightsBufferExpression", "0", BufferExpression)
RepackedWeightsBufferExpression() = default;

void validate() const override;

void init_allocation_size(const std::shared_ptr<snippets::lowered::LoopManager>& loop_manager, size_t allocation_rank) override;

private:
RepackedWeightsBufferExpression(const snippets::lowered::BufferExpression& other);
RepackedWeightsBufferExpression(const std::shared_ptr<ov::Node>& n, const std::shared_ptr<snippets::IShapeInferSnippetsFactory>& factory);

snippets::lowered::ExpressionPtr clone() const override;
};

class CompensationsBufferExpression : public snippets::lowered::BufferExpression {
friend class snippets::lowered::ExpressionFactory;
public:
OPENVINO_RTTI("CompensationsBufferExpression", "0", BufferExpression)
CompensationsBufferExpression() = default;

void validate() const override;

void init_allocation_size(const std::shared_ptr<snippets::lowered::LoopManager>& loop_manager, size_t allocation_rank) override;

private:
CompensationsBufferExpression(const snippets::lowered::BufferExpression& other);
CompensationsBufferExpression(const std::shared_ptr<ov::Node>& n, const std::shared_ptr<snippets::IShapeInferSnippetsFactory>& factory);

snippets::lowered::ExpressionPtr clone() const override;
};
};

} // namespace pass
} // namespace intel_cpu
} // namespace ov
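Schematically, the pass transforms the lowered IR around a BrgemmCopyB expression as follows (the compensations branch exists only when `with_compensations()` holds for the BrgemmCopyB type; treat the exact condition as an assumption):

// Before:  ... -> BrgemmCopyB -------------------------> BrgemmCPU -> ...
// After:   ... -> BrgemmCopyB -> RepackedWeightsBuffer -> BrgemmCPU -> ...
//                            \-> CompensationsBuffer ---> BrgemmCPU
//                                (second output port, only with compensations)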
