
[CPU] MVN node refactor #26729

Open · wants to merge 13 commits into master
137 changes: 51 additions & 86 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.cpp
@@ -1,115 +1,80 @@
-// Copyright (C) 2023 Intel Corporation
+// Copyright (C) 2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //

 #include "acl_mvn.hpp"
 #include "acl_utils.hpp"

 namespace ov {
 namespace intel_cpu {

 using namespace arm_compute;

-AclMVNExecutor::AclMVNExecutor(const ExecutorContext::CPtr context) : MVNExecutor(context) {}
+bool ACLMVNExecutor::supports(const MVNConfig& config) {
+    if (config.attrs.epsMode_ == MVNEpsMode::OUTSIDE_SQRT) {
+        DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support OUTSIDE_SQRT mode");
+        return false;
+    }
+    if (!config.attrs.normalizeVariance_) {
+        DEBUG_LOG("NEMeanStdDevNormalizationLayer supports normalize_variance=true only");
+        return false;
+    }
+    return true;
+}

-bool AclMVNExecutor::init(const MVNAttrs& mvnAttrs,
-                          const std::vector<MemoryDescPtr>& srcDescs,
-                          const std::vector<MemoryDescPtr>& dstDescs,
-                          const dnnl::primitive_attr& attr) {
-    auto srcDims = srcDescs[0]->getShape().getStaticDims();
-    auto dstDims = dstDescs[0]->getShape().getStaticDims();
+void ACLMVNExecutor::updateTensorsShapes(ACLShapes& aclMemoryShapes) {
+    const auto srcDims = aclMemoryShapes[ACLArgs::ACL_SRC_0];
+    const auto srcNumDim = aclMemoryShapes[ACLArgs::ACL_SRC_0].num_dimensions();

     size_t X, Y;
-    if (mvnAttrs.initAcrossChannels_) {
-        if (srcDims.size() >= 2u) {
-            Y = srcDims[0];
-            X = srcDims[1];
-            for (size_t i = 2; i < srcDims.size(); i++) {
-                X *= srcDims[i];
+    if (aclMVNAttrs.initAcrossChannels_) {
+        if (srcDims.num_dimensions() >= 2u) {
+            Y = srcDims[srcNumDim - 1];
+            X = srcDims[srcNumDim - 2];
+            for (size_t i = 2; i < srcDims.num_dimensions(); i++) {
+                X *= srcDims[srcNumDim - i - 1];
             }
         } else {
             Y = 1;
-            X = srcDims[0];
+            X = srcDims[srcNumDim - 1];
         }
     } else {
-        if (srcDims.size() > 2u) {
-            Y = srcDims[0] * srcDims[1];
-            X = srcDims[2];
-            for (size_t i = 3; i < srcDims.size(); i++) {
-                X *= srcDims[i];
+        if (srcDims.num_dimensions() > 2u) {
+            Y = srcDims[srcNumDim - 1] * srcDims[srcNumDim - 2];
+            X = srcDims[srcNumDim - 3];
+            for (size_t i = 3; i < srcDims.num_dimensions(); i++) {
+                X *= srcDims[srcNumDim - i - 1];
             }
-        } else if (srcDims.size() == 2u) {
-            Y = srcDims[0] * srcDims[1];
+        } else if (srcDims.num_dimensions() == 2u) {
+            Y = srcDims[srcNumDim - 1] * srcDims[srcNumDim - 2];
             X = 1;
         } else {
-            Y = srcDims[0];
+            Y = srcDims[srcNumDim - 1];
             X = 1;
         }
     }

-    TensorInfo srcTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(srcDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(srcDescs[0]));
-    TensorInfo dstTensorInfo = TensorInfo(TensorShape(X, Y), 1, precisionToAclDataType(dstDescs[0]->getPrecision()), getAclDataLayoutByMemoryDesc(dstDescs[0]));
-
-    if (!arm_compute::NEMeanStdDevNormalizationLayer::validate(&srcTensorInfo, &dstTensorInfo, mvnAttrs.epsValue_))
-        return false;
-
-    srcTensor.allocator()->init(srcTensorInfo);
-    dstTensor.allocator()->init(dstTensorInfo);
-
-    mvn = std::make_unique<arm_compute::NEMeanStdDevNormalizationLayer>();
-    configureThreadSafe([&] { mvn->configure(&srcTensor, &dstTensor, mvnAttrs.epsValue_); });
-
-    return true;
+    aclMemoryShapes[ACLArgs::ACL_SRC_0] = aclMemoryShapes[ACLArgs::ACL_DST] = arm_compute::TensorShape(X, Y);
 }
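
A worked example of the collapse above (a sketch for illustration; it assumes ACL's arm_compute::TensorShape stores dimensions innermost-first, so srcDims[srcNumDim - 1] is the batch axis of an NCHW tensor):

    // Hypothetical NCHW input with N=2, C=3, H=4, W=5:
    //   initAcrossChannels_ == true  ->  Y = N = 2,         X = C*H*W = 3*4*5 = 60
    //   initAcrossChannels_ == false ->  Y = N*C = 2*3 = 6, X = H*W   = 4*5   = 20
    // Either way the input is collapsed to a 2D TensorShape(X, Y), which
    // NEMeanStdDevNormalizationLayer then normalizes row by row along X.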

-void AclMVNExecutor::exec(const std::vector<MemoryCPtr>& src, const std::vector<MemoryPtr>& dst, const void* post_ops_data_) {
-    srcTensor.allocator()->import_memory(src[0]->getData());
-    dstTensor.allocator()->import_memory(dst[0]->getData());
-
-    mvn->run();
-
-    srcTensor.allocator()->free();
-    dstTensor.allocator()->free();
+arm_compute::Status ACLMVNExecutor::validateTensorsInfo(const ACLInfos& aclMemoryInfos) {
+    if (!aclMVNAttrs.initAcrossChannels_ &&
+        aclMemoryInfos[ACLArgs::ACL_SRC_0]->data_layout() == arm_compute::DataLayout::NHWC) {
+        std::string error_description = "initAcrossChannels = false is not supported by ACL for NHWC layout";
+        DEBUG_LOG(error_description);
+        return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, error_description);
+    }
+    return arm_compute::NEMeanStdDevNormalizationLayer::validate(
+        aclMemoryInfos[ACLArgs::ACL_SRC_0].get(),
+        aclMemoryInfos[ACLArgs::ACL_DST].get(),
+        aclMVNAttrs.epsValue_);
 }

-bool AclMVNExecutorBuilder::isSupported(const MVNAttrs& mvnAttrs,
-                                        const std::vector<MemoryDescPtr>& srcDescs,
-                                        const std::vector<MemoryDescPtr>& dstDescs) const {
-    if ((srcDescs[0]->getPrecision() != ov::element::f32 &&
-         srcDescs[0]->getPrecision() != ov::element::f16) ||
-        srcDescs[0]->getPrecision() != dstDescs[0]->getPrecision()) {
-        DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support precisions:",
-                  " src[0]=", srcDescs[0]->getPrecision(),
-                  " dst[0]=", dstDescs[0]->getPrecision());
-        return false;
-    }
-
-    if (!(srcDescs[0]->hasLayoutType(LayoutType::ncsp) &&
-          dstDescs[0]->hasLayoutType(LayoutType::ncsp)) &&
-        !(srcDescs[0]->hasLayoutType(LayoutType::nspc) &&
-          dstDescs[0]->hasLayoutType(LayoutType::nspc))) {
-        DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support layout:",
-                  " src: ", srcDescs[0]->serializeFormat(),
-                  " dst: ", dstDescs[0]->serializeFormat());
-        return false;
-    }
-
-    if (mvnAttrs.epsMode_ == MVNEpsMode::OUTSIDE_SQRT) {
-        DEBUG_LOG("NEMeanStdDevNormalizationLayer does not support OUTSIDE_SQRT mode");
-        return false;
-    }
-    if (!mvnAttrs.normalizeVariance_) {
-        DEBUG_LOG("NEMeanStdDevNormalizationLayer supports normalize_variance=true only");
-        return false;
-    }
-    if (!mvnAttrs.initAcrossChannels_ &&
-        srcDescs[0]->hasLayoutType(LayoutType::nspc)) {
-        DEBUG_LOG("initAcrossChannels = false is not supported by ACL for NHWC layout");
-        return false;
-    }
-
-    return true;
-}
+ACLFunction ACLMVNExecutor::configureFunction(const ACLTensors& aclMemoryTensors) {
+    auto neMVN = std::make_unique<arm_compute::NEMeanStdDevNormalizationLayer>();
+    neMVN->configure(
+        aclMemoryTensors[ACLArgs::ACL_SRC_0].get(),
+        aclMemoryTensors[ACLArgs::ACL_DST].get(),
+        aclMVNAttrs.epsValue_);
+    return neMVN;
+}

 } // namespace intel_cpu
 } // namespace ov
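
The three overrides above (updateTensorsShapes, validateTensorsInfo, configureFunction) are the hooks the refactor moves MVN onto; a rough sketch of the sequence the ACLCommonExecutor base presumably drives, stated as an assumption rather than as code from this PR:

    // Assumed driving sequence inside ACLCommonExecutor (illustrative only):
    //   1. build the ACLShapes / ACLInfos containers from the MemoryArgs descriptors;
    //   2. call updateTensorsShapes(shapes) for the executor-specific 2D collapse;
    //   3. call validateTensorsInfo(infos) to reject unsupported cases cheaply
    //      via NEMeanStdDevNormalizationLayer::validate before any allocation;
    //   4. call configureFunction(tensors) once to create the ACL function;
    //   5. on execute(), import the actual buffers into the ACL tensors and run().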
49 changes: 16 additions & 33 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_mvn.hpp
@@ -1,51 +1,34 @@
-// Copyright (C) 2023 Intel Corporation
+// Copyright (C) 2018-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //

 #pragma once

-#include "acl_utils.hpp"
-#include "nodes/executors/mvn.hpp"
-#include "arm_compute/runtime/NEON/NEFunctions.h"
-#include "utils/debug_capabilities.h"
+#include "acl_common_executor.hpp"
+#include "nodes/executors/mvn_config.hpp"

 namespace ov {
 namespace intel_cpu {

-class AclMVNExecutor : public MVNExecutor {
+class ACLMVNExecutor : public ACLCommonExecutor {
 public:
-    AclMVNExecutor(const ExecutorContext::CPtr context);
+    ACLMVNExecutor(const MVNAttrs& attrs,
+                   const PostOps& postOps,
+                   const MemoryArgs& memory,
+                   const ExecutorContext::CPtr context) : aclMVNAttrs(attrs) {}

-    bool init(const MVNAttrs& mvnAttrs,
-              const std::vector<MemoryDescPtr>& srcDescs,
-              const std::vector<MemoryDescPtr>& dstDescs,
-              const dnnl::primitive_attr& attr) override;
-    void exec(const std::vector<MemoryCPtr>& src,
-              const std::vector<MemoryPtr>& dst,
-              const void* post_ops_data_) override;
+    static bool supports(const MVNConfig& config);

-    impl_desc_type getImplType() const override {
-        return implType;
-    }
+    void updateTensorsShapes(ACLShapes& aclMemoryShapes) override;

 private:
-    impl_desc_type implType = impl_desc_type::acl;
+    arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override;

-    arm_compute::Tensor srcTensor;
-    arm_compute::Tensor dstTensor;
-    std::unique_ptr<arm_compute::NEMeanStdDevNormalizationLayer> mvn = nullptr;
-};
+    ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override;

-class AclMVNExecutorBuilder : public MVNExecutorBuilder {
-public:
-    bool isSupported(const MVNAttrs& mvnAttrs,
-                     const std::vector<MemoryDescPtr>& srcDescs,
-                     const std::vector<MemoryDescPtr>& dstDescs) const override;
-
-    MVNExecutorPtr makeExecutor(const ExecutorContext::CPtr context) const override {
-        return std::make_shared<AclMVNExecutor>(context);
-    }
+private:
+    MVNAttrs aclMVNAttrs;
 };

+using ACLMVNExecutorPtr = std::shared_ptr<ACLMVNExecutor>;
 } // namespace intel_cpu
 } // namespace ov
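
For reference, a minimal call-site sketch of the new interface; only supports() and the constructor signature come from this diff, while config, attrs, postOps, memory and context are assumed to be supplied by the CPU plugin's executor factory:

    // Hypothetical factory-side usage (sketch, not code from this PR):
    if (ov::intel_cpu::ACLMVNExecutor::supports(config)) {
        auto mvn = std::make_shared<ov::intel_cpu::ACLMVNExecutor>(attrs, postOps, memory, context);
        // The ACLCommonExecutor base then updates, validates and runs it
        // against the same MemoryArgs.
    }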
129 changes: 129 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/common/ref_mvn.cpp
@@ -0,0 +1,129 @@
+// Copyright (C) 2023 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "ref_mvn.hpp"
+#include "openvino/core/parallel.hpp"
+
+void ov::intel_cpu::CommonMVNExecutor::execute(const ov::intel_cpu::MemoryArgs& memory) {
+    mvn_ref(reinterpret_cast<uint8_t*>(memory.at(ARG_SRC)->getData()),
+            reinterpret_cast<uint8_t*>(memory.at(ARG_DST)->getData()),
+            shape5D);
+}
+
+bool ov::intel_cpu::CommonMVNExecutor::update(const ov::intel_cpu::MemoryArgs& memory) {
+    shape5D = transformTo5DCase(memory.at(ARG_SRC)->getDescPtr()->getShape().getDims(), refMVNAttrs);
+    if (memory.at(ARG_SRC)->getDesc().hasLayoutType(LayoutType::ncsp)) {
+        refMVNAttrs.layout = MVNLayoutType::mvn_planar;
+    } else if (memory.at(ARG_SRC)->getDesc().hasLayoutType(LayoutType::nspc)) {
+        refMVNAttrs.layout = MVNLayoutType::mvn_by_channel;
+    } else {
+        refMVNAttrs.layout = MVNLayoutType::mvn_block;
+    }
+    return true;
+}
+
+bool ov::intel_cpu::CommonMVNExecutor::supports(const ov::intel_cpu::MVNConfig& config) {
+    return true;
+}
+
+void ov::intel_cpu::CommonMVNExecutor::mvn_ref(const uint8_t* src_data, uint8_t* dst_data, const VectorDims& shape5d) {
+    const float* src_data_ptr = reinterpret_cast<const float*>(src_data);
+    float* dst_data_ptr = reinterpret_cast<float*>(dst_data);
+    const size_t N = shape5d[0];
+    const size_t C = shape5d[1];
+    const size_t D = shape5d[2];
+    const size_t H = shape5d[3];
+    const size_t W = shape5d[4];
+
+    size_t C1 = H * W;
+    size_t C2 = C1 * D;
+    size_t C3 = C2 * C;
+
+    parallel_for(N, [&](int b) {
+        size_t cb = b * C3;
+        if (refMVNAttrs.execAcrossChannels_) {
+            // Parallel sum for each channel for mean
+            float C3inv = 1.f / static_cast<float>(C3);
+            float mean_temp = 0.0f;
+
+            mean_temp = parallel_sum(C, mean_temp, [&](size_t c) -> float {
+                float mean_internal = 0.0f;
+                size_t cc = cb + c * C2;
+                for (size_t sp = 0lu; sp < C2; sp++) {
+                    mean_internal += src_data_ptr[cc + sp];
+                }
+                return mean_internal;
+            });
+
+            float mean = mean_temp * C3inv;
+
+            if (refMVNAttrs.normalizeVariance_) {
+                // parallel sum for each channel for variance
+                float variance_temp = 0.0f;
+                variance_temp = parallel_sum(C, variance_temp, [&](size_t c) -> float {
+                    float variance_internal = 0.0f;
+                    size_t cc = cb + c * C2;
+                    for (size_t sp = 0lu; sp < C2; sp++) {
+                        variance_internal += (src_data_ptr[cc + sp] - mean) * (src_data_ptr[cc + sp] - mean);
+                    }
+                    return variance_internal;
+                });
+
+                float variance = 1.f;
+                if (refMVNAttrs.epsMode_ == INSIDE_SQRT)
+                    variance = 1.f / sqrtf(variance_temp * C3inv + refMVNAttrs.epsValue_);
+                else if (refMVNAttrs.epsMode_ == OUTSIDE_SQRT)
+                    variance = 1.f / (sqrtf(variance_temp * C3inv) + refMVNAttrs.epsValue_);
+
+                parallel_for(C, [&](int c) {
+                    size_t cc = cb + c * C2;
+                    for (size_t sp = 0lu; sp < C2; sp++) {
+                        dst_data_ptr[cc + sp] = (src_data_ptr[cc + sp] - mean) * variance;
+                    }
+                });
+            } else {
+                parallel_for(C, [&](int c) {
+                    size_t cc = cb + c * C2;
+                    for (size_t sp = 0lu; sp < C2; sp++) {
+                        dst_data_ptr[cc + sp] = src_data_ptr[cc + sp] - mean;
+                    }
+                });
+            }
+        } else {  // per channel
+            float C2inv = 1.f / static_cast<float>(C2);
+            parallel_for(C, [&](size_t c) {
+                // mean for this channel
+                float mean = 0.f;
+                size_t cc = cb + c * C2;
+                for (size_t sp = 0lu; sp < C2; sp++) {
+                    mean += src_data_ptr[cc + sp];
+                }
+                mean *= C2inv;
+
+                if (refMVNAttrs.normalizeVariance_) {
+                    // variance for this channel
+                    float variance = 0.f;
+                    for (size_t sp = 0lu; sp < C2; sp++) {
+                        variance += (src_data_ptr[cc + sp] - mean) * (src_data_ptr[cc + sp] - mean);
+                    }
+
+                    if (refMVNAttrs.epsMode_ == INSIDE_SQRT)
+                        variance = 1.f / sqrtf(variance * C2inv + refMVNAttrs.epsValue_);
+                    else if (refMVNAttrs.epsMode_ == OUTSIDE_SQRT)
+                        variance = 1.f / (sqrtf(variance * C2inv) + refMVNAttrs.epsValue_);
+
+                    // mvn for this channel
+                    for (size_t sp = 0lu; sp < C2; sp++) {
+                        dst_data_ptr[cc + sp] = (src_data_ptr[cc + sp] - mean) * variance;
+                    }
+                } else {
+                    // mvn for this channel
+                    for (size_t sp = 0lu; sp < C2; sp++) {
+                        dst_data_ptr[cc + sp] = src_data_ptr[cc + sp] - mean;
+                    }
+                }
+            });
+        }
+    });
+}
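
In formulas, with mean $\mu$ and variance $\sigma^2$ accumulated over $C \cdot D \cdot H \cdot W$ elements when execAcrossChannels_ is set, or over $D \cdot H \cdot W$ elements per channel otherwise, the two epsilon modes implemented above are

    $y = \dfrac{x - \mu}{\sqrt{\sigma^2 + \varepsilon}}$        (INSIDE_SQRT)
    $y = \dfrac{x - \mu}{\sqrt{\sigma^2} + \varepsilon}$        (OUTSIDE_SQRT)

and when normalizeVariance_ is false only the mean is subtracted: $y = x - \mu$.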