[ARM CPU] Add ACL FC executor for FP32/FP16 precision #24123

Merged: 6 commits, Aug 13, 2024
134 changes: 134 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.cpp
@@ -0,0 +1,134 @@
// Copyright (C) 2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "acl_common_executor.hpp"
#include "acl_utils.hpp"
#include "nodes/executors/memory_arguments.hpp"
#include "utils/debug_capabilities.h"

namespace ov {
namespace intel_cpu {

static const std::unordered_map<int, ACLArgs> argConvert = {
    {ARG_SRC_0, ACL_SRC_0},
    {ARG_SRC_1, ACL_SRC_1},
    {ARG_SRC_2, ACL_SRC_2},
    {ARG_BIAS,  ACL_BIAS},
    {ARG_WEI,   ACL_WEI},
    {ARG_DST,   ACL_DST},
};

using ACLTypes = std::array<arm_compute::DataType, ACLArgs::COUNT_OF_ARGS>;
using ACLLayouts = std::array<arm_compute::DataLayout, ACLArgs::COUNT_OF_ARGS>;

static void initACLTensorParams(const MemoryPtr& memoryPtr,
                                const ACLTensorAttrs& attrs,
                                arm_compute::TensorShape& tensorShape,
                                arm_compute::DataType& dataType,
                                arm_compute::DataLayout& dataLayout) {
    dataType = precisionToAclDataType(memoryPtr->getPrecision());
    dataLayout = getAclDataLayoutByMemoryDesc(memoryPtr->getDescPtr());
    if (dataType != arm_compute::DataType::UNKNOWN) {
        auto collapsed_dims = collapse_dims_to_max_rank(memoryPtr->getStaticDims(), attrs.maxDimsShape);
        tensorShape = shapeCast(collapsed_dims);
        if (attrs.hasLayoutTypeNHWC) {
            changeLayoutToNH_C({&tensorShape});
        }
    }
}

static std::shared_ptr<arm_compute::TensorInfo> initTensorInfo(const arm_compute::TensorShape& tensorShape,
                                                               const arm_compute::DataType& dataType,
                                                               const arm_compute::DataLayout& dataLayout) {
    std::shared_ptr<arm_compute::TensorInfo> aclMemoryInfo = nullptr;
    if (dataType != arm_compute::DataType::UNKNOWN) {
        aclMemoryInfo = std::make_shared<arm_compute::TensorInfo>(
                tensorShape, 1,
                dataType,
                dataLayout);
    }
    return aclMemoryInfo;
}

static std::shared_ptr<arm_compute::Tensor> initTensor(const std::shared_ptr<arm_compute::TensorInfo>& aclMemoryInfo) {
    std::shared_ptr<arm_compute::Tensor> aclMemory = nullptr;
    if (aclMemoryInfo) {
        aclMemory = std::make_shared<arm_compute::Tensor>();
        aclMemory->allocator()->init(*aclMemoryInfo);
    }
    return aclMemory;
}

ACLCommonExecutor::ACLCommonExecutor() {
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; ++i) {
        aclTensorAttrs.memoryUsageIndicator[i] = false;
    }
}

bool ACLCommonExecutor::update(const MemoryArgs &memory) {
    // Initialize ACL tensors params
    ACLShapes aclMemoryShapes;
    ACLTypes aclDataType{};
    ACLLayouts aclDataLayout{};
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        initACLTensorParams(cpu_mem_ptr.second, aclTensorAttrs,
                            aclMemoryShapes[index],
                            aclDataType[index],
                            aclDataLayout[index]);
    }

    // Update ACL tensors shapes
    updateTensorsShapes(aclMemoryShapes);

    // Initialize arm_compute::TensorInfo objects
    ACLInfos aclMemoryInfos;
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        aclMemoryInfos[i] = initTensorInfo(aclMemoryShapes[i], aclDataType[i], aclDataLayout[i]);
    }

    // Validate arm_compute::TensorInfo objects for specific ACL function
    auto tensorsInfoValidateStatus = validateTensorsInfo(aclMemoryInfos);
    if (!tensorsInfoValidateStatus) {
        DEBUG_LOG("ACL operator validation failed: ", tensorsInfoValidateStatus.error_description());
        return false;
    }

    // Initialize arm_compute::Tensor objects
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        aclMemoryTensors[i] = initTensor(aclMemoryInfos[i]);
        // Indicate that arm_compute::Tensor object can use import_memory function
        if (aclMemoryTensors[i]) {
            aclTensorAttrs.memoryUsageIndicator[i] = true;
        }
    }

    // Configure arm_compute::IFunction object
    configureThreadSafe([&] {
        iFunction = configureFunction(aclMemoryTensors);
    });
    return true;
}

void ACLCommonExecutor::execute(const MemoryArgs &memory) {
Contributor: I propose leaving a TODO noting that it should be enough to call import_memory() just once, within the scope of the update() method, but it is not working for some reason and should be investigated.
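For reference, a minimal sketch of the variant the comment proposes, reusing argConvert and the tensor members from this file (hypothetical: the PR keeps import_memory() in execute() because this variant reportedly does not work yet):

// Hypothetical sketch of the proposed refactor: import each CPU buffer once,
// at the end of update(), instead of on every execute() call.
bool ACLCommonExecutor::update(const MemoryArgs &memory) {
    // ... existing shape, TensorInfo, validation and configure steps ...
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        if (aclTensorAttrs.memoryUsageIndicator[index]) {
            aclMemoryTensors[index]->allocator()->import_memory(cpu_mem_ptr.second->getData());
        }
    }
    return true;
}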

    // TODO: Move import_memory() to update() function - CVS-145871
    for (auto& cpu_mem_ptr : memory) {
        const ACLArgs index = argConvert.at(cpu_mem_ptr.first);
        if (aclTensorAttrs.memoryUsageIndicator[index]) {
            aclMemoryTensors[index]->allocator()->import_memory(memory.at(cpu_mem_ptr.first)->getData());
        }
    }
    iFunction->run();
}

ACLCommonExecutor::~ACLCommonExecutor() {
    for (int i = 0; i < ACLArgs::COUNT_OF_ARGS; i++) {
        if (aclTensorAttrs.memoryUsageIndicator[i]) {
            aclMemoryTensors[i]->allocator()->free();
        }
    }
}

} // namespace intel_cpu
} // namespace ov
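To illustrate how the pieces above fit together, here is a hedged sketch of the caller-side lifecycle; the factory name and the memory setup are assumptions for illustration, not code from this PR:

// Hypothetical caller-side view of the lifecycle (illustrative only):
//   update()  - validates shapes/types and configures the ACL function;
//   execute() - imports the current CPU buffers and runs the kernel;
//   ~dtor     - frees the imported allocations.
MemoryPtr srcMemPtr, weiMemPtr, dstMemPtr;   // assume these are prepared by the caller
MemoryArgs memory;
memory[ARG_SRC_0] = srcMemPtr;
memory[ARG_WEI]   = weiMemPtr;
memory[ARG_DST]   = dstMemPtr;

std::shared_ptr<ACLCommonExecutor> executor = makeSomeAclExecutor();  // hypothetical factory
if (executor->update(memory)) {   // false if ACL validation rejects the configuration
    executor->execute(memory);    // can be called repeatedly; memory is re-imported each run
}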
58 changes: 58 additions & 0 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_common_executor.hpp
@@ -0,0 +1,58 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "cpu_memory.h"
#include "nodes/executors/executor.hpp"
#include "arm_compute/runtime/NEON/NEFunctions.h"

namespace ov {
namespace intel_cpu {

enum ACLArgs {
    ACL_SRC_0,
    ACL_SRC_1,
    ACL_SRC_2,
    ACL_BIAS,
    ACL_WEI,
    ACL_DST,
    COUNT_OF_ARGS
};

using ACLFunction = std::unique_ptr<arm_compute::IFunction>;
using ACLShapes = std::array<arm_compute::TensorShape, ACLArgs::COUNT_OF_ARGS>;
using ACLInfos = std::array<std::shared_ptr<arm_compute::TensorInfo>, ACLArgs::COUNT_OF_ARGS>;
using ACLTensors = std::array<std::shared_ptr<arm_compute::Tensor>, ACLArgs::COUNT_OF_ARGS>;

struct ACLTensorAttrs {
    bool hasLayoutTypeNHWC = false;
    size_t maxDimsShape = arm_compute::MAX_DIMS;
    std::array<bool, ACLArgs::COUNT_OF_ARGS> memoryUsageIndicator;
};

class ACLCommonExecutor : public Executor {
public:
    ACLCommonExecutor();
    virtual void updateTensorsShapes(ACLShapes& aclMemoryShapes) = 0;
    virtual arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) = 0;
    virtual ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) = 0;
    impl_desc_type implType() const override {
        return impl_desc_type::acl;
    }
    void execute(const MemoryArgs& memory) override;
    bool update(const MemoryArgs& memory) override;
    ~ACLCommonExecutor();

protected:
    ACLTensorAttrs aclTensorAttrs;

private:
    ACLTensors aclMemoryTensors;
    ACLFunction iFunction = nullptr;
};

using ACLCommonExecutorPtr = std::shared_ptr<ACLCommonExecutor>;

} // namespace intel_cpu
} // namespace ov
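To make the contract concrete, a minimal sketch of a derived executor wiring the three pure-virtual hooks to one ACL function; the class name and the NEGEMM choice are illustrative assumptions, not the actual FC executor added by this PR:

// Illustrative sketch only: a GEMM-style executor on top of ACLCommonExecutor.
class ACLGemmLikeExecutor : public ACLCommonExecutor {
public:
    void updateTensorsShapes(ACLShapes& aclMemoryShapes) override {
        // A real executor would reshape/transpose here if the kernel needs it;
        // this sketch keeps the shapes produced by ACLCommonExecutor::update().
    }

    arm_compute::Status validateTensorsInfo(const ACLInfos& aclMemoryInfos) override {
        // Bias is optional: aclMemoryInfos[ACL_BIAS] may hold nullptr.
        return arm_compute::NEGEMM::validate(aclMemoryInfos[ACL_SRC_0].get(),
                                             aclMemoryInfos[ACL_WEI].get(),
                                             aclMemoryInfos[ACL_BIAS].get(),
                                             aclMemoryInfos[ACL_DST].get(),
                                             1.0f, 1.0f);
    }

    ACLFunction configureFunction(const ACLTensors& aclMemoryTensors) override {
        auto gemm = std::make_unique<arm_compute::NEGEMM>();
        gemm->configure(aclMemoryTensors[ACL_SRC_0].get(),
                        aclMemoryTensors[ACL_WEI].get(),
                        aclMemoryTensors[ACL_BIAS].get(),
                        aclMemoryTensors[ACL_DST].get(),
                        1.0f, 1.0f);
        return gemm;
    }
};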
119 changes: 17 additions & 102 deletions src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.cpp
@@ -361,66 +361,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
            return acl_op;
        };
        break;
-    case Algorithm::EltwiseRelu:
-        if (aclEltwiseAttrs.alpha == 0) {
-            if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                             ActivationLayerInfo::ActivationFunction::RELU))
-                return false;
-        } else {
-            if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                             {ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha}))
-                return false;
-        }
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            if (aclEltwiseAttrs.alpha == 0) {
-                acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::RELU);
-            } else {
-                acl_op->configure(&srcTensors[0], &dstTensors[0],
-                                  {ActivationLayerInfo::ActivationFunction::LEAKY_RELU, aclEltwiseAttrs.alpha});
-            }
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseGeluErf:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::GELU))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::GELU);
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseElu:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], {ActivationLayerInfo::ActivationFunction::ELU, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseTanh:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSigmoid:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::LOGISTIC))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::LOGISTIC);
-            return acl_op;
-        };
-        break;
    case Algorithm::EltwiseAbs:
        if (!NEAbsLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
            return false;
@@ -430,24 +370,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
            return acl_op;
        };
        break;
-    case Algorithm::EltwiseSqrt:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SQRT))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SQRT);
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSoftRelu:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::SOFT_RELU);
-            return acl_op;
-        };
-        break;
    case Algorithm::EltwiseExp:
        if (!NEExpLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0]))
            return false;
@@ -457,28 +379,6 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
            return acl_op;
        };
        break;
-    case Algorithm::EltwiseClamp:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, aclEltwiseAttrs.beta, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
-    case Algorithm::EltwiseSwish:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0],
-                                         {ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha}))
-            return false;
-        exec_func = [this]() -> std::unique_ptr<IFunction> {
-            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0],
-                              {ActivationLayerInfo::ActivationFunction::SWISH, aclEltwiseAttrs.alpha});
-            return acl_op;
-        };
-        break;
    case Algorithm::EltwisePrelu:
        if (!NEPReluLayer::validate(&srcTensorsInfo[0], &srcTensorsInfo[1], &dstTensorsInfo[0]))
            return false;
@@ -488,12 +388,27 @@ bool AclEltwiseExecutor::init(const EltwiseAttrs &eltwiseAttrs, const std::vecto
            return acl_op;
        };
        break;
+    case Algorithm::EltwiseRelu:
+    case Algorithm::EltwiseGeluErf:
+    case Algorithm::EltwiseElu:
+    case Algorithm::EltwiseTanh:
+    case Algorithm::EltwiseSigmoid:
+    case Algorithm::EltwiseSqrt:
+    case Algorithm::EltwiseSoftRelu:
+    case Algorithm::EltwiseClamp:
+    case Algorithm::EltwiseSwish:
    case Algorithm::EltwiseHswish:
-        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH))
+        if (!NEActivationLayer::validate(&srcTensorsInfo[0], &dstTensorsInfo[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                                                                                        aclEltwiseAttrs.alpha,
+                                                                                                        aclEltwiseAttrs.beta,
+                                                                                                        aclEltwiseAttrs.gamma)))
            return false;
        exec_func = [this]() -> std::unique_ptr<IFunction> {
            auto acl_op = std::make_unique<NEActivationLayer>();
-            acl_op->configure(&srcTensors[0], &dstTensors[0], ActivationLayerInfo::ActivationFunction::HARD_SWISH);
+            acl_op->configure(&srcTensors[0], &dstTensors[0], getActivationLayerInfo(aclEltwiseAttrs.algorithm,
+                                                                                     aclEltwiseAttrs.alpha,
+                                                                                     aclEltwiseAttrs.beta,
+                                                                                     aclEltwiseAttrs.gamma));
            return acl_op;
        };
        break;
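The consolidated branch above delegates to a getActivationLayerInfo() helper added in acl_utils; its implementation is not shown in this diff, but based on the deleted per-algorithm cases it plausibly looks like the following sketch (hedged; the real helper's signature and parameter handling may differ):

// Plausible sketch of the helper, reconstructed from the deleted cases above.
inline arm_compute::ActivationLayerInfo getActivationLayerInfo(Algorithm algorithm,
                                                               float alpha,
                                                               float beta,
                                                               float gamma) {
    using AF = arm_compute::ActivationLayerInfo::ActivationFunction;
    (void)gamma;  // unused by the algorithms listed here
    switch (algorithm) {
        case Algorithm::EltwiseRelu:
            return alpha == 0 ? arm_compute::ActivationLayerInfo{AF::RELU}
                              : arm_compute::ActivationLayerInfo{AF::LEAKY_RELU, alpha};
        case Algorithm::EltwiseGeluErf:  return {AF::GELU};
        case Algorithm::EltwiseElu:      return {AF::ELU, alpha};
        case Algorithm::EltwiseTanh:     return {AF::TANH, 1.f, 1.f};
        case Algorithm::EltwiseSigmoid:  return {AF::LOGISTIC};
        case Algorithm::EltwiseSqrt:     return {AF::SQRT};
        case Algorithm::EltwiseSoftRelu: return {AF::SOFT_RELU};
        case Algorithm::EltwiseClamp:    return {AF::LU_BOUNDED_RELU, beta, alpha};
        case Algorithm::EltwiseSwish:    return {AF::SWISH, alpha};
        case Algorithm::EltwiseHswish:   return {AF::HARD_SWISH};
        default:                         return {};
    }
}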
2 changes: 1 addition & 1 deletion src/plugins/intel_cpu/src/nodes/executors/acl/acl_eltwise.hpp
@@ -16,7 +16,7 @@ class AclEltwiseExecutor : public EltwiseExecutor {
    explicit AclEltwiseExecutor(const ExecutorContext::CPtr context);
    static bool isEltwiseAlgorithmSupported(Algorithm algorithm);

-    bool init(const EltwiseAttrs& eltwiseAttrs,
+    bool init(const EltwiseAttrs& attrs,
              const std::vector<MemoryDescPtr>& srcDescs,
              const std::vector<MemoryDescPtr>& dstDescs,
              const std::vector<EltwisePostOp>& postOps) override;