Openvino ep ort 5.1 #309

Draft · wants to merge 21 commits into base: master
7 changes: 5 additions & 2 deletions cmake/CMakeLists.txt
@@ -1236,9 +1236,12 @@ if (onnxruntime_USE_OPENVINO)
elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.0")
set(OPENVINO_VERSION "2023.0")
add_definitions(-DOPENVINO_2023_0=1)
elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.1")
set(OPENVINO_VERSION "2023.1")
add_definitions(-DOPENVINO_2023_1=1)
elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "openvino")
set(OPENVINO_VERSION "2023.0")
add_definitions(-DOPENVINO_2023_0=1)
set(OPENVINO_VERSION "2023.1")
add_definitions(-DOPENVINO_2023_1=1)
else()
message(FATAL_ERROR "Unsupported OpenVINO version: ${INTEL_OPENVINO_DIR}")
endif()
6 changes: 4 additions & 2 deletions docs/python/ReadMeOV.rst
@@ -7,6 +7,7 @@ OpenVINO™ Execution Provider for ONNX Runtime accelerates inference across man
- Intel® CPUs
- Intel® integrated GPUs
- Intel® discrete GPUs
- Intel® integrated VPUs

Installation
------------
@@ -15,12 +16,13 @@ Requirements
^^^^^^^^^^^^

- Ubuntu 18.04, 20.04, RHEL(CPU only) or Windows 10 - 64 bit
- Python 3.8, 3.9 or 3.10 for Linux and only Python3.10 for Windows
- Python 3.8 or 3.9 or 3.10 for Linux and only Python3.10 for Windows

This package supports:
- Intel® CPUs
- Intel® integrated GPUs
- Intel® discrete GPUs
- Intel® integrated VPUs

``pip3 install onnxruntime-openvino``

@@ -34,7 +36,7 @@ For more details on build and installation please refer to `Build <https://onnxr
Usage
^^^^^

By default, Intel® CPU is used to run inference. However, you can change the default option to either Intel® integrated or discrete GPU.
By default, Intel® CPU is used to run inference. However, you can change the default option to either Intel® integrated or discrete GPU.
Invoke `the provider config device type argument <https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#summary-of-options>`_ to change the hardware on which inferencing is done.

For more API calls and environment variables, see `Usage <https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#configuration-options>`_.
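
As an illustration, here is a minimal sketch of selecting the device through the provider options from Python; the model file name and the ``GPU_FP16`` device string are placeholders, not values prescribed by this change:

.. code-block:: python

    import onnxruntime as ort

    # Placeholder model path; any ONNX model works here.
    session = ort.InferenceSession(
        "model.onnx",
        providers=[("OpenVINOExecutionProvider", {"device_type": "GPU_FP16"})],
    )
    # Inference now runs on the selected Intel® GPU instead of the default CPU.
    # outputs = session.run(None, {"input": input_tensor})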
33 changes: 14 additions & 19 deletions onnxruntime/core/providers/openvino/backend_manager.cc
@@ -7,9 +7,6 @@
#include <memory>

#include "core/providers/shared_library/provider_api.h"

#include <inference_engine.hpp>

#include "contexts.h"
#include "backend_manager.h"
#include "ibackend.h"
@@ -36,11 +33,11 @@ BackendManager::BackendManager(const onnxruntime::Node& fused_node,
const logging::Logger& logger) {
auto prec_str = GetGlobalContext().precision_str;
if (prec_str == "FP32") {
subgraph_context_.precision = InferenceEngine::Precision::FP32;
subgraph_context_.precision = "FP32";
} else if (prec_str == "FP16") {
subgraph_context_.precision = InferenceEngine::Precision::FP16;
subgraph_context_.precision = "FP16";
} else if (prec_str == "U8") {
subgraph_context_.precision = InferenceEngine::Precision::U8;
subgraph_context_.precision = "U8";
} else {
throw std::string("Invalid OpenVINO Precision type: " + prec_str);
}
@@ -78,19 +75,17 @@ BackendManager::BackendManager(const onnxruntime::Node& fused_node,
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has symbolic input dims";
if (GetGlobalContext().device_type.find("CPU") != std::string::npos ||
GetGlobalContext().device_type.find("GPU") != std::string::npos) {
if (GetGlobalContext().enable_dynamic_shapes) {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
<< "Creating backend Dynamic Shapes";
try {
concrete_backend_ = BackendFactory::MakeBackend(*model_proto_,
GetGlobalContext(),
subgraph_context_);
} catch (std::string const& msg) {
throw msg;
}
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
<< "Backend created for graph " << subgraph_context_.subgraph_name;
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Starting backend initialization. "
<< "Creating backend Dynamic Shapes";
try {
concrete_backend_ = BackendFactory::MakeBackend(*model_proto_,
GetGlobalContext(),
subgraph_context_);
} catch (std::string const& msg) {
throw msg;
}
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] "
<< "Backend created for graph " << subgraph_context_.subgraph_name;
}
} else {
LOGS_DEFAULT(INFO) << "[OpenVINO-EP] Model has concrete input dims. Initializing backend for graph " << subgraph_context_.subgraph_name;
@@ -257,7 +252,7 @@ void BackendManager::Compute(OrtKernelContext* context) {
}
#endif
bool use_dynamic_backend = true;
if (GetGlobalContext().enable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape &&
if (subgraph_context_.has_dynamic_input_shape &&
(GetGlobalContext().device_type.find("CPU") != std::string::npos ||
GetGlobalContext().device_type.find("GPU") != std::string::npos)) {
concrete_backend_->Infer(context);
64 changes: 23 additions & 41 deletions onnxruntime/core/providers/openvino/backend_utils.cc
@@ -8,8 +8,8 @@
#include <fstream>

#include "ov_interface.h"
#include <ngraph/pass/convert_fp32_to_fp16.hpp>
#include <ngraph/pass/constant_folding.hpp>
#include "openvino/pass/convert_fp32_to_fp16.hpp"
#include "openvino/pass/constant_folding.hpp"
#include "core/providers/shared_library/provider_api.h"
#include "backend_utils.h"

@@ -50,14 +50,14 @@ struct static_cast_int64 {
std::shared_ptr<OVNetwork>
CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context,
const SubGraphContext& subgraph_context,
std::map<std::string, std::shared_ptr<ngraph::Node>>& const_outputs_map) {
std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) {
if (IsCILogEnabled()) {
std::cout << "CreateNgraphFunc" << std::endl;
}
const std::string model = model_proto.SerializeAsString();
try {
auto cnn_network = global_context.ie_core.ReadModel(model);
if ((subgraph_context.precision == InferenceEngine::Precision::FP16) &&
if ((subgraph_context.precision == "FP16") &&
(global_context.device_type.find("VPUX") == std::string::npos)) {
// FP16 transformations
ov::pass::ConvertFP32ToFP16 pass_obj;
@@ -88,20 +88,19 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext
size_t index = results.size() - 1;

for (auto it = results.rbegin(); it != results.rend(); ++it) {
if (auto const_node = std::dynamic_pointer_cast<ngraph::op::Constant>((*it)->input_value(0).get_node_shared_ptr())) {
if (auto const_node = std::dynamic_pointer_cast<ov::op::v0::Constant>((*it)->input_value(0).get_node_shared_ptr())) {
const_outputs_map[(*it)->get_friendly_name()] = const_node;
results.erase(results.begin() + index);
}
--index;
}
}
#ifndef NDEBUG
#if defined(OPENVINO_2022_3) || (OPENVINO_2023_0)
#if defined(OPENVINO_2022_3) || (OPENVINO_2023_0) || (OPENVINO_2023_1)
if (IsDebugEnabled()) {
std::string name = cnn_network->get_friendly_name();
ov::pass::Serialize serializer(name + ".xml", name + ".bin");
serializer.run_on_model(cnn_network);
ngraph::plot_graph(cnn_network, name + "_executable" + ".dot");
}
#endif
#endif
@@ -111,31 +110,6 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext
}
}

InferenceEngine::Precision ConvertPrecisionONNXToOpenVINO(const ONNX_NAMESPACE::TypeProto& onnx_type) {
ONNX_NAMESPACE::DataType type_string = ONNX_NAMESPACE::Utils::DataTypeUtils::ToType(onnx_type);
if (*type_string == "float" || *type_string == "tensor(float)") {
return InferenceEngine::Precision::FP32;
} else if (*type_string == "float16" || *type_string == "tensor(float16)") {
return InferenceEngine::Precision::FP16;
} else if (*type_string == "int32" || *type_string == "tensor(int32)") {
return InferenceEngine::Precision::I32;
} else if (*type_string == "int16" || *type_string == "tensor(int16)") {
return InferenceEngine::Precision::I16;
} else if (*type_string == "int8" || *type_string == "tensor(int8)") {
return InferenceEngine::Precision::I8;
} else if (*type_string == "uint16" || *type_string == "tensor(uint16)") {
return InferenceEngine::Precision::U16;
} else if (*type_string == "uint8" || *type_string == "tensor(uint8)") {
return InferenceEngine::Precision::U8;
} else if (*type_string == "bool" || *type_string == "tensor(bool)") {
return InferenceEngine::Precision::U8;
} else if (*type_string == "int64" || *type_string == "tensor(int64)") {
return InferenceEngine::Precision::I32;
} else {
throw std::string(log_tag + "Unsupported Data type");
}
}

Ort::UnownedValue
GetOutputTensor(Ort::KernelContext& context, size_t batch_size,
OVInferRequestPtr infer_request,
@@ -166,7 +140,7 @@ Ort::UnownedValue
GetOutputTensor(Ort::KernelContext& context,
std::string output_name,
std::unordered_map<std::string, int> output_names,
std::shared_ptr<ngraph::Node> node) {
std::shared_ptr<ov::Node> node) {
// Find position of '/' in the output_name
int pos = output_name.find("/");
// Copy the substring from start to pos
@@ -210,25 +184,25 @@ int GetFirstAvailableDevice(GlobalContext& global_context) {
return i;
}

void FillOutputsWithConstantData(std::shared_ptr<ngraph::Node> node, Ort::UnownedValue& out_tensor) {
void FillOutputsWithConstantData(std::shared_ptr<ov::Node> node, Ort::UnownedValue& out_tensor) {
switch (node->get_element_type()) {
case ngraph::element::Type_t::f32: {
case ov::element::Type_t::f32: {
FillOutputHelper<float>(out_tensor, node);
break;
}
case ngraph::element::Type_t::boolean: {
case ov::element::Type_t::boolean: {
FillOutputHelper<char>(out_tensor, node);
break;
}
case ngraph::element::Type_t::i32: {
case ov::element::Type_t::i32: {
FillOutputHelper<int32_t>(out_tensor, node);
break;
}
case ngraph::element::Type_t::i64: {
case ov::element::Type_t::i64: {
FillOutputHelper<int64_t>(out_tensor, node);
break;
}
case ngraph::element::Type_t::f16: {
case ov::element::Type_t::f16: {
FillOutputHelper<float>(out_tensor, node);
break;
}
@@ -237,14 +211,22 @@ void FillOutputsWithConstantData(std::shared_ptr<ngraph::Node> node, Ort::Unowne
}
}

#if defined(_MSC_VER)
#pragma warning(disable : 4127)
#endif

template <typename T>
void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr<ngraph::Node> node) {
auto const_node = std::dynamic_pointer_cast<ngraph::op::Constant>(node);
void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr<ov::Node> node) {
auto const_node = std::dynamic_pointer_cast<ov::op::v0::Constant>(node);
auto res = const_node->cast_vector<T>();
T* tensor_data = out_tensor.GetTensorMutableData<T>();
std::copy(res.begin(), res.end(), tensor_data);
}

#if defined(_MSC_VER)
#pragma warning(default : 4127)
#endif

void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx,
std::string input_name, Ort::KernelContext& context,
const SubGraphContext& subgraph_context) {
11 changes: 4 additions & 7 deletions onnxruntime/core/providers/openvino/backend_utils.h
@@ -32,19 +32,16 @@ bool IsCILogEnabled();

int GetFirstAvailableDevice(GlobalContext& global_context);

void FillOutputsWithConstantData(std::shared_ptr<ngraph::Node> node, Ort::UnownedValue& out_tensor);
void FillOutputsWithConstantData(std::shared_ptr<ov::Node> node, Ort::UnownedValue& out_tensor);

template <typename T>
void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr<ngraph::Node> node);
void FillOutputHelper(Ort::UnownedValue& out_tensor, std::shared_ptr<ov::Node> node);

Ort::UnownedValue
GetOutputTensor(Ort::KernelContext& context,
std::string output_name,
std::unordered_map<std::string, int> output_names,
std::shared_ptr<ngraph::Node> node);

InferenceEngine::Precision
ConvertPrecisionONNXToOpenVINO(const ONNX_NAMESPACE::TypeProto& onnx_type);
std::shared_ptr<ov::Node> node);

Ort::UnownedValue
GetOutputTensor(Ort::KernelContext& context, size_t batch_size,
@@ -61,7 +58,7 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor,

std::shared_ptr<OVNetwork>
CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, const SubGraphContext& subgraph_context,
std::map<std::string, std::shared_ptr<ngraph::Node>>& const_outputs_map);
std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map);

void printPerformanceCounts(const std::vector<OVProfilingInfo>& performanceMap,
std::ostream& stream, std::string deviceName);
37 changes: 25 additions & 12 deletions onnxruntime/core/providers/openvino/backends/basic_backend.cc
@@ -9,7 +9,7 @@

#include "core/providers/shared_library/provider_api.h"
#include "../backend_utils.h"
#include <ngraph/pass/constant_folding.hpp>
// #include <ngraph/pass/constant_folding.hpp>
#include "basic_backend.h"
#include "../backend_manager.h"

@@ -37,6 +37,9 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto,
// Setting OpenCL queue throttling for GPU
EnableGPUThrottling(device_config);

// Enable streams; default=1 unless overridden by user config
EnableStreams();

#ifndef NDEBUG
if (IsDebugEnabled()) {
std::string file_name = subgraph_context.subgraph_name + "_static.onnx";
@@ -45,6 +48,7 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto,
}
#endif
try {
std::string dev_prec = global_context.device_type + "_" + global_context_.precision_str;
if (global_context.is_wholly_supported_graph) {
#if defined(IO_BUFFER_ENABLED)
if ((global_context.device_type.find("GPU") != std::string::npos) &&
@@ -61,8 +65,8 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto,
LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
}
#else
#if defined(OPENVINO_2023_0)
if (subgraph_context.precision != InferenceEngine::Precision::FP16 && global_context_.enable_dynamic_shapes == false) {
#if defined(OPENVINO_2023_0) || (OPENVINO_2023_1)
if (!subgraph_context_.has_dynamic_input_shape && dev_prec != "CPU_FP16") {
const std::string model = model_proto.SerializeAsString();
exe_network_ = global_context_.ie_core.LoadNetwork(model, hw_target, device_config, subgraph_context_.subgraph_name);
LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin";
@@ -98,7 +102,7 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto,
inferRequestsQueue_ = std::unique_ptr<InferRequestsQueue>(new InferRequestsQueue(exe_network_, nireq));
}

bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ngraph::Node>>& const_outputs_map) {
bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ov::Node>>& const_outputs_map) {
if (const_outputs_map.size() == subgraph_context_.output_names.size())
subgraph_context_.is_constant = true;
if (subgraph_context_.is_constant) {
@@ -109,20 +113,23 @@ bool BasicBackend::ValidateSubgraph(std::map<std::string, std::shared_ptr<ngraph
}

void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) {
// Set inference precision if device_type != AUTO
// if (global_context_.device_type.find("GPU_FP16")!= std::string::npos){
// device_config.emplace(ov::hint::inference_precision(global_context_.precision_str));
// }
device_config = {};
// Set inference precision based on device precision for OV backend
if (global_context_.precision_str.find("FP16") != std::string::npos && global_context_.device_type == "GPU") {
device_config.emplace(ov::hint::inference_precision("f16"));
}
if (global_context_.precision_str.find("FP32") != std::string::npos) {
device_config.emplace(ov::hint::inference_precision("f32"));
}
#ifndef NDEBUG
if (openvino_ep::backend_utils::IsDebugEnabled()) {
device_config.emplace(ov::enable_profiling(true));
}
#endif
#if defined(OPENVINO_2023_0)
#if defined(OPENVINO_2023_0) || (OPENVINO_2023_1)
if (global_context_.device_type.find("VPUX") != std::string::npos) {
std::pair<std::string, ov::Any> device_property;
device_property = std::make_pair("VPUX_COMPILER_TYPE", "MLIR");
device_property = std::make_pair("VPU_COMPILER_TYPE", "MLIR");
device_config.emplace(ov::device::properties("VPUX", device_property));
}
#endif
@@ -147,10 +154,17 @@ void BasicBackend::EnableCaching() {
void BasicBackend::EnableGPUThrottling(ov::AnyMap& device_config) {
if (global_context_.enable_opencl_throttling == true && global_context_.device_type.find("GPU") != std::string::npos) {
LOGS_DEFAULT(INFO) << log_tag << "Enabled OpenCL queue throttling for GPU device";
device_config[GPU_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
std::pair<std::string, ov::Any> device_property;
device_property = std::make_pair("PLUGIN_THROTTLE", "1");
device_config.emplace(ov::device::properties("GPU_CONFIG_KEY", device_property));
// device_config[GPU_CONFIG_KEY(PLUGIN_THROTTLE)] = "1";
}
}

void BasicBackend::EnableStreams() {
global_context_.ie_core.SetStreams(global_context_.device_type, global_context_.num_streams);
}
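
As an aside, the stream count forwarded by EnableStreams() above comes from the user-facing provider options. A rough sketch of how a caller might set it from Python, assuming the option key exposed for global_context_.num_streams is "num_streams":

    import onnxruntime as ort

    # "num_streams" is assumed to be the provider option behind global_context_.num_streams.
    session = ort.InferenceSession(
        "model.onnx",
        providers=[("OpenVINOExecutionProvider",
                    {"device_type": "CPU_FP32", "num_streams": "4"})],
    )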

// Starts an asynchronous inference request for data in slice indexed by batch_slice_idx on
// an Infer Request indexed by infer_req_idx
void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferRequestPtr infer_request) {
@@ -177,7 +191,6 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque
}
size_t batch_slice_idx = 0;
if (subgraph_context_.has_dynamic_input_shape &&
global_context_.enable_dynamic_shapes == true &&
(global_context_.device_type.find("CPU") != std::string::npos ||
global_context_.device_type.find("GPU") != std::string::npos)) {
auto tensor = context.GetInput(subgraph_context_.input_names.at(input_name));