diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc
index b13b0cd27496..72f39245d3cf 100644
--- a/onnxruntime/core/framework/session_state_utils.cc
+++ b/onnxruntime/core/framework/session_state_utils.cc
@@ -113,28 +113,14 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
   TensorShape tensor_shape = utils::GetTensorShapeFromTensorProto(tensor_proto);
   const DataTypeImpl* const type = DataTypeImpl::TensorTypeFromONNXEnum(tensor_proto.data_type())->GetElementType();
   std::unique_ptr<Tensor> p_tensor;
-  if (m != nullptr) {
-    p_tensor = std::make_unique<Tensor>(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo());
-    if (m->GetLen() < p_tensor->SizeInBytes()) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error. The preallocated buffer is too small. Requires ",
-                             p_tensor->SizeInBytes(), ", Got ", m->GetLen());
-    }
-  } else {
-    if (use_device_allocator_for_initializers) {
-      void* tensor_buffer = nullptr;
-      ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, alloc, tensor_buffer));
-      p_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, alloc);
-    } else {
-      // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
-      // (may expand it if there isn't a chunk that can be allotted to the memory request).
-      // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
-      p_tensor = std::make_unique<Tensor>(type, tensor_shape, alloc);
-    }
-  }
-  if (p_tensor->Location().device.Type() == OrtDevice::CPU) {
-    // deserialize directly to CPU tensor
-    if (utils::HasExternalData(tensor_proto)) {
+  auto device_type = (alloc != nullptr) ? alloc->Info().device.Type() : m->GetAllocInfo().device.Type();
+
+  if (utils::HasExternalData(tensor_proto)) {
+    if (device_type == OrtDevice::CPU) {
+      // for external initializer on CPU we will use mmap for large initializers so don't need to allocate memory in advance
+      p_tensor = std::make_unique<Tensor>(type, TensorShape(), alloc);
+
       // NB: The file containing external data for the tensor is mmap'd. If the tensor will be used on CPU we can
       // utilize the mmap'd buffer directly by calling ExtDataTensorProtoToTensor. If we called
       // TensorProtoToTensor it would copy the data, causing unnecessary overhead
@@ -143,57 +129,132 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
                                                      ext_data_deleter, buffered_tensor));
       ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()};
-
       MLDataType ml_tensor_type = DataTypeImpl::GetType<Tensor>();
       ort_value.Init(p_tensor.release(), ml_tensor_type, deleter);
       return common::Status::OK();
-    }
-    ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_tensor));
-  } else {  // non-cpu tensor
-    if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
-    }
+    } else {  // non-cpu tensor
+      if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
+      }
-    // deserialize to CPU first for non-CPU allocator, then copy
-    std::unique_ptr<Tensor> p_deserialize_tensor;
-    if (use_device_allocator_for_initializers) {
-      void* tensor_buffer = nullptr;
-      ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, default_cpu_alloc, tensor_buffer));
-      p_deserialize_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, default_cpu_alloc);
-    } else {
-      // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
-      // (may expand it if there isn't a chunk that can be allotted to the memory request).
-      // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
-      p_deserialize_tensor = std::make_unique<Tensor>(type, tensor_shape, default_cpu_alloc);
-    }
+      // deserialize to CPU first for non-CPU allocator, then copy to device
+      // for external initializer load on non-CPU device:
+      // 1. allocate memory on device - p_tensor
+      // 2. load initializer into CPU memory - p_deserialize_tensor,
+      //    we will use mmap so no need to allocate memory on CPU in advance
+      // 3. copy tensor from CPU to device - p_deserialize_tensor -> p_tensor
+      auto allocate_on_device_status = AllocateTensor(m, p_tensor, type, tensor_shape, use_device_allocator_for_initializers, alloc);
+      if (!allocate_on_device_status.IsOK()) {
+        return allocate_on_device_status;
+      }
+
+      std::unique_ptr<Tensor> p_deserialize_tensor = std::make_unique<Tensor>(type, TensorShape(), default_cpu_alloc);
-    OrtCallback ext_data_deleter;
-    std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
-    if (utils::HasExternalData(tensor_proto)) {
+      OrtCallback ext_data_deleter;
+      std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
       ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor,
                                                      ext_data_deleter, buffered_tensor));
       scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter);
-    } else {
-      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_deserialize_tensor));
+      // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
+
+      return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
     }
-    // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
-
-    Status copy_status = data_transfer_mgr.CopyTensor(*p_deserialize_tensor, *p_tensor);
-    if (!copy_status.IsOK()) {
-      if (copy_status.ErrorMessage().empty()) {
-        // The windows execution provider does not return any error message today for CopyTensor since it is
-        // not implemented yet. That's the reason we're adding our own error message so that we can debug better.
-        return Status(copy_status.Category(), copy_status.Code(),
-                      "Failed to copy tensor to " + p_tensor->Location().ToString());
+  } else {
+    // for internal initializer, always allocate memory on device - p_tensor
+    auto allocate_on_device_status = AllocateTensor(m, p_tensor, type, tensor_shape, use_device_allocator_for_initializers, alloc);
+    if (!allocate_on_device_status.IsOK()) {
+      return allocate_on_device_status;
+    }
+
+    if (device_type == OrtDevice::CPU) {
+      // deserialize directly to CPU tensor
+      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_tensor));
+      auto ml_tensor = DataTypeImpl::GetType<Tensor>();
+      ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
+      return common::Status::OK();
+    } else {  // non-cpu tensor
+      if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
+      }
+
+      // deserialize to CPU first for non-CPU allocator, then copy
+      // for internal initializer
+      // 1. allocate memory on CPU - p_deserialize_tensor
+      // 2. deserialize tensor_proto into a preallocated tensor (p_deserialize_tensor)
+      // 3. copy tensor from CPU to device - p_deserialize_tensor -> p_tensor
+      std::unique_ptr<Tensor> p_deserialize_tensor;
+      auto allocate_on_cpu_status = AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, default_cpu_alloc, p_deserialize_tensor);
+      if (!allocate_on_cpu_status.IsOK()) {
+        return allocate_on_cpu_status;
       }
-      return copy_status;
+
+      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_deserialize_tensor));
+      // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
+
+      return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
+    }
+  }
+}
+
+common::Status AllocateTensor(
+    const onnxruntime::MemBuffer* m,
+    std::unique_ptr<Tensor>& p_tensor,
+    const onnxruntime::DataTypeImpl* const& type,
+    onnxruntime::TensorShape& tensor_shape,
+    bool use_device_allocator_for_initializers,
+    const onnxruntime::AllocatorPtr& alloc) {
+  if (m != nullptr) {
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo());
+    if (m->GetLen() < p_tensor->SizeInBytes()) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error. The preallocated buffer is too small. Requires ",
+                             p_tensor->SizeInBytes(), ", Got ", m->GetLen());
     }
+  } else {
+    return AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, alloc, p_tensor);
   }
-  auto ml_tensor = DataTypeImpl::GetType<Tensor>();
-  ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
   return common::Status::OK();
 }
 
+common::Status AllocateTensorOnDeviceOrMemory(
+    bool use_device_allocator_for_initializers,
+    onnxruntime::TensorShape& tensor_shape,
+    const onnxruntime::DataTypeImpl* const& type,
+    const onnxruntime::AllocatorPtr& alloc,
+    std::unique_ptr<Tensor>& p_tensor) {
+  if (use_device_allocator_for_initializers) {
+    void* tensor_buffer = nullptr;
+    ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, alloc, tensor_buffer));
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, alloc);
+  } else {
+    // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
+    // (may expand it if there isn't a chunk that can be allotted to the memory request).
+    // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, alloc);
+  }
+  return common::Status::OK();
+}
+
+common::Status CopyTensorFromCPUToDevice(
+    const onnxruntime::DataTransferManager& data_transfer_mgr,
+    std::unique_ptr<Tensor>& p_deserialize_tensor,
+    std::unique_ptr<Tensor>& p_tensor,
+    OrtValue& ort_value) {
+  Status copy_status = data_transfer_mgr.CopyTensor(*p_deserialize_tensor, *p_tensor);
+  if (!copy_status.IsOK()) {
+    if (copy_status.ErrorMessage().empty()) {
+      // The windows execution provider does not return any error message today for CopyTensor since it is
+      // not implemented yet. That's the reason we're adding our own error message so that we can debug better.
+      return Status(copy_status.Category(), copy_status.Code(),
+                    "Failed to copy tensor to " + p_tensor->Location().ToString());
+    }
+    return copy_status;
+  } else {
+    auto ml_tensor = DataTypeImpl::GetType<Tensor>();
+    ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
+    return common::Status::OK();
+  }
+}
+
 common::Status SaveInitializedTensors(
     const Env& env, const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
     const GraphViewer& graph, const AllocatorPtr& default_cpu_alloc,
diff --git a/onnxruntime/core/framework/session_state_utils.h b/onnxruntime/core/framework/session_state_utils.h
index 499222b6ec61..89f4f2c34006 100644
--- a/onnxruntime/core/framework/session_state_utils.h
+++ b/onnxruntime/core/framework/session_state_utils.h
@@ -50,6 +50,27 @@ common::Status SaveInitializedTensors(
     const MemoryProfileFunction& memory_profile_func,
     std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors);
 
+common::Status AllocateTensor(
+    const onnxruntime::MemBuffer* m,
+    std::unique_ptr<Tensor>& p_tensor,
+    const onnxruntime::DataTypeImpl* const& type,
+    onnxruntime::TensorShape& tensor_shape,
+    bool use_device_allocator_for_initializers,
+    const onnxruntime::AllocatorPtr& alloc);
+
+common::Status AllocateTensorOnDeviceOrMemory(
+    bool use_device_allocator_for_initializers,
+    onnxruntime::TensorShape& tensor_shape,
+    const onnxruntime::DataTypeImpl* const& type,
+    const onnxruntime::AllocatorPtr& alloc,
+    std::unique_ptr<Tensor>& p_tensor);
+
+common::Status CopyTensorFromCPUToDevice(
+    const onnxruntime::DataTransferManager& data_transfer_mgr,
+    std::unique_ptr<Tensor>& p_deserialize_tensor,
+    std::unique_ptr<Tensor>& p_tensor,
+    OrtValue& ort_value);
+
 common::Status SaveInputOutputNamesToNodeMapping(const GraphViewer& graph,
                                                  SessionState& session_state,
                                                  gsl::span<const NodeArg* const> implicit_inputs);
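Note: the rewritten DeserializeTensorProto now dispatches on two independent conditions, utils::HasExternalData(tensor_proto) and the target device type, which gives four paths: external + CPU (mmap the external file and wrap the buffer directly), external + non-CPU (AllocateTensor on the device, mmap into a CPU-side tensor, then CopyTensorFromCPUToDevice), internal + CPU (AllocateTensor, then TensorProtoToTensor straight into it), and internal + non-CPU (AllocateTensor on the device, AllocateTensorOnDeviceOrMemory on CPU, deserialize, then copy). The sketch below only mirrors that dispatch order for reference; the enum and function names in it are hypothetical stand-ins, not ONNX Runtime APIs.

// Standalone illustration of the four deserialization paths introduced above.
// DeviceType, DeserializePath and SelectPath are made-up names for this sketch;
// the real code branches on OrtDevice::CPU and utils::HasExternalData(tensor_proto).
#include <iostream>

enum class DeviceType { CPU, NonCPU };

enum class DeserializePath {
  MmapDirect,          // external + CPU: mmap'd buffer is used as-is, no copy
  MmapThenCopy,        // external + non-CPU: mmap into a CPU tensor, then copy to device
  DeserializeInPlace,  // internal + CPU: deserialize straight into the allocated tensor
  DeserializeThenCopy  // internal + non-CPU: deserialize into a CPU tensor, then copy to device
};

DeserializePath SelectPath(bool has_external_data, DeviceType device) {
  if (has_external_data) {
    return device == DeviceType::CPU ? DeserializePath::MmapDirect
                                     : DeserializePath::MmapThenCopy;
  }
  return device == DeviceType::CPU ? DeserializePath::DeserializeInPlace
                                   : DeserializePath::DeserializeThenCopy;
}

int main() {
  // An external initializer destined for a non-CPU device takes the mmap-then-copy path.
  std::cout << (SelectPath(true, DeviceType::NonCPU) == DeserializePath::MmapThenCopy) << '\n';  // prints 1
  return 0;
}

The helper split (AllocateTensor / AllocateTensorOnDeviceOrMemory / CopyTensorFromCPUToDevice) keeps each of these four branches short and lets the two non-CPU paths share the device allocation and copy-with-error-handling logic.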