diff --git a/onnxruntime/core/framework/session_state_utils.cc b/onnxruntime/core/framework/session_state_utils.cc
index b13b0cd27496..72f39245d3cf 100644
--- a/onnxruntime/core/framework/session_state_utils.cc
+++ b/onnxruntime/core/framework/session_state_utils.cc
@@ -113,28 +113,14 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
   TensorShape tensor_shape = utils::GetTensorShapeFromTensorProto(tensor_proto);
   const DataTypeImpl* const type = DataTypeImpl::TensorTypeFromONNXEnum(tensor_proto.data_type())->GetElementType();
   std::unique_ptr<Tensor> p_tensor;
-  if (m != nullptr) {
-    p_tensor = std::make_unique<Tensor>(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo());
-    if (m->GetLen() < p_tensor->SizeInBytes()) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error. The preallocated buffer is too small. Requires ",
-                             p_tensor->SizeInBytes(), ", Got ", m->GetLen());
-    }
-  } else {
-    if (use_device_allocator_for_initializers) {
-      void* tensor_buffer = nullptr;
-      ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, alloc, tensor_buffer));
-      p_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, alloc);
-    } else {
-      // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
-      // (may expand it if there isn't a chunk that can be allotted to the memory request).
-      // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
-      p_tensor = std::make_unique<Tensor>(type, tensor_shape, alloc);
-    }
-  }
-  if (p_tensor->Location().device.Type() == OrtDevice::CPU) {
-    // deserialize directly to CPU tensor
-    if (utils::HasExternalData(tensor_proto)) {
+  auto device_type = (alloc != nullptr) ? alloc->Info().device.Type() : m->GetAllocInfo().device.Type();
+
+  if (utils::HasExternalData(tensor_proto)) {
+    if (device_type == OrtDevice::CPU) {
+      // for external initializer on CPU we will use mmap for large initializers so don't need to allocate memory in advance
+      p_tensor = std::make_unique<Tensor>(type, TensorShape(), alloc);
+
       // NB: The file containing external data for the tensor is mmap'd. If the tensor will be used on CPU we can
       // utilize the mmap'd buffer directly by calling ExtDataTensorProtoToTensor. If we called
       // TensorProtoToTensor it would copy the data, causing unnecessary overhead
@@ -143,57 +129,132 @@ static common::Status DeserializeTensorProto(const Env& env, const std::basic_st
                                                      ext_data_deleter, buffered_tensor));
       ExtDataValueDeleter deleter{ext_data_deleter, p_tensor.get()};
-
       MLDataType ml_tensor_type = DataTypeImpl::GetType<Tensor>();
       ort_value.Init(p_tensor.release(), ml_tensor_type, deleter);
       return common::Status::OK();
-    }
-    ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_tensor));
-  } else {  // non-cpu tensor
-    if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
-      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
-    }
+    } else {  // non-cpu tensor
+      if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
+      }
-    // deserialize to CPU first for non-CPU allocator, then copy
-    std::unique_ptr<Tensor> p_deserialize_tensor;
-    if (use_device_allocator_for_initializers) {
-      void* tensor_buffer = nullptr;
-      ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, default_cpu_alloc, tensor_buffer));
-      p_deserialize_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, default_cpu_alloc);
-    } else {
-      // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
-      // (may expand it if there isn't a chunk that can be allotted to the memory request).
-      // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
-      p_deserialize_tensor = std::make_unique<Tensor>(type, tensor_shape, default_cpu_alloc);
-    }
+      // deserialize to CPU first for non-CPU allocator, then copy to device
+      // for external initializer load on non-CPU device:
+      // 1. allocate memory on device - p_tensor
+      // 2. load initializer into CPU memory - p_deserialize_tensor,
+      //    we will use mmap so no need to allocate memory on CPU in advance
+      // 3. copy tensor from CPU to device - p_deserialize_tensor -> p_tensor
+      auto allocate_on_device_status = AllocateTensor(m, p_tensor, type, tensor_shape, use_device_allocator_for_initializers, alloc);
+      if (!allocate_on_device_status.IsOK()) {
+        return allocate_on_device_status;
+      }
+
+      std::unique_ptr<Tensor> p_deserialize_tensor = std::make_unique<Tensor>(type, TensorShape(), default_cpu_alloc);
-    OrtCallback ext_data_deleter;
-    std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
-    if (utils::HasExternalData(tensor_proto)) {
+      OrtCallback ext_data_deleter;
+      std::optional<ScopedOrtCallbackInvoker> scoped_ort_callback_invoker;
       ORT_RETURN_IF_ERROR(ExtDataTensorProtoToTensor(env, proto_path, tensor_proto, *p_deserialize_tensor,
                                                      ext_data_deleter, buffered_tensor));
       scoped_ort_callback_invoker = ScopedOrtCallbackInvoker(ext_data_deleter);
-    } else {
-      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_deserialize_tensor));
+      // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
+
+      return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
     }
-    // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
-
-    Status copy_status = data_transfer_mgr.CopyTensor(*p_deserialize_tensor, *p_tensor);
-    if (!copy_status.IsOK()) {
-      if (copy_status.ErrorMessage().empty()) {
-        // The windows execution provider does not return any error message today for CopyTensor since it is
-        // not implemented yet. That's the reason we're adding our own error message so that we can debug better.
-        return Status(copy_status.Category(), copy_status.Code(),
-                      "Failed to copy tensor to " + p_tensor->Location().ToString());
+  } else {
+    // for internal initializer, always allocate memory on device - p_tensor
+    auto allocate_on_device_status = AllocateTensor(m, p_tensor, type, tensor_shape, use_device_allocator_for_initializers, alloc);
+    if (!allocate_on_device_status.IsOK()) {
+      return allocate_on_device_status;
+    }
+
+    if (device_type == OrtDevice::CPU) {
+      // deserialize directly to CPU tensor
+      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_tensor));
+      auto ml_tensor = DataTypeImpl::GetType<Tensor>();
+      ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
+      return common::Status::OK();
+    } else {  // non-cpu tensor
+      if (tensor_proto.data_type() == ONNX_NAMESPACE::TensorProto_DataType_STRING) {
+        return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "string tensor is not supported for copying between allocators");
+      }
+
+      // deserialize to CPU first for non-CPU allocator, then copy
+      // for internal initializer
+      // 1. allocate memory on CPU - p_deserialize_tensor
+      // 2. deserialize tensor_proto into a preallocated tensor (p_deserialize_tensor)
+      // 3. copy tensor from CPU to device - p_deserialize_tensor -> p_tensor
+      std::unique_ptr<Tensor> p_deserialize_tensor;
+      auto allocate_on_cpu_status = AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, default_cpu_alloc, p_deserialize_tensor);
+      if (!allocate_on_cpu_status.IsOK()) {
+        return allocate_on_cpu_status;
       }
-      return copy_status;
+
+      ORT_RETURN_IF_ERROR(utils::TensorProtoToTensor(env, proto_path.c_str(), tensor_proto, *p_deserialize_tensor));
+      // TODO!! Need a temp buffer allocator for non-escape buffers that maybe too big for stack allocation.
+
+      return CopyTensorFromCPUToDevice(data_transfer_mgr, p_deserialize_tensor, p_tensor, ort_value);
+    }
+  }
+}
+
+common::Status AllocateTensor(
+    const onnxruntime::MemBuffer* m,
+    std::unique_ptr<Tensor>& p_tensor,
+    const onnxruntime::DataTypeImpl* const& type,
+    onnxruntime::TensorShape& tensor_shape,
+    bool use_device_allocator_for_initializers,
+    const onnxruntime::AllocatorPtr& alloc) {
+  if (m != nullptr) {
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, m->GetBuffer(), m->GetAllocInfo());
+    if (m->GetLen() < p_tensor->SizeInBytes()) {
+      return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Internal error. The preallocated buffer is too small. Requires ",
+                             p_tensor->SizeInBytes(), ", Got ", m->GetLen());
     }
+  } else {
+    return AllocateTensorOnDeviceOrMemory(use_device_allocator_for_initializers, tensor_shape, type, alloc, p_tensor);
   }
-  auto ml_tensor = DataTypeImpl::GetType<Tensor>();
-  ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
   return common::Status::OK();
 }
 
+common::Status AllocateTensorOnDeviceOrMemory(
+    bool use_device_allocator_for_initializers,
+    onnxruntime::TensorShape& tensor_shape,
+    const onnxruntime::DataTypeImpl* const& type,
+    const onnxruntime::AllocatorPtr& alloc,
+    std::unique_ptr<Tensor>& p_tensor) {
+  if (use_device_allocator_for_initializers) {
+    void* tensor_buffer = nullptr;
+    ORT_RETURN_IF_ERROR(AllocateBufferUsingDeviceAllocatorFromShapeAndType(tensor_shape, type, alloc, tensor_buffer));
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, tensor_buffer, alloc);
+  } else {
+    // If the provided allocator is an arena-based allocator, the call to Alloc() will tap into memory from the arena
+    // (may expand it if there isn't a chunk that can be allotted to the memory request).
+    // If the provided allocator is non-arena based, the device specific Alloc() call will be used to allocate the necessary memory.
+    p_tensor = std::make_unique<Tensor>(type, tensor_shape, alloc);
+  }
+  return common::Status::OK();
+}
+
+common::Status CopyTensorFromCPUToDevice(
+    const onnxruntime::DataTransferManager& data_transfer_mgr,
+    std::unique_ptr<Tensor>& p_deserialize_tensor,
+    std::unique_ptr<Tensor>& p_tensor,
+    OrtValue& ort_value) {
+  Status copy_status = data_transfer_mgr.CopyTensor(*p_deserialize_tensor, *p_tensor);
+  if (!copy_status.IsOK()) {
+    if (copy_status.ErrorMessage().empty()) {
+      // The windows execution provider does not return any error message today for CopyTensor since it is
+      // not implemented yet. That's the reason we're adding our own error message so that we can debug better.
+      return Status(copy_status.Category(), copy_status.Code(),
+                    "Failed to copy tensor to " + p_tensor->Location().ToString());
+    }
+    return copy_status;
+  } else {
+    auto ml_tensor = DataTypeImpl::GetType<Tensor>();
+    ort_value.Init(p_tensor.release(), ml_tensor, ml_tensor->GetDeleteFunc());
+    return common::Status::OK();
+  }
+}
+
 common::Status SaveInitializedTensors(
     const Env& env, const std::basic_string<PATH_CHAR_TYPE>& graph_loc,
     const GraphViewer& graph, const AllocatorPtr& default_cpu_alloc,
diff --git a/onnxruntime/core/framework/session_state_utils.h b/onnxruntime/core/framework/session_state_utils.h
index 499222b6ec61..89f4f2c34006 100644
--- a/onnxruntime/core/framework/session_state_utils.h
+++ b/onnxruntime/core/framework/session_state_utils.h
@@ -50,6 +50,27 @@ common::Status SaveInitializedTensors(
     const MemoryProfileFunction& memory_profile_func,
     std::unordered_map<std::string, std::unique_ptr<Tensor>>& buffered_tensors);
 
+common::Status AllocateTensor(
+    const onnxruntime::MemBuffer* m,
+    std::unique_ptr<Tensor>& p_tensor,
+    const onnxruntime::DataTypeImpl* const& type,
+    onnxruntime::TensorShape& tensor_shape,
+    bool use_device_allocator_for_initializers,
+    const onnxruntime::AllocatorPtr& alloc);
+
+common::Status AllocateTensorOnDeviceOrMemory(
+    bool use_device_allocator_for_initializers,
+    onnxruntime::TensorShape& tensor_shape,
+    const onnxruntime::DataTypeImpl* const& type,
+    const onnxruntime::AllocatorPtr& alloc,
+    std::unique_ptr<Tensor>& p_tensor);
+
+common::Status CopyTensorFromCPUToDevice(
+    const onnxruntime::DataTransferManager& data_transfer_mgr,
+    std::unique_ptr<Tensor>& p_deserialize_tensor,
+    std::unique_ptr<Tensor>& p_tensor,
+    OrtValue& ort_value);
+
 common::Status SaveInputOutputNamesToNodeMapping(const GraphViewer& graph,
                                                  SessionState& session_state,
                                                  gsl::span<const NodeArg* const> implicit_inputs);
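Note: the rewritten DeserializeTensorProto now dispatches on two independent conditions, utils::HasExternalData(tensor_proto) and the target device type, which gives four paths: external + CPU (mmap the external file and wrap the buffer directly), external + non-CPU (AllocateTensor on the device, mmap into a CPU-side tensor, then CopyTensorFromCPUToDevice), internal + CPU (AllocateTensor, then TensorProtoToTensor straight into it), and internal + non-CPU (AllocateTensor on the device, AllocateTensorOnDeviceOrMemory on CPU, deserialize, then copy). The sketch below only mirrors that dispatch order for reference; the enum and function names in it are hypothetical stand-ins, not ONNX Runtime APIs.

// Standalone illustration of the four deserialization paths introduced above.
// DeviceType, DeserializePath and SelectPath are made-up names for this sketch;
// the real code branches on OrtDevice::CPU and utils::HasExternalData(tensor_proto).
#include <iostream>

enum class DeviceType { CPU, NonCPU };

enum class DeserializePath {
  MmapDirect,          // external + CPU: mmap'd buffer is used as-is, no copy
  MmapThenCopy,        // external + non-CPU: mmap into a CPU tensor, then copy to device
  DeserializeInPlace,  // internal + CPU: deserialize straight into the allocated tensor
  DeserializeThenCopy  // internal + non-CPU: deserialize into a CPU tensor, then copy to device
};

DeserializePath SelectPath(bool has_external_data, DeviceType device) {
  if (has_external_data) {
    return device == DeviceType::CPU ? DeserializePath::MmapDirect
                                     : DeserializePath::MmapThenCopy;
  }
  return device == DeviceType::CPU ? DeserializePath::DeserializeInPlace
                                   : DeserializePath::DeserializeThenCopy;
}

int main() {
  // An external initializer destined for a non-CPU device takes the mmap-then-copy path.
  std::cout << (SelectPath(true, DeviceType::NonCPU) == DeserializePath::MmapThenCopy) << '\n';  // prints 1
  return 0;
}

The helper split (AllocateTensor / AllocateTensorOnDeviceOrMemory / CopyTensorFromCPUToDevice) keeps each of these four branches short and lets the two non-CPU paths share the device allocation and copy-with-error-handling logic.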