[GPU] Weightless caching #25731

Open. Wants to merge 25 commits into base: master.

Changes from 22 commits.

Commits (25):
9167c82  [WIP] Add bin offset to cldnn::data and propagate it when constructin… (tkrupa-intel, Jul 1, 2024)
210d609  [CONTINUED] Add bin offset to cldnn::data and propagate it when const… (tkrupa-intel, Jul 2, 2024)
d812f3e  Propagate weights path to CompiledModel (tkrupa-intel, Jul 3, 2024)
d046d40  Fix casting (tkrupa-intel, Jul 3, 2024)
2633df7  Propagate weights path all the way to export function and load it fro… (tkrupa-intel, Jul 3, 2024)
669cff1  Add actual weightless caching (tkrupa-intel, Jul 4, 2024)
2c5cf4e  More weights loading ideas (tkrupa-intel, Jul 5, 2024)
f440407  Simplify and save both versions to aux files (tkrupa-intel, Jul 10, 2024)
c1f2867  Comment out dumping to aux files (tkrupa-intel, Jul 10, 2024)
fd49b1e  Prevent weightless caching when constant size is changed during trans… (tkrupa-intel, Aug 13, 2024)
35bd53d  Propagate weights path (tkrupa-intel, Aug 13, 2024)
61c1496  Fix wstring default values (tkrupa-intel, Aug 30, 2024)
ac3a068  Propagate weights_path as std::string unconditionally (tkrupa-intel, Sep 2, 2024)
1b1abbc  Fix lvalue error (tkrupa-intel, Sep 2, 2024)
81a9ff3  Fix propagating weights path data to all serialized constants (tkrupa-intel, Sep 4, 2024)
247fe60  Add weights path to expected output of rt_info_serialization test (tkrupa-intel, Sep 4, 2024)
02aa4a1  Adjust for weights path in serialization deterministicity tests (tkrupa-intel, Sep 4, 2024)
b6e58df  Exclude constants used as strided_slice inputs from weightless caching (tkrupa-intel, Sep 9, 2024)
49d64cc  Disable weightless caching instead of throwing error when weights_pat… (tkrupa-intel, Sep 10, 2024)
2c8c07f  Remove unused variable (tkrupa-intel, Sep 13, 2024)
cc489a8  Move saving and loading of weights_path from program.cpp to graph.cpp (tkrupa-intel, Sep 13, 2024)
691ad30  Enable weightless caching iff ov::CacheMode::OPTIMIZE_SIZE is set (tkrupa-intel, Sep 13, 2024)
5c3b493  Move load/store of weights_path to fix import_model() when CacheMode … (tkrupa-intel, Sep 20, 2024)
20eabb9  Construct mmap object only once (tkrupa-intel, Sep 20, 2024)
f512ff6  Remove unused code (tkrupa-intel, Oct 1, 2024)
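
For orientation before the diffs: a minimal usage sketch of the feature these commits implement, assuming the standard ov::Core C++ API. The device, cache directory, and model paths are placeholders; whether an individual constant is cached weightlessly follows the eligibility rules in the diffs below (original size unchanged, not a strided_slice input, non-empty weights path).

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Enable the model cache; the directory name is a placeholder.
    core.set_property(ov::cache_dir("model_cache"));

    // With OPTIMIZE_SIZE the GPU plugin (per this PR) records offsets into the
    // original model.bin for eligible constants instead of copying their bytes
    // into the cache blob.
    auto compiled = core.compile_model("model.xml", "GPU",
                                       ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));

    // A second compile with the same arguments should import the weightless blob
    // and restore the constants by mmap-ing model.bin at the recorded offsets.
    auto compiled_again = core.compile_model("model.xml", "GPU",
                                             ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
    return 0;
}
```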
19 changes: 19 additions & 0 deletions src/core/tests/pass/serialization/deterministicity.cpp
@@ -193,6 +193,12 @@ TEST_P(SerializationDeterministicityInputOutputTest, FromOvModel) {
auto& expected1 = modelRef;
ov::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1, irVersion).run_on_model(modelRef);
auto expected2 = ov::test::readModel(m_out_xml_path_1, m_out_bin_path_1);

// We need to check and erase this entry because it's expected to be different in these two scenarios.
std::string path_key = "weights_path";
EXPECT_EQ(m_out_bin_path_1, expected2->get_rt_info()[path_key].as<std::string>());
expected2->get_rt_info().erase(path_key);

ov::pass::Serialize(m_out_xml_path_2, m_out_bin_path_2, irVersion).run_on_model(expected2);

EXPECT_EQ(input0Name, expected1->input(0).get_node()->get_friendly_name());
@@ -276,9 +282,22 @@ TEST_P(SerializationDeterministicityInputOutputTest, FromIrModel) {
xmlFile.close();
}

std::string path_key = "weights_path";

auto expected1 = ov::test::readModel(xmlFileName, "");

// We need to check and erase this entry because it's expected to be different in these two scenarios.
EXPECT_EQ("", expected1->get_rt_info()[path_key].as<std::string>());
expected1->get_rt_info().erase(path_key);

ov::pass::Serialize(m_out_xml_path_1, "", irVersion).run_on_model(expected1);

auto expected2 = ov::test::readModel(m_out_xml_path_1, "");

// We need to check and erase this entry because it's expected to be different in these two scenarios.
EXPECT_EQ(m_out_bin_path_1, expected2->get_rt_info()[path_key].as<std::string>());
expected2->get_rt_info().erase(path_key);

ov::pass::Serialize(m_out_xml_path_2, "", irVersion).run_on_model(expected2);

EXPECT_EQ(input0Name, expected1->input(0).get_node()->get_friendly_name());
@@ -257,6 +257,7 @@ TEST(OvSerializationTests, SerializeRawMeta) {
<custom_rt_info2>
<item0 value="testvalue2" />
</custom_rt_info2>
<weights_path value="" />
</rt_info>
</net>
)V0G0N";
14 changes: 10 additions & 4 deletions src/frontends/ir/src/frontend.cpp
@@ -130,11 +130,12 @@ InputModel::Ptr FrontEnd::load_impl(const std::vector<ov::Any>& variants) const
return exts;
};

auto create_input_model = [&]() -> std::shared_ptr<InputModel> {
auto create_input_model = [&](std::string& weights_path) -> std::shared_ptr<InputModel> {
if (provided_model_stream) {
return std::make_shared<InputModel>(*provided_model_stream, weights, create_extensions_map());
return std::make_shared<InputModel>(*provided_model_stream, weights, create_extensions_map(), weights_path);
} else if (local_model_stream.is_open()) {
auto input_model = std::make_shared<InputModel>(local_model_stream, weights, create_extensions_map());
auto input_model =
std::make_shared<InputModel>(local_model_stream, weights, create_extensions_map(), weights_path);
local_model_stream.close();
return input_model;
}
@@ -236,7 +237,12 @@ InputModel::Ptr FrontEnd::load_impl(const std::vector<ov::Any>& variants) const
}
}

return create_input_model();
#if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
std::string weights_path_str = ov::util::wstring_to_string(weights_path);
return create_input_model(weights_path_str);
#else
return create_input_model(weights_path);
#endif
}

std::shared_ptr<ov::Model> FrontEnd::convert(const InputModel::Ptr& model) const {
13 changes: 9 additions & 4 deletions src/frontends/ir/src/input_model.cpp
@@ -205,13 +205,16 @@ class InputModel::InputModelIRImpl {
std::unordered_map<std::string, ov::OpSet> m_opsets;
pugi::xml_node m_root;
pugi::xml_document m_xml_doc;
std::string m_weights_path;

public:
InputModelIRImpl(std::istream& stream,
const std::shared_ptr<ov::AlignedBuffer>& weights,
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions)
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions,
std::string& weights_path)
: m_weights(weights),
m_extensions(extensions) {
m_extensions(extensions),
m_weights_path(weights_path) {
pugi::xml_parse_result res = m_xml_doc.load(stream);
if (res.status != pugi::status_ok) {
OPENVINO_THROW(res.description(), " at offset ", res.offset);
@@ -227,8 +230,9 @@ class InputModel::InputModelIRImpl {

InputModel::InputModel(std::istream& stream,
const std::shared_ptr<ov::AlignedBuffer>& weights,
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions) {
_impl = std::make_shared<InputModelIRImpl>(stream, weights, extensions);
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions,
std::string weights_path) {
_impl = std::make_shared<InputModelIRImpl>(stream, weights, extensions, weights_path);
}

std::shared_ptr<ov::Model> InputModel::convert() {
@@ -244,6 +248,7 @@ std::shared_ptr<ov::Model> InputModel::InputModelIRImpl::convert() {
std::shared_ptr<ov::Model> model;
visitor.on_attribute("net", model);
model->get_rt_info()["version"] = int64_t(version);
model->get_rt_info()["weights_path"] = m_weights_path;
parse_pre_process(m_root, m_weights, model);

return model;
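The "weights_path" rt_info entry written in convert() above is what the GPU plugin later consumes (see set_cache_info in plugin.hpp further down). A small sketch of reading it back through the public ov::Model API; the helper name is illustrative and not part of this PR.

```cpp
#include <memory>
#include <string>

#include "openvino/core/model.hpp"

// Illustrative helper: fetch the "weights_path" rt_info entry that the IR
// frontend now attaches to a deserialized model (empty string if absent).
static std::string read_weights_path(const std::shared_ptr<ov::Model>& model) {
    auto& rt_info = model->get_rt_info();
    auto it = rt_info.find("weights_path");
    return it != rt_info.end() ? it->second.as<std::string>() : std::string{};
}
```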
3 changes: 2 additions & 1 deletion src/frontends/ir/src/input_model.hpp
@@ -22,7 +22,8 @@ class InputModel : public ov::frontend::InputModel {
public:
InputModel(std::istream& stream,
const std::shared_ptr<ov::AlignedBuffer>& weights,
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions);
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions,
std::string weights_path = "");

std::shared_ptr<Model> convert();
};
4 changes: 4 additions & 0 deletions src/frontends/ir/src/ir_deserializer.cpp
@@ -944,6 +944,10 @@ std::shared_ptr<ov::Node> ov::XmlDeserializer::create_node(const std::vector<ov:
if (aw_data) {
rtInfo["alt_width"] = aw_data.value();
}
if (auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(ovNode)) {
rtInfo["bin_offset"] = static_cast<size_t>(pugixml::get_uint64_attr(dn, "offset"));
rtInfo["original_size"] = static_cast<size_t>(pugixml::get_uint64_attr(dn, "size"));
}
}

ovNode->set_friendly_name(params.name);
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
@@ -45,6 +45,8 @@ class Plugin : public ov::IPlugin {
bool is_metric(const std::string& name) const;
ov::Any get_metric(const std::string& name, const ov::AnyMap& arguments) const;

void set_cache_info(const std::shared_ptr<const ov::Model>& model, ExecutionConfig& properties) const;

public:
Plugin();

81 changes: 66 additions & 15 deletions src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp
@@ -3,9 +3,13 @@
//

#pragma once
#include "primitive.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include <climits>

#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "openvino/runtime/shared_buffer.hpp"
#include "openvino/util/mmap_object.hpp"
#include "primitive.hpp"

namespace cldnn {

@@ -29,6 +33,10 @@ struct data : public primitive_base<data> {
/// @note If memory is attached by memory::attach(), the attached buffer should be valid till network build.
memory::ptr mem;

size_t original_size = SIZE_MAX;
size_t bin_offset = SIZE_MAX;
std::string weights_path = "";

size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, id);
@@ -46,14 +54,22 @@
size_t data_size = mem->size();
ob << make_data(&data_size, sizeof(size_t));

if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) {
ob << make_data(mem->buffer_ptr(), data_size);
bool is_cache_without_weights = bin_offset != SIZE_MAX && data_size == original_size && !weights_path.empty();
if (is_cache_without_weights) {
ob << true;
ob << bin_offset;
ob << weights_path;
} else {
std::vector<uint8_t> _buf;
_buf.resize(data_size);
stream* strm = reinterpret_cast<stream*>(ob.get_stream());
mem->copy_to(*strm, _buf.data());
ob << make_data(_buf.data(), data_size);
ob << false;
if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) {
ob << make_data(mem->buffer_ptr(), data_size);
} else {
std::vector<uint8_t> _buf;
_buf.resize(data_size);
stream* strm = reinterpret_cast<stream*>(ob.get_stream());
mem->copy_to(*strm, _buf.data());
ob << make_data(_buf.data(), data_size);
}
}
}

@@ -71,36 +87,71 @@

mem = ib.get_engine().allocate_memory(output_layout, _allocation_type, false);

bool is_cache_without_weights;
ib >> is_cache_without_weights;

std::shared_ptr<ov::SharedBuffer<std::shared_ptr<ov::MappedMemory>>> shared_buf;
if (is_cache_without_weights) {
ib >> bin_offset;
ib >> weights_path;
original_size = data_size;

auto mapped_memory = ov::load_mmap_object(weights_path);
shared_buf = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::MappedMemory>>>(
mapped_memory->data() + bin_offset,
data_size,
mapped_memory);
}
if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) {
ib >> make_data(mem->buffer_ptr(), data_size);
if (is_cache_without_weights) {
std::memcpy(reinterpret_cast<uint8_t*>(mem->buffer_ptr()), shared_buf->get_ptr<uint8_t>(), data_size);
} else {
ib >> make_data(mem->buffer_ptr(), data_size);
}
} else {
const size_t DATA_BLOCK_SIZE = 2 * 1024 * 1024;
auto& strm = ib.get_engine().get_service_stream();
if (data_size < DATA_BLOCK_SIZE || output_layout.format.is_image_2d()) {
std::vector<uint8_t> _buf(data_size);
ib >> make_data(_buf.data(), data_size);
if (is_cache_without_weights) {
std::memcpy(reinterpret_cast<uint8_t*>(_buf.data()), shared_buf->get_ptr<uint8_t>(), data_size);
} else {
ib >> make_data(_buf.data(), data_size);
}
mem->copy_from(strm, _buf.data());
} else {
std::vector<uint8_t> _buf1(DATA_BLOCK_SIZE);
std::vector<uint8_t> _buf2(DATA_BLOCK_SIZE);
bool buf_flag = true;
event::ptr ev1, ev2;
ev1 = ev2 = nullptr;

size_t dst_offset = 0;
while (dst_offset < data_size) {
const bool is_blocking = false;
const size_t src_offset = 0;
size_t copy_size = (data_size > (dst_offset + DATA_BLOCK_SIZE)) ? DATA_BLOCK_SIZE : (data_size - dst_offset);
size_t copy_size =
(data_size > (dst_offset + DATA_BLOCK_SIZE)) ? DATA_BLOCK_SIZE : (data_size - dst_offset);
if (buf_flag) {
ib >> make_data(_buf1.data(), copy_size);
if (is_cache_without_weights) {
std::memcpy(reinterpret_cast<uint8_t*>(_buf1.data()),
shared_buf->get_ptr<uint8_t>() + dst_offset,
copy_size);
} else {
ib >> make_data(_buf1.data(), copy_size);
}
if (ev2 != nullptr) {
ev2->wait();
ev2 = nullptr;
}
ev1 = mem->copy_from(strm, _buf1.data(), src_offset, dst_offset, copy_size, is_blocking);
} else {
ib >> make_data(_buf2.data(), copy_size);
if (is_cache_without_weights) {
std::memcpy(reinterpret_cast<uint8_t*>(_buf2.data()),
shared_buf->get_ptr<uint8_t>() + dst_offset,
copy_size);
} else {
ib >> make_data(_buf2.data(), copy_size);
}
if (ev1 != nullptr) {
ev1->wait();
ev1 = nullptr;
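The import path above restores constants by mapping the original .bin file rather than reading their bytes from the cache blob. The same idea in isolation, using the ov::load_mmap_object and ov::SharedBuffer utilities already included by this header; the free function below is only an illustration, not code from the PR.

```cpp
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "openvino/runtime/shared_buffer.hpp"
#include "openvino/util/mmap_object.hpp"

// Sketch of the weightless restore step: instead of reading a constant's bytes
// out of the cache blob, map the original .bin file and copy `size` bytes from
// the offset that was recorded at export time.
static std::vector<uint8_t> restore_constant(const std::string& weights_path,
                                             size_t bin_offset,
                                             size_t size) {
    auto mapped = ov::load_mmap_object(weights_path);
    auto shared_buf = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::MappedMemory>>>(
        mapped->data() + bin_offset, size, mapped);  // keeps the mapping alive
    std::vector<uint8_t> out(size);
    std::memcpy(out.data(), shared_buf->get_ptr<uint8_t>(), size);
    return out;
}
```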
@@ -56,6 +56,7 @@ static constexpr Property<size_t, PropertyMutability::RW> max_dynamic_batch{"DYN
static constexpr Property<bool, PropertyMutability::RW> nv12_two_inputs{"GPU_NV12_TWO_INPUTS"};
static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};
static constexpr Property<std::string, PropertyMutability::RW> weights_path{"GPU_WEIGHTS_PATH"};

} // namespace intel_gpu
} // namespace ov
35 changes: 35 additions & 0 deletions src/plugins/intel_gpu/src/graph/program.cpp
@@ -1723,8 +1723,38 @@ void program::cancel_compilation_context() {
}

void program::save(cldnn::BinaryOutputBuffer& ob) const {
std::string weights_path = _config.get_property(ov::intel_gpu::weights_path);

std::map<cldnn::memory::ptr, std::vector<const cldnn::program_node*>> mutable_datas_ptrs;
ob << nodes_map.size();

// Constants used as inputs of strided_slice nodes cannot be loaded from the original weights file
// because strided_slice undergoes transformation(s) altering their values.
// Setting their bin_offset fields to SIZE_MAX excludes them from the weightless caching mechanism.
Reviewer comment (Contributor): As mentioned above, we need a more robust way to track whether a Constant op (and the data primitive later) still has its original values.

{
std::vector<std::string> strided_slice_data_nodes;
for (auto& node : nodes_map) {
if (node.second->is_type<strided_slice>()) {
auto strided_slice_node = node.second->as<strided_slice>().typed_desc();
if (strided_slice_node->input.size() == 4) {
strided_slice_data_nodes.push_back(strided_slice_node->input[1].pid);
strided_slice_data_nodes.push_back(strided_slice_node->input[2].pid);
strided_slice_data_nodes.push_back(strided_slice_node->input[3].pid);
}
}
}

for (auto& node : nodes_map) {
if (node.second->is_type<data>()) {
auto data_node = node.second->as<data>().typed_desc();
if (std::find(strided_slice_data_nodes.begin(), strided_slice_data_nodes.end(), data_node->id) !=
strided_slice_data_nodes.end()) {
data_node->bin_offset = SIZE_MAX;
}
}
}
}

for (auto& node : nodes_map) {
ob.setKernelImplParams(node.second->get_kernel_impl_params().get());

Expand All @@ -1737,6 +1767,11 @@ void program::save(cldnn::BinaryOutputBuffer& ob) const {
node.second->as<data>().typed_desc()->mem = data_node.get_attached_memory_ptr();
}
}

if (node.second->is_type<data>()) {
node.second->as<data>().typed_desc()->weights_path = weights_path;
}

ob << true;

ob << node.second->desc;
26 changes: 13 additions & 13 deletions src/plugins/intel_gpu/src/plugin/compiled_model.cpp
@@ -42,18 +42,15 @@ CompiledModel::CompiledModel(std::shared_ptr<ov::Model> model,
const std::shared_ptr<const ov::IPlugin>& plugin,
RemoteContextImpl::Ptr context,
const ExecutionConfig& config)
: ov::ICompiledModel(model,
plugin,
context,
create_task_executor(plugin, config),
nullptr)
, m_context(context)
, m_config(config)
, m_wait_executor(std::make_shared<ov::threading::CPUStreamsExecutor>(ov::threading::IStreamsExecutor::Config{"Intel GPU plugin wait executor"}))
, m_model_name(model->get_friendly_name())
, m_inputs(ov::ICompiledModel::inputs())
, m_outputs(ov::ICompiledModel::outputs())
, m_loaded_from_cache(false) {
: ov::ICompiledModel(model, plugin, context, create_task_executor(plugin, config), nullptr),
m_context(context),
m_config(config),
m_wait_executor(std::make_shared<ov::threading::CPUStreamsExecutor>(
ov::threading::IStreamsExecutor::Config{"Intel GPU plugin wait executor"})),
m_model_name(model->get_friendly_name()),
m_inputs(ov::ICompiledModel::inputs()),
m_outputs(ov::ICompiledModel::outputs()),
m_loaded_from_cache(false) {
auto graph_base = std::make_shared<Graph>(model, m_context, m_config, 0);
for (uint16_t n = 0; n < m_config.get_property(ov::num_streams); n++) {
auto graph = n == 0 ? graph_base : std::make_shared<Graph>(graph_base, n);
@@ -170,7 +167,10 @@ std::shared_ptr<ov::IAsyncInferRequest> CompiledModel::create_infer_request() co
// [ ov::Node::Input/ ov::Node::Output ]
// [ ov::intel_gpu::Graph ]
void CompiledModel::export_model(std::ostream& model) const {
if (m_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE)
// If ov::CacheMode::OPTIMIZE_SIZE is set, do the export iff it's possible to do weightless caching
// which requires the weights_path.
if (m_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE &&
m_config.get_property(ov::intel_gpu::weights_path) == "")
return;

OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "CompiledModel::export_model");
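A consequence of the early return above: with OPTIMIZE_SIZE and an empty weights_path nothing is exported, so the next compile_model() call rebuilds the model instead of importing it. Continuing the usage sketch from the top of this page, an application can observe which path was taken via the read-only ov::loaded_from_cache property.

```cpp
// `compiled` is the ov::CompiledModel from the earlier compile_model() sketch.
bool from_cache = compiled.get_property(ov::loaded_from_cache);
// false means the blob was (re)built rather than imported, e.g. because export_model()
// returned early when no weights_path was available.
```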
5 changes: 5 additions & 0 deletions src/plugins/intel_gpu/src/plugin/graph.cpp
@@ -94,6 +94,10 @@ Graph::Graph(cldnn::BinaryInputBuffer &ib, const RemoteContextImpl::Ptr& context
m_config.set_property(ov::intel_gpu::optimize_data(bool_prop_value));
ib >> bool_prop_value;
m_config.set_property(ov::intel_gpu::allow_new_shape_infer(bool_prop_value));

std::string weights_path;
ib >> weights_path;
m_config.set_property(ov::intel_gpu::weights_path(weights_path));
}

auto imported_prog = std::make_shared<cldnn::program>(get_engine(), m_config);
@@ -524,6 +528,7 @@ void Graph::export_model(cldnn::BinaryOutputBuffer &ob) {
ob << m_config.get_property(ov::intel_gpu::partial_build_program);
ob << m_config.get_property(ov::intel_gpu::optimize_data);
ob << m_config.get_property(ov::intel_gpu::allow_new_shape_infer);
ob << m_config.get_property(ov::intel_gpu::weights_path);
}

ob.set_stream(m_network->get_stream_ptr().get());