[GPU] Weightless caching #25731

Open. Wants to merge 25 commits into base: master.

Changes from 22 commits.

Commits (25):
9167c82  [WIP] Add bin offset to cldnn::data and propagate it when constructin… (tkrupa-intel, Jul 1, 2024)
210d609  [CONTINUED] Add bin offset to cldnn::data and propagate it when const… (tkrupa-intel, Jul 2, 2024)
d812f3e  Propagate weights path to CompiledModel (tkrupa-intel, Jul 3, 2024)
d046d40  Fix casting (tkrupa-intel, Jul 3, 2024)
2633df7  Propagate weights path all the way to export function and load it fro… (tkrupa-intel, Jul 3, 2024)
669cff1  Add actual weightless caching (tkrupa-intel, Jul 4, 2024)
2c5cf4e  More weights loading ideas (tkrupa-intel, Jul 5, 2024)
f440407  Simplify and save both versions to aux files (tkrupa-intel, Jul 10, 2024)
c1f2867  Comment out dumping to aux files (tkrupa-intel, Jul 10, 2024)
fd49b1e  Prevent weightless caching when constant size is changed during trans… (tkrupa-intel, Aug 13, 2024)
35bd53d  Propagate weights path (tkrupa-intel, Aug 13, 2024)
61c1496  Fix wstring default values (tkrupa-intel, Aug 30, 2024)
ac3a068  Propagate weights_path as std::string unconditionally (tkrupa-intel, Sep 2, 2024)
1b1abbc  Fix lvalue error (tkrupa-intel, Sep 2, 2024)
81a9ff3  Fix propagating weights path data to all serialized constants (tkrupa-intel, Sep 4, 2024)
247fe60  Add weights path to expected output of rt_info_serialization test (tkrupa-intel, Sep 4, 2024)
02aa4a1  Adjust for weights path in serialization deterministicity tests (tkrupa-intel, Sep 4, 2024)
b6e58df  Exclude constants used as strided_slice inputs from weightless caching (tkrupa-intel, Sep 9, 2024)
49d64cc  Disable weightless caching instead of throwing error when weights_pat… (tkrupa-intel, Sep 10, 2024)
2c8c07f  Remove unused variable (tkrupa-intel, Sep 13, 2024)
cc489a8  Move saving and loading of weights_path from program.cpp to graph.cpp (tkrupa-intel, Sep 13, 2024)
691ad30  Enable weightless caching iff ov::CacheMode::OPTIMIZE_SIZE is set (tkrupa-intel, Sep 13, 2024)
5c3b493  Move load/store of weights_path to fix import_model() when CacheMode … (tkrupa-intel, Sep 20, 2024)
20eabb9  Construct mmap object only once (tkrupa-intel, Sep 20, 2024)
f512ff6  Remove unused code (tkrupa-intel, Oct 1, 2024)
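
For orientation before the diffs: a minimal usage sketch of the feature these commits implement, assuming the standard ov::Core C++ API. The device, cache directory, and model paths are placeholders; whether an individual constant is cached weightlessly follows the eligibility rules in the diffs below (original size unchanged, not a strided_slice input, non-empty weights path).

```cpp
#include <openvino/openvino.hpp>

int main() {
    ov::Core core;
    // Enable the model cache; the directory name is a placeholder.
    core.set_property(ov::cache_dir("model_cache"));

    // With OPTIMIZE_SIZE the GPU plugin (per this PR) records offsets into the
    // original model.bin for eligible constants instead of copying their bytes
    // into the cache blob.
    auto compiled = core.compile_model("model.xml", "GPU",
                                       ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));

    // A second compile with the same arguments should import the weightless blob
    // and restore the constants by mmap-ing model.bin at the recorded offsets.
    auto compiled_again = core.compile_model("model.xml", "GPU",
                                             ov::cache_mode(ov::CacheMode::OPTIMIZE_SIZE));
    return 0;
}
```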
19 changes: 19 additions & 0 deletions src/core/tests/pass/serialization/deterministicity.cpp
@@ -193,6 +193,12 @@ TEST_P(SerializationDeterministicityInputOutputTest, FromOvModel) {
auto& expected1 = modelRef;
ov::pass::Serialize(m_out_xml_path_1, m_out_bin_path_1, irVersion).run_on_model(modelRef);
auto expected2 = ov::test::readModel(m_out_xml_path_1, m_out_bin_path_1);

// We need to check and erase this entry because it's expected to be different in these two scenarios.
std::string path_key = "weights_path";
EXPECT_EQ(m_out_bin_path_1, expected2->get_rt_info()[path_key].as<std::string>());
expected2->get_rt_info().erase(path_key);

ov::pass::Serialize(m_out_xml_path_2, m_out_bin_path_2, irVersion).run_on_model(expected2);

EXPECT_EQ(input0Name, expected1->input(0).get_node()->get_friendly_name());
@@ -276,9 +282,22 @@ TEST_P(SerializationDeterministicityInputOutputTest, FromIrModel) {
xmlFile.close();
}

std::string path_key = "weights_path";

auto expected1 = ov::test::readModel(xmlFileName, "");

// We need to check and erase this entry because it's expected to be different in these two scenarios.
EXPECT_EQ("", expected1->get_rt_info()[path_key].as<std::string>());
expected1->get_rt_info().erase(path_key);

ov::pass::Serialize(m_out_xml_path_1, "", irVersion).run_on_model(expected1);

auto expected2 = ov::test::readModel(m_out_xml_path_1, "");

// We need to check and erase this entry because it's expected to be different in these two scenarios.
EXPECT_EQ(m_out_bin_path_1, expected2->get_rt_info()[path_key].as<std::string>());
expected2->get_rt_info().erase(path_key);

ov::pass::Serialize(m_out_xml_path_2, "", irVersion).run_on_model(expected2);

EXPECT_EQ(input0Name, expected1->input(0).get_node()->get_friendly_name());
@@ -257,6 +257,7 @@ TEST(OvSerializationTests, SerializeRawMeta) {
<custom_rt_info2>
<item0 value="testvalue2" />
</custom_rt_info2>
<weights_path value="" />
</rt_info>
</net>
)V0G0N";
14 changes: 10 additions & 4 deletions src/frontends/ir/src/frontend.cpp
@@ -130,11 +130,12 @@ InputModel::Ptr FrontEnd::load_impl(const std::vector<ov::Any>& variants) const
return exts;
};

auto create_input_model = [&]() -> std::shared_ptr<InputModel> {
auto create_input_model = [&](std::string& weights_path) -> std::shared_ptr<InputModel> {
if (provided_model_stream) {
return std::make_shared<InputModel>(*provided_model_stream, weights, create_extensions_map());
return std::make_shared<InputModel>(*provided_model_stream, weights, create_extensions_map(), weights_path);
} else if (local_model_stream.is_open()) {
auto input_model = std::make_shared<InputModel>(local_model_stream, weights, create_extensions_map());
auto input_model =
std::make_shared<InputModel>(local_model_stream, weights, create_extensions_map(), weights_path);
local_model_stream.close();
return input_model;
}
@@ -236,7 +237,12 @@ InputModel::Ptr FrontEnd::load_impl(const std::vector<ov::Any>& variants) const
}
}

return create_input_model();
#if defined(OPENVINO_ENABLE_UNICODE_PATH_SUPPORT) && defined(_WIN32)
std::string weights_path_str = ov::util::wstring_to_string(weights_path);
return create_input_model(weights_path_str);
#else
return create_input_model(weights_path);
#endif
}

std::shared_ptr<ov::Model> FrontEnd::convert(const InputModel::Ptr& model) const {
13 changes: 9 additions & 4 deletions src/frontends/ir/src/input_model.cpp
@@ -205,13 +205,16 @@ class InputModel::InputModelIRImpl {
std::unordered_map<std::string, ov::OpSet> m_opsets;
pugi::xml_node m_root;
pugi::xml_document m_xml_doc;
std::string m_weights_path;

public:
InputModelIRImpl(std::istream& stream,
const std::shared_ptr<ov::AlignedBuffer>& weights,
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions)
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions,
std::string& weights_path)
: m_weights(weights),
m_extensions(extensions) {
m_extensions(extensions),
m_weights_path(weights_path) {
pugi::xml_parse_result res = m_xml_doc.load(stream);
if (res.status != pugi::status_ok) {
OPENVINO_THROW(res.description(), " at offset ", res.offset);
@@ -227,8 +230,9 @@ class InputModel::InputModelIRImpl {

InputModel::InputModel(std::istream& stream,
const std::shared_ptr<ov::AlignedBuffer>& weights,
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions) {
_impl = std::make_shared<InputModelIRImpl>(stream, weights, extensions);
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions,
std::string weights_path) {
_impl = std::make_shared<InputModelIRImpl>(stream, weights, extensions, weights_path);
}

std::shared_ptr<ov::Model> InputModel::convert() {
@@ -244,6 +248,7 @@ std::shared_ptr<ov::Model> InputModel::InputModelIRImpl::convert() {
std::shared_ptr<ov::Model> model;
visitor.on_attribute("net", model);
model->get_rt_info()["version"] = int64_t(version);
model->get_rt_info()["weights_path"] = m_weights_path;
parse_pre_process(m_root, m_weights, model);

return model;
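The "weights_path" rt_info entry written in convert() above is what the GPU plugin later consumes (see set_cache_info in plugin.hpp further down). A small sketch of reading it back through the public ov::Model API; the helper name is illustrative and not part of this PR.

```cpp
#include <memory>
#include <string>

#include "openvino/core/model.hpp"

// Illustrative helper: fetch the "weights_path" rt_info entry that the IR
// frontend now attaches to a deserialized model (empty string if absent).
static std::string read_weights_path(const std::shared_ptr<ov::Model>& model) {
    auto& rt_info = model->get_rt_info();
    auto it = rt_info.find("weights_path");
    return it != rt_info.end() ? it->second.as<std::string>() : std::string{};
}
```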
3 changes: 2 additions & 1 deletion src/frontends/ir/src/input_model.hpp
@@ -22,7 +22,8 @@ class InputModel : public ov::frontend::InputModel {
public:
InputModel(std::istream& stream,
const std::shared_ptr<ov::AlignedBuffer>& weights,
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions);
const std::unordered_map<ov::DiscreteTypeInfo, ov::BaseOpExtension::Ptr>& extensions,
std::string weights_path = "");

std::shared_ptr<Model> convert();
};
4 changes: 4 additions & 0 deletions src/frontends/ir/src/ir_deserializer.cpp
@@ -944,6 +944,10 @@ std::shared_ptr<ov::Node> ov::XmlDeserializer::create_node(const std::vector<ov:
if (aw_data) {
rtInfo["alt_width"] = aw_data.value();
}
if (auto constant = std::dynamic_pointer_cast<ov::op::v0::Constant>(ovNode)) {
rtInfo["bin_offset"] = static_cast<size_t>(pugixml::get_uint64_attr(dn, "offset"));
rtInfo["original_size"] = static_cast<size_t>(pugixml::get_uint64_attr(dn, "size"));
}
}

ovNode->set_friendly_name(params.name);
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/include/intel_gpu/plugin/plugin.hpp
@@ -45,6 +45,8 @@ class Plugin : public ov::IPlugin {
bool is_metric(const std::string& name) const;
ov::Any get_metric(const std::string& name, const ov::AnyMap& arguments) const;

void set_cache_info(const std::shared_ptr<const ov::Model>& model, ExecutionConfig& properties) const;

public:
Plugin();

81 changes: 66 additions & 15 deletions src/plugins/intel_gpu/include/intel_gpu/primitives/data.hpp
@@ -3,9 +3,13 @@
//

#pragma once
#include "primitive.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include <climits>

#include "intel_gpu/runtime/engine.hpp"
#include "intel_gpu/runtime/memory.hpp"
#include "openvino/runtime/shared_buffer.hpp"
#include "openvino/util/mmap_object.hpp"
#include "primitive.hpp"

namespace cldnn {

@@ -29,6 +33,10 @@ struct data : public primitive_base<data> {
/// @note If memory is attached by memory::attach(), the attached buffer should be valid till network build.
memory::ptr mem;

size_t original_size = SIZE_MAX;
size_t bin_offset = SIZE_MAX;
std::string weights_path = "";

size_t hash() const override {
size_t seed = primitive::hash();
seed = hash_combine(seed, id);
@@ -46,14 +54,22 @@
size_t data_size = mem->size();
ob << make_data(&data_size, sizeof(size_t));

if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) {
ob << make_data(mem->buffer_ptr(), data_size);
bool is_cache_without_weights = bin_offset != SIZE_MAX && data_size == original_size && !weights_path.empty();
if (is_cache_without_weights) {
ob << true;
ob << bin_offset;
ob << weights_path;
} else {
std::vector<uint8_t> _buf;
_buf.resize(data_size);
stream* strm = reinterpret_cast<stream*>(ob.get_stream());
mem->copy_to(*strm, _buf.data());
ob << make_data(_buf.data(), data_size);
ob << false;
if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) {
ob << make_data(mem->buffer_ptr(), data_size);
} else {
std::vector<uint8_t> _buf;
_buf.resize(data_size);
stream* strm = reinterpret_cast<stream*>(ob.get_stream());
mem->copy_to(*strm, _buf.data());
ob << make_data(_buf.data(), data_size);
}
}
}

@@ -71,36 +87,71 @@

mem = ib.get_engine().allocate_memory(output_layout, _allocation_type, false);

bool is_cache_without_weights;
ib >> is_cache_without_weights;

std::shared_ptr<ov::SharedBuffer<std::shared_ptr<ov::MappedMemory>>> shared_buf;
if (is_cache_without_weights) {
ib >> bin_offset;
ib >> weights_path;
original_size = data_size;

auto mapped_memory = ov::load_mmap_object(weights_path);
shared_buf = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::MappedMemory>>>(
mapped_memory->data() + bin_offset,
data_size,
mapped_memory);
}
if (_allocation_type == allocation_type::usm_host || _allocation_type == allocation_type::usm_shared) {
ib >> make_data(mem->buffer_ptr(), data_size);
if (is_cache_without_weights) {
std::memcpy(reinterpret_cast<uint8_t*>(mem->buffer_ptr()), shared_buf->get_ptr<uint8_t>(), data_size);
} else {
ib >> make_data(mem->buffer_ptr(), data_size);
}
} else {
const size_t DATA_BLOCK_SIZE = 2 * 1024 * 1024;
auto& strm = ib.get_engine().get_service_stream();
if (data_size < DATA_BLOCK_SIZE || output_layout.format.is_image_2d()) {
std::vector<uint8_t> _buf(data_size);
ib >> make_data(_buf.data(), data_size);
if (is_cache_without_weights) {
std::memcpy(reinterpret_cast<uint8_t*>(_buf.data()), shared_buf->get_ptr<uint8_t>(), data_size);
} else {
ib >> make_data(_buf.data(), data_size);
}
mem->copy_from(strm, _buf.data());
} else {
std::vector<uint8_t> _buf1(DATA_BLOCK_SIZE);
std::vector<uint8_t> _buf2(DATA_BLOCK_SIZE);
bool buf_flag = true;
event::ptr ev1, ev2;
ev1 = ev2 = nullptr;

size_t dst_offset = 0;
while (dst_offset < data_size) {
const bool is_blocking = false;
const size_t src_offset = 0;
size_t copy_size = (data_size > (dst_offset + DATA_BLOCK_SIZE)) ? DATA_BLOCK_SIZE : (data_size - dst_offset);
size_t copy_size =
(data_size > (dst_offset + DATA_BLOCK_SIZE)) ? DATA_BLOCK_SIZE : (data_size - dst_offset);
if (buf_flag) {
ib >> make_data(_buf1.data(), copy_size);
if (is_cache_without_weights) {
std::memcpy(reinterpret_cast<uint8_t*>(_buf1.data()),
shared_buf->get_ptr<uint8_t>() + dst_offset,
copy_size);
} else {
ib >> make_data(_buf1.data(), copy_size);
}
if (ev2 != nullptr) {
ev2->wait();
ev2 = nullptr;
}
ev1 = mem->copy_from(strm, _buf1.data(), src_offset, dst_offset, copy_size, is_blocking);
} else {
ib >> make_data(_buf2.data(), copy_size);
if (is_cache_without_weights) {
std::memcpy(reinterpret_cast<uint8_t*>(_buf2.data()),
shared_buf->get_ptr<uint8_t>() + dst_offset,
copy_size);
} else {
ib >> make_data(_buf2.data(), copy_size);
}
if (ev1 != nullptr) {
ev1->wait();
ev1 = nullptr;
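The import path above restores constants by mapping the original .bin file rather than reading their bytes from the cache blob. The same idea in isolation, using the ov::load_mmap_object and ov::SharedBuffer utilities already included by this header; the free function below is only an illustration, not code from the PR.

```cpp
#include <cstdint>
#include <cstring>
#include <memory>
#include <string>
#include <vector>

#include "openvino/runtime/shared_buffer.hpp"
#include "openvino/util/mmap_object.hpp"

// Sketch of the weightless restore step: instead of reading a constant's bytes
// out of the cache blob, map the original .bin file and copy `size` bytes from
// the offset that was recorded at export time.
static std::vector<uint8_t> restore_constant(const std::string& weights_path,
                                             size_t bin_offset,
                                             size_t size) {
    auto mapped = ov::load_mmap_object(weights_path);
    auto shared_buf = std::make_shared<ov::SharedBuffer<std::shared_ptr<ov::MappedMemory>>>(
        mapped->data() + bin_offset, size, mapped);  // keeps the mapping alive
    std::vector<uint8_t> out(size);
    std::memcpy(out.data(), shared_buf->get_ptr<uint8_t>(), size);
    return out;
}
```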
@@ -56,6 +56,7 @@ static constexpr Property<size_t, PropertyMutability::RW> max_dynamic_batch{"DYN
static constexpr Property<bool, PropertyMutability::RW> nv12_two_inputs{"GPU_NV12_TWO_INPUTS"};
static constexpr Property<float, PropertyMutability::RW> buffers_preallocation_ratio{"GPU_BUFFERS_PREALLOCATION_RATIO"};
static constexpr Property<size_t, PropertyMutability::RW> max_kernels_per_batch{"GPU_MAX_KERNELS_PER_BATCH"};
static constexpr Property<std::string, PropertyMutability::RW> weights_path{"GPU_WEIGHTS_PATH"};

} // namespace intel_gpu
} // namespace ov
35 changes: 35 additions & 0 deletions src/plugins/intel_gpu/src/graph/program.cpp
@@ -1723,8 +1723,38 @@ void program::cancel_compilation_context() {
}

void program::save(cldnn::BinaryOutputBuffer& ob) const {
std::string weights_path = _config.get_property(ov::intel_gpu::weights_path);

std::map<cldnn::memory::ptr, std::vector<const cldnn::program_node*>> mutable_datas_ptrs;
ob << nodes_map.size();

// Constants used as inputs of strided_slice nodes cannot be loaded from the original weights file
// because strided_slice undergoes transformation(s) altering their values.
// Setting their bin_offset fields to SIZE_MAX excludes them from the weightless caching mechanism.
Reviewer comment (Contributor): As mentioned above, we need a more robust way to track whether a Constant op (and the data primitive later) still has its original values.

{
std::vector<std::string> strided_slice_data_nodes;
for (auto& node : nodes_map) {
if (node.second->is_type<strided_slice>()) {
auto strided_slice_node = node.second->as<strided_slice>().typed_desc();
if (strided_slice_node->input.size() == 4) {
strided_slice_data_nodes.push_back(strided_slice_node->input[1].pid);
strided_slice_data_nodes.push_back(strided_slice_node->input[2].pid);
strided_slice_data_nodes.push_back(strided_slice_node->input[3].pid);
}
}
}

for (auto& node : nodes_map) {
if (node.second->is_type<data>()) {
auto data_node = node.second->as<data>().typed_desc();
if (std::find(strided_slice_data_nodes.begin(), strided_slice_data_nodes.end(), data_node->id) !=
strided_slice_data_nodes.end()) {
data_node->bin_offset = SIZE_MAX;
}
}
}
}

for (auto& node : nodes_map) {
ob.setKernelImplParams(node.second->get_kernel_impl_params().get());

Expand All @@ -1737,6 +1767,11 @@ void program::save(cldnn::BinaryOutputBuffer& ob) const {
node.second->as<data>().typed_desc()->mem = data_node.get_attached_memory_ptr();
}
}

if (node.second->is_type<data>()) {
node.second->as<data>().typed_desc()->weights_path = weights_path;
}

ob << true;

ob << node.second->desc;
26 changes: 13 additions & 13 deletions src/plugins/intel_gpu/src/plugin/compiled_model.cpp
@@ -42,18 +42,15 @@ CompiledModel::CompiledModel(std::shared_ptr<ov::Model> model,
const std::shared_ptr<const ov::IPlugin>& plugin,
RemoteContextImpl::Ptr context,
const ExecutionConfig& config)
: ov::ICompiledModel(model,
plugin,
context,
create_task_executor(plugin, config),
nullptr)
, m_context(context)
, m_config(config)
, m_wait_executor(std::make_shared<ov::threading::CPUStreamsExecutor>(ov::threading::IStreamsExecutor::Config{"Intel GPU plugin wait executor"}))
, m_model_name(model->get_friendly_name())
, m_inputs(ov::ICompiledModel::inputs())
, m_outputs(ov::ICompiledModel::outputs())
, m_loaded_from_cache(false) {
: ov::ICompiledModel(model, plugin, context, create_task_executor(plugin, config), nullptr),
m_context(context),
m_config(config),
m_wait_executor(std::make_shared<ov::threading::CPUStreamsExecutor>(
ov::threading::IStreamsExecutor::Config{"Intel GPU plugin wait executor"})),
m_model_name(model->get_friendly_name()),
m_inputs(ov::ICompiledModel::inputs()),
m_outputs(ov::ICompiledModel::outputs()),
m_loaded_from_cache(false) {
auto graph_base = std::make_shared<Graph>(model, m_context, m_config, 0);
for (uint16_t n = 0; n < m_config.get_property(ov::num_streams); n++) {
auto graph = n == 0 ? graph_base : std::make_shared<Graph>(graph_base, n);
@@ -170,7 +167,10 @@ std::shared_ptr<ov::IAsyncInferRequest> CompiledModel::create_infer_request() co
// [ ov::Node::Input/ ov::Node::Output ]
// [ ov::intel_gpu::Graph ]
void CompiledModel::export_model(std::ostream& model) const {
if (m_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE)
// If ov::CacheMode::OPTIMIZE_SIZE is set, do the export iff it's possible to do weightless caching
// which requires the weights_path.
if (m_config.get_property(ov::cache_mode) == ov::CacheMode::OPTIMIZE_SIZE &&
m_config.get_property(ov::intel_gpu::weights_path) == "")
return;

OV_ITT_SCOPED_TASK(itt::domains::intel_gpu_plugin, "CompiledModel::export_model");
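A consequence of the early return above: with OPTIMIZE_SIZE and an empty weights_path nothing is exported, so the next compile_model() call rebuilds the model instead of importing it. Continuing the usage sketch from the top of this page, an application can observe which path was taken via the read-only ov::loaded_from_cache property.

```cpp
// `compiled` is the ov::CompiledModel from the earlier compile_model() sketch.
bool from_cache = compiled.get_property(ov::loaded_from_cache);
// false means the blob was (re)built rather than imported, e.g. because export_model()
// returned early when no weights_path was available.
```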
5 changes: 5 additions & 0 deletions src/plugins/intel_gpu/src/plugin/graph.cpp
@@ -94,6 +94,10 @@ Graph::Graph(cldnn::BinaryInputBuffer &ib, const RemoteContextImpl::Ptr& context
m_config.set_property(ov::intel_gpu::optimize_data(bool_prop_value));
ib >> bool_prop_value;
m_config.set_property(ov::intel_gpu::allow_new_shape_infer(bool_prop_value));

std::string weights_path;
ib >> weights_path;
m_config.set_property(ov::intel_gpu::weights_path(weights_path));
}

auto imported_prog = std::make_shared<cldnn::program>(get_engine(), m_config);
@@ -524,6 +528,7 @@ void Graph::export_model(cldnn::BinaryOutputBuffer &ob) {
ob << m_config.get_property(ov::intel_gpu::partial_build_program);
ob << m_config.get_property(ov::intel_gpu::optimize_data);
ob << m_config.get_property(ov::intel_gpu::allow_new_shape_infer);
ob << m_config.get_property(ov::intel_gpu::weights_path);
}

ob.set_stream(m_network->get_stream_ptr().get());