From 4e0bfec60768a623517748ec18ba913b3ce6bba4 Mon Sep 17 00:00:00 2001
From: Yaroslav Torziuk
Date: Mon, 26 Jun 2023 13:15:01 +0200
Subject: [PATCH] Introduce UpdateContext method in ITopologyRunner and move
 cuda graph capture logic to CudaGraphTopologyRunner::UpdateContext

---
 modules/nvidia_plugin/src/cuda_compiled_model.cpp |  4 ++--
 .../src/cuda_eager_topology_runner.hpp            |  2 ++
 .../src/cuda_graph_topology_runner.cpp            | 10 ++++++++--
 .../src/cuda_graph_topology_runner.hpp            |  4 +++-
 modules/nvidia_plugin/src/cuda_infer_request.cpp  | 13 +++----------
 modules/nvidia_plugin/src/cuda_infer_request.hpp  |  3 +--
 6 files changed, 19 insertions(+), 17 deletions(-)

diff --git a/modules/nvidia_plugin/src/cuda_compiled_model.cpp b/modules/nvidia_plugin/src/cuda_compiled_model.cpp
index 8d87f126ca..612636716c 100644
--- a/modules/nvidia_plugin/src/cuda_compiled_model.cpp
+++ b/modules/nvidia_plugin/src/cuda_compiled_model.cpp
@@ -259,7 +259,7 @@ std::shared_ptr CompiledModel::create_memory_pool() {
 
 std::shared_ptr CompiledModel::create_benchmark_sync_infer_request() {
     return std::make_shared(
-        std::static_pointer_cast(std::shared_ptr(this, [](CompiledModel*) {})), use_cuda_graph_);
+        std::static_pointer_cast(std::shared_ptr(this, [](CompiledModel*) {})));
 }
 
 std::shared_ptr CompiledModel::create_benchmark_infer_request() {
@@ -273,7 +273,7 @@ std::shared_ptr CompiledModel::create_benchmark_infer_re
 
 std::shared_ptr CompiledModel::create_sync_infer_request() const {
     return std::make_shared(
-        std::static_pointer_cast(shared_from_this()), use_cuda_graph_);
+        std::static_pointer_cast(shared_from_this()));
 }
 
 std::shared_ptr CompiledModel::create_infer_request() const {
diff --git a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
index 6f26481cf4..8b795cee41 100644
--- a/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
+++ b/modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
@@ -11,6 +11,7 @@ namespace nvidia_gpu {
 
 struct ITopologyRunner {
     virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
+    virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
     virtual const SubGraph& GetSubGraph() const = 0;
     virtual ~ITopologyRunner() = default;
 };
@@ -21,6 +22,7 @@ class EagerTopologyRunner final : public SubGraph, public ITopologyRunner {
     ~EagerTopologyRunner() override = default;
 
     void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
+    void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override {};
    const SubGraph& GetSubGraph() const override;
 };
 
diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
index 1b112589fc..fe66d7fc1f 100644
--- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
+++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
@@ -39,8 +39,14 @@ const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const {
     return *this;
 }
 
-void CudaGraphTopologyRunner::UpdateCapture(InferenceRequestContext &context,
-                                            const DeviceMemBlock &memoryBlock) const {
+void CudaGraphTopologyRunner::UpdateContext(InferenceRequestContext &context, const DeviceMemBlock &memoryBlock) const {
+    if (context.getCudaGraphContext().graphExec)
+        UpdateCapture(context);
+    else
+        Capture(context, memoryBlock);
+}
+
+void CudaGraphTopologyRunner::UpdateCapture(InferenceRequestContext &context) const {
     CudaGraphContext& graphContext = context.getCudaGraphContext();
     for (auto& pair : graphContext.parameterNodes)
         pair.second.updateSrc(graphContext.graphExec.value(),
diff --git a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp
index 1e96834942..a58dcef6f4 100644
--- a/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp
+++ b/modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp
@@ -19,10 +19,12 @@ class CudaGraphTopologyRunner final : public SubGraph, public ITopologyRunner {
     ~CudaGraphTopologyRunner() override = default;
 
     void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
+    void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
     const SubGraph& GetSubGraph() const override;
 
+private:
     void Capture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const;
-    void UpdateCapture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const;
+    void UpdateCapture(InferenceRequestContext& context) const;
 };
 
 }  // namespace nvidia_gpu
diff --git a/modules/nvidia_plugin/src/cuda_infer_request.cpp b/modules/nvidia_plugin/src/cuda_infer_request.cpp
index 1b0e4146a8..9f17998a1c 100644
--- a/modules/nvidia_plugin/src/cuda_infer_request.cpp
+++ b/modules/nvidia_plugin/src/cuda_infer_request.cpp
@@ -39,13 +39,12 @@ void allocate_tensor_impl(ov::Tensor& tensor, const ov::element::Type& element_t
 }
 
 }  // namespace
 
-CudaInferRequest::CudaInferRequest(const std::shared_ptr& compiled_model, bool use_cuda_graph)
+CudaInferRequest::CudaInferRequest(const std::shared_ptr& compiled_model)
     : ov::ISyncInferRequest(compiled_model),
       cancellation_token_{[this] { memory_proxy_.reset(); }},
       profiler_{compiled_model->get_property(ov::enable_profiling.name()).as(),
                 compiled_model->get_topology_runner().GetSubGraph()},
-      is_benchmark_mode_{compiled_model->get_property(ov::nvidia_gpu::operation_benchmark.name()).as()},
-      use_cuda_graph_{use_cuda_graph} {
+      is_benchmark_mode_{compiled_model->get_property(ov::nvidia_gpu::operation_benchmark.name()).as()} {
     create_infer_request();
 }
@@ -154,13 +153,7 @@ void CudaInferRequest::start_pipeline(const ThreadContext& threadContext) {
                                                  profiler_,
                                                  cudaGraphContext,
                                                  is_benchmark_mode_};
-        if (use_cuda_graph_) {
-            auto& cuda_graph_topology_runner = dynamic_cast(topology_runner);
-            if (cudaGraphContext.graphExec)
-                cuda_graph_topology_runner.UpdateCapture(inferRequestContext, memory);
-            else
-                cuda_graph_topology_runner.Capture(inferRequestContext, memory);
-        }
+        topology_runner.UpdateContext(inferRequestContext, memory);
         topology_runner.Run(inferRequestContext, memory);
         profiler_.stop_stage(Profiler::StartPipeline);
     } catch (...) {
diff --git a/modules/nvidia_plugin/src/cuda_infer_request.hpp b/modules/nvidia_plugin/src/cuda_infer_request.hpp
index 1639fc1eb5..83f427ce55 100644
--- a/modules/nvidia_plugin/src/cuda_infer_request.hpp
+++ b/modules/nvidia_plugin/src/cuda_infer_request.hpp
@@ -37,7 +37,7 @@ class CudaInferRequest : public ov::ISyncInferRequest {
 public:
     using Ptr = std::shared_ptr;
 
-    explicit CudaInferRequest(const std::shared_ptr& compiled_model, bool use_cuda_graph = false);
+    explicit CudaInferRequest(const std::shared_ptr& compiled_model);
     ~CudaInferRequest() = default;
 
     void infer() override;
@@ -65,7 +65,6 @@ class CudaInferRequest : public ov::ISyncInferRequest {
 
     std::vector> input_tensors_;
     std::vector> output_tensors_;
     bool is_benchmark_mode_;
-    bool use_cuda_graph_;
 };
 // ! [infer_request:header]