Commit

Introduce UpdateContext method in ITopologyRunner and move cuda graph capture logic to CudaGraphTopologyRunner::UpdateContext
ytorzuk-altran committed Jun 26, 2023
1 parent 7ae6a99 commit 4e0bfec
Showing 6 changed files with 19 additions and 17 deletions.
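
Taken together, the six diffs below replace an explicit use_cuda_graph flag plus a dynamic_cast in the infer request with a virtual UpdateContext call on the topology runner. The following is a condensed, compilable sketch of that call flow, not the plugin's actual code: InferenceRequestContext, DeviceMemBlock and CudaGraphContext are reduced to minimal stand-ins and the bodies of Capture/UpdateCapture are placeholders; only the method names and the dispatch shape come from the diffs.

#include <iostream>
#include <optional>

// Minimal stand-ins for the plugin types (illustrative only).
struct DeviceMemBlock {};
struct CudaGraphContext { std::optional<int> graphExec; };  // set once a graph has been instantiated
struct InferenceRequestContext {
    CudaGraphContext graph_ctx;
    CudaGraphContext& getCudaGraphContext() { return graph_ctx; }
};

struct ITopologyRunner {
    virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
    // New in this commit: every runner gets a chance to prepare the context before Run().
    virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
    virtual ~ITopologyRunner() = default;
};

struct EagerTopologyRunner final : ITopologyRunner {
    void Run(const InferenceRequestContext&, const DeviceMemBlock&) const override { std::cout << "eager run\n"; }
    void UpdateContext(InferenceRequestContext&, const DeviceMemBlock&) const override {}  // nothing to prepare
};

struct CudaGraphTopologyRunner final : ITopologyRunner {
    void Run(const InferenceRequestContext&, const DeviceMemBlock&) const override { std::cout << "launch graph\n"; }
    void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override {
        if (context.getCudaGraphContext().graphExec)
            UpdateCapture(context);          // graph already instantiated: refresh its I/O pointers
        else
            Capture(context, memoryBlock);   // first run: capture and instantiate the graph
    }

private:
    void Capture(InferenceRequestContext& context, const DeviceMemBlock&) const {
        context.getCudaGraphContext().graphExec = 1;  // placeholder for a real cudaGraphExec_t
        std::cout << "capture\n";
    }
    void UpdateCapture(InferenceRequestContext&) const { std::cout << "update capture\n"; }
};

// start_pipeline() no longer needs a use_cuda_graph_ flag or a dynamic_cast:
void start_pipeline(const ITopologyRunner& runner, InferenceRequestContext& context, const DeviceMemBlock& memory) {
    runner.UpdateContext(context, memory);
    runner.Run(context, memory);
}

int main() {
    DeviceMemBlock memory;
    InferenceRequestContext context;
    CudaGraphTopologyRunner runner;
    start_pipeline(runner, context, memory);  // first inference: capture + launch
    start_pipeline(runner, context, memory);  // later inferences: update + launch
}

With this shape, CompiledModel can hand any ITopologyRunner to the infer request, and the CUDA-graph-specific branching stays inside CudaGraphTopologyRunner.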
4 changes: 2 additions & 2 deletions modules/nvidia_plugin/src/cuda_compiled_model.cpp
@@ -259,7 +259,7 @@ std::shared_ptr<MemoryPool> CompiledModel::create_memory_pool() {
 
 std::shared_ptr<ov::ISyncInferRequest> CompiledModel::create_benchmark_sync_infer_request() {
     return std::make_shared<CudaInferRequest>(
-        std::static_pointer_cast<const CompiledModel>(std::shared_ptr<CompiledModel>(this, [](CompiledModel*) {})), use_cuda_graph_);
+        std::static_pointer_cast<const CompiledModel>(std::shared_ptr<CompiledModel>(this, [](CompiledModel*) {})));
 }
 
 std::shared_ptr<ov::IAsyncInferRequest> CompiledModel::create_benchmark_infer_request() {
@@ -273,7 +273,7 @@ std::shared_ptr<ov::IAsyncInferRequest> CompiledModel::create_benchmark_infer_re
 
 std::shared_ptr<ov::ISyncInferRequest> CompiledModel::create_sync_infer_request() const {
     return std::make_shared<CudaInferRequest>(
-        std::static_pointer_cast<const CompiledModel>(shared_from_this()), use_cuda_graph_);
+        std::static_pointer_cast<const CompiledModel>(shared_from_this()));
 }
 
 std::shared_ptr<ov::IAsyncInferRequest> CompiledModel::create_infer_request() const {
2 changes: 2 additions & 0 deletions modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
@@ -11,6 +11,7 @@ namespace nvidia_gpu {
 
 struct ITopologyRunner {
     virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
+    virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
     virtual const SubGraph& GetSubGraph() const = 0;
     virtual ~ITopologyRunner() = default;
 };
@@ -21,6 +22,7 @@ class EagerTopologyRunner final : public SubGraph, public ITopologyRunner {
     ~EagerTopologyRunner() override = default;
 
     void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
+    void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override {};
     const SubGraph& GetSubGraph() const override;
 };
 
10 changes: 8 additions & 2 deletions modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
@@ -39,8 +39,14 @@ const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const {
     return *this;
 }
 
-void CudaGraphTopologyRunner::UpdateCapture(InferenceRequestContext &context,
-                                            const DeviceMemBlock &memoryBlock) const {
+void CudaGraphTopologyRunner::UpdateContext(InferenceRequestContext &context, const DeviceMemBlock &memoryBlock) const {
+    if (context.getCudaGraphContext().graphExec)
+        UpdateCapture(context);
+    else
+        Capture(context, memoryBlock);
+}
+
+void CudaGraphTopologyRunner::UpdateCapture(InferenceRequestContext &context) const {
     CudaGraphContext& graphContext = context.getCudaGraphContext();
     for (auto& pair : graphContext.parameterNodes)
         pair.second.updateSrc(graphContext.graphExec.value(),
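
For context on what Capture and UpdateCapture do at the CUDA runtime level, here is a hedged, self-contained sketch of the underlying pattern: instantiate an executable graph once, then on later requests only repoint a copy node at a new source buffer instead of rebuilding the graph. This is not the plugin's code; the buffer names are made up, the plugin drives the equivalent steps through CudaGraphContext (e.g. the parameterNodes' updateSrc calls above), and the 1D memcpy-node helpers plus cudaGraphInstantiateWithFlags assume roughly CUDA 11.4 or newer.

#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>
#include <vector>

#define CHECK(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
    std::printf("%s failed: %s\n", #x, cudaGetErrorString(e)); return 1; } } while (0)

int main() {
    constexpr std::size_t n = 256;
    std::vector<float> host_a(n, 1.0f), host_b(n, 2.0f), out(n);
    float* dev = nullptr;
    CHECK(cudaMalloc(&dev, n * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // "Capture": build a graph with a single host-to-device copy node and instantiate it once.
    cudaGraph_t graph;
    cudaGraphNode_t copy_node;
    CHECK(cudaGraphCreate(&graph, 0));
    CHECK(cudaGraphAddMemcpyNode1D(&copy_node, graph, nullptr, 0,
                                   dev, host_a.data(), n * sizeof(float), cudaMemcpyHostToDevice));
    cudaGraphExec_t graph_exec;
    CHECK(cudaGraphInstantiateWithFlags(&graph_exec, graph, 0));

    // First inference: launch the instantiated graph as built.
    CHECK(cudaGraphLaunch(graph_exec, stream));
    CHECK(cudaStreamSynchronize(stream));

    // "UpdateCapture": a later request brings a different input buffer; instead of
    // rebuilding the graph, repoint the copy node's source inside the executable graph.
    CHECK(cudaGraphExecMemcpyNodeSetParams1D(graph_exec, copy_node,
                                             dev, host_b.data(), n * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaGraphLaunch(graph_exec, stream));
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaMemcpy(out.data(), dev, n * sizeof(float), cudaMemcpyDeviceToHost));
    std::printf("out[0] = %f (expected 2.0 after the update)\n", out[0]);

    CHECK(cudaGraphExecDestroy(graph_exec));
    CHECK(cudaGraphDestroy(graph));
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(dev));
    return 0;
}

In the plugin, the captured graph covers the whole topology rather than a single copy node, but the split is the same: instantiate once, then only refresh the parameter and result pointers on subsequent runs.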
4 changes: 3 additions & 1 deletion modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp
@@ -19,10 +19,12 @@ class CudaGraphTopologyRunner final : public SubGraph, public ITopologyRunner {
     ~CudaGraphTopologyRunner() override = default;
 
     void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
+    void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
     const SubGraph& GetSubGraph() const override;
 
 private:
     void Capture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const;
-    void UpdateCapture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const;
+    void UpdateCapture(InferenceRequestContext& context) const;
 };
 
 }  // namespace nvidia_gpu
13 changes: 3 additions & 10 deletions modules/nvidia_plugin/src/cuda_infer_request.cpp
@@ -39,13 +39,12 @@ void allocate_tensor_impl(ov::Tensor& tensor, const ov::element::Type& element_t
 }
 }  // namespace
 
-CudaInferRequest::CudaInferRequest(const std::shared_ptr<const CompiledModel>& compiled_model, bool use_cuda_graph)
+CudaInferRequest::CudaInferRequest(const std::shared_ptr<const CompiledModel>& compiled_model)
     : ov::ISyncInferRequest(compiled_model),
       cancellation_token_{[this] { memory_proxy_.reset(); }},
       profiler_{compiled_model->get_property(ov::enable_profiling.name()).as<bool>(),
                 compiled_model->get_topology_runner().GetSubGraph()},
-      is_benchmark_mode_{compiled_model->get_property(ov::nvidia_gpu::operation_benchmark.name()).as<bool>()},
-      use_cuda_graph_{use_cuda_graph} {
+      is_benchmark_mode_{compiled_model->get_property(ov::nvidia_gpu::operation_benchmark.name()).as<bool>()} {
     create_infer_request();
 }
 
@@ -154,13 +153,7 @@ void CudaInferRequest::start_pipeline(const ThreadContext& threadContext) {
                                                   profiler_,
                                                   cudaGraphContext,
                                                   is_benchmark_mode_};
-        if (use_cuda_graph_) {
-            auto& cuda_graph_topology_runner = dynamic_cast<const CudaGraphTopologyRunner&>(topology_runner);
-            if (cudaGraphContext.graphExec)
-                cuda_graph_topology_runner.UpdateCapture(inferRequestContext, memory);
-            else
-                cuda_graph_topology_runner.Capture(inferRequestContext, memory);
-        }
+        topology_runner.UpdateContext(inferRequestContext, memory);
         topology_runner.Run(inferRequestContext, memory);
         profiler_.stop_stage(Profiler::StartPipeline);
     } catch (...) {
3 changes: 1 addition & 2 deletions modules/nvidia_plugin/src/cuda_infer_request.hpp
@@ -37,7 +37,7 @@ class CudaInferRequest : public ov::ISyncInferRequest {
 public:
     using Ptr = std::shared_ptr<CudaInferRequest>;
 
-    explicit CudaInferRequest(const std::shared_ptr<const CompiledModel>& compiled_model, bool use_cuda_graph = false);
+    explicit CudaInferRequest(const std::shared_ptr<const CompiledModel>& compiled_model);
    ~CudaInferRequest() = default;
 
     void infer() override;
@@ -65,7 +65,6 @@ class CudaInferRequest : public ov::ISyncInferRequest {
     std::vector<std::shared_ptr<ov::Tensor>> input_tensors_;
     std::vector<std::shared_ptr<ov::Tensor>> output_tensors_;
     bool is_benchmark_mode_;
-    bool use_cuda_graph_;
 };
 // ! [infer_request:header]
 
