Commit

Introduce UpdateContext method in ITopologyRunner and move cuda graph capture logic to CudaGraphTopologyRunner::UpdateContext
ytorzuk-altran committed Jun 26, 2023
1 parent 7ae6a99 commit 4e0bfec
Showing 6 changed files with 19 additions and 17 deletions.
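
Taken together, the six diffs below replace an explicit use_cuda_graph flag plus a dynamic_cast in the infer request with a virtual UpdateContext call on the topology runner. The following is a condensed, compilable sketch of that call flow, not the plugin's actual code: InferenceRequestContext, DeviceMemBlock and CudaGraphContext are reduced to minimal stand-ins and the bodies of Capture/UpdateCapture are placeholders; only the method names and the dispatch shape come from the diffs.

#include <iostream>
#include <optional>

// Minimal stand-ins for the plugin types (illustrative only).
struct DeviceMemBlock {};
struct CudaGraphContext { std::optional<int> graphExec; };  // set once a graph has been instantiated
struct InferenceRequestContext {
    CudaGraphContext graph_ctx;
    CudaGraphContext& getCudaGraphContext() { return graph_ctx; }
};

struct ITopologyRunner {
    virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
    // New in this commit: every runner gets a chance to prepare the context before Run().
    virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
    virtual ~ITopologyRunner() = default;
};

struct EagerTopologyRunner final : ITopologyRunner {
    void Run(const InferenceRequestContext&, const DeviceMemBlock&) const override { std::cout << "eager run\n"; }
    void UpdateContext(InferenceRequestContext&, const DeviceMemBlock&) const override {}  // nothing to prepare
};

struct CudaGraphTopologyRunner final : ITopologyRunner {
    void Run(const InferenceRequestContext&, const DeviceMemBlock&) const override { std::cout << "launch graph\n"; }
    void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override {
        if (context.getCudaGraphContext().graphExec)
            UpdateCapture(context);          // graph already instantiated: refresh its I/O pointers
        else
            Capture(context, memoryBlock);   // first run: capture and instantiate the graph
    }

private:
    void Capture(InferenceRequestContext& context, const DeviceMemBlock&) const {
        context.getCudaGraphContext().graphExec = 1;  // placeholder for a real cudaGraphExec_t
        std::cout << "capture\n";
    }
    void UpdateCapture(InferenceRequestContext&) const { std::cout << "update capture\n"; }
};

// start_pipeline() no longer needs a use_cuda_graph_ flag or a dynamic_cast:
void start_pipeline(const ITopologyRunner& runner, InferenceRequestContext& context, const DeviceMemBlock& memory) {
    runner.UpdateContext(context, memory);
    runner.Run(context, memory);
}

int main() {
    DeviceMemBlock memory;
    InferenceRequestContext context;
    CudaGraphTopologyRunner runner;
    start_pipeline(runner, context, memory);  // first inference: capture + launch
    start_pipeline(runner, context, memory);  // later inferences: update + launch
}

With this shape, CompiledModel can hand any ITopologyRunner to the infer request, and the CUDA-graph-specific branching stays inside CudaGraphTopologyRunner.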
4 changes: 2 additions & 2 deletions modules/nvidia_plugin/src/cuda_compiled_model.cpp
@@ -259,7 +259,7 @@ std::shared_ptr<MemoryPool> CompiledModel::create_memory_pool() {
 
 std::shared_ptr<ov::ISyncInferRequest> CompiledModel::create_benchmark_sync_infer_request() {
     return std::make_shared<CudaInferRequest>(
-        std::static_pointer_cast<const CompiledModel>(std::shared_ptr<CompiledModel>(this, [](CompiledModel*) {})), use_cuda_graph_);
+        std::static_pointer_cast<const CompiledModel>(std::shared_ptr<CompiledModel>(this, [](CompiledModel*) {})));
 }
 
 std::shared_ptr<ov::IAsyncInferRequest> CompiledModel::create_benchmark_infer_request() {
@@ -273,7 +273,7 @@ std::shared_ptr<ov::IAsyncInferRequest> CompiledModel::create_benchmark_infer_re
 
 std::shared_ptr<ov::ISyncInferRequest> CompiledModel::create_sync_infer_request() const {
     return std::make_shared<CudaInferRequest>(
-        std::static_pointer_cast<const CompiledModel>(shared_from_this()), use_cuda_graph_);
+        std::static_pointer_cast<const CompiledModel>(shared_from_this()));
 }
 
 std::shared_ptr<ov::IAsyncInferRequest> CompiledModel::create_infer_request() const {
2 changes: 2 additions & 0 deletions modules/nvidia_plugin/src/cuda_eager_topology_runner.hpp
@@ -11,6 +11,7 @@ namespace nvidia_gpu {
 
 struct ITopologyRunner {
     virtual void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
+    virtual void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const = 0;
     virtual const SubGraph& GetSubGraph() const = 0;
     virtual ~ITopologyRunner() = default;
 };
@@ -21,6 +22,7 @@ class EagerTopologyRunner final : public SubGraph, public ITopologyRunner {
     ~EagerTopologyRunner() override = default;
 
     void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
+    void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override {};
     const SubGraph& GetSubGraph() const override;
 };
 
10 changes: 8 additions & 2 deletions modules/nvidia_plugin/src/cuda_graph_topology_runner.cpp
@@ -39,8 +39,14 @@ const SubGraph& CudaGraphTopologyRunner::GetSubGraph() const {
     return *this;
 }
 
-void CudaGraphTopologyRunner::UpdateCapture(InferenceRequestContext &context,
-                                            const DeviceMemBlock &memoryBlock) const {
+void CudaGraphTopologyRunner::UpdateContext(InferenceRequestContext &context, const DeviceMemBlock &memoryBlock) const {
+    if (context.getCudaGraphContext().graphExec)
+        UpdateCapture(context);
+    else
+        Capture(context, memoryBlock);
+}
+
+void CudaGraphTopologyRunner::UpdateCapture(InferenceRequestContext &context) const {
     CudaGraphContext& graphContext = context.getCudaGraphContext();
     for (auto& pair : graphContext.parameterNodes)
         pair.second.updateSrc(graphContext.graphExec.value(),
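
For context on what Capture and UpdateCapture do at the CUDA runtime level, here is a hedged, self-contained sketch of the underlying pattern: instantiate an executable graph once, then on later requests only repoint a copy node at a new source buffer instead of rebuilding the graph. This is not the plugin's code; the buffer names are made up, the plugin drives the equivalent steps through CudaGraphContext (e.g. the parameterNodes' updateSrc calls above), and the 1D memcpy-node helpers plus cudaGraphInstantiateWithFlags assume roughly CUDA 11.4 or newer.

#include <cuda_runtime.h>
#include <cstddef>
#include <cstdio>
#include <vector>

#define CHECK(x) do { cudaError_t e = (x); if (e != cudaSuccess) { \
    std::printf("%s failed: %s\n", #x, cudaGetErrorString(e)); return 1; } } while (0)

int main() {
    constexpr std::size_t n = 256;
    std::vector<float> host_a(n, 1.0f), host_b(n, 2.0f), out(n);
    float* dev = nullptr;
    CHECK(cudaMalloc(&dev, n * sizeof(float)));

    cudaStream_t stream;
    CHECK(cudaStreamCreate(&stream));

    // "Capture": build a graph with a single host-to-device copy node and instantiate it once.
    cudaGraph_t graph;
    cudaGraphNode_t copy_node;
    CHECK(cudaGraphCreate(&graph, 0));
    CHECK(cudaGraphAddMemcpyNode1D(&copy_node, graph, nullptr, 0,
                                   dev, host_a.data(), n * sizeof(float), cudaMemcpyHostToDevice));
    cudaGraphExec_t graph_exec;
    CHECK(cudaGraphInstantiateWithFlags(&graph_exec, graph, 0));

    // First inference: launch the instantiated graph as built.
    CHECK(cudaGraphLaunch(graph_exec, stream));
    CHECK(cudaStreamSynchronize(stream));

    // "UpdateCapture": a later request brings a different input buffer; instead of
    // rebuilding the graph, repoint the copy node's source inside the executable graph.
    CHECK(cudaGraphExecMemcpyNodeSetParams1D(graph_exec, copy_node,
                                             dev, host_b.data(), n * sizeof(float), cudaMemcpyHostToDevice));
    CHECK(cudaGraphLaunch(graph_exec, stream));
    CHECK(cudaStreamSynchronize(stream));

    CHECK(cudaMemcpy(out.data(), dev, n * sizeof(float), cudaMemcpyDeviceToHost));
    std::printf("out[0] = %f (expected 2.0 after the update)\n", out[0]);

    CHECK(cudaGraphExecDestroy(graph_exec));
    CHECK(cudaGraphDestroy(graph));
    CHECK(cudaStreamDestroy(stream));
    CHECK(cudaFree(dev));
    return 0;
}

In the plugin, the captured graph covers the whole topology rather than a single copy node, but the split is the same: instantiate once, then only refresh the parameter and result pointers on subsequent runs.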
4 changes: 3 additions & 1 deletion modules/nvidia_plugin/src/cuda_graph_topology_runner.hpp
@@ -19,10 +19,12 @@ class CudaGraphTopologyRunner final : public SubGraph, public ITopologyRunner {
     ~CudaGraphTopologyRunner() override = default;
 
     void Run(const InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
+    void UpdateContext(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const override;
     const SubGraph& GetSubGraph() const override;
 
 private:
     void Capture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const;
-    void UpdateCapture(InferenceRequestContext& context, const DeviceMemBlock& memoryBlock) const;
+    void UpdateCapture(InferenceRequestContext& context) const;
 };
 
 }  // namespace nvidia_gpu
13 changes: 3 additions & 10 deletions modules/nvidia_plugin/src/cuda_infer_request.cpp
@@ -39,13 +39,12 @@ void allocate_tensor_impl(ov::Tensor& tensor, const ov::element::Type& element_t
 }
 }  // namespace
 
-CudaInferRequest::CudaInferRequest(const std::shared_ptr<const CompiledModel>& compiled_model, bool use_cuda_graph)
+CudaInferRequest::CudaInferRequest(const std::shared_ptr<const CompiledModel>& compiled_model)
     : ov::ISyncInferRequest(compiled_model),
       cancellation_token_{[this] { memory_proxy_.reset(); }},
       profiler_{compiled_model->get_property(ov::enable_profiling.name()).as<bool>(),
                 compiled_model->get_topology_runner().GetSubGraph()},
-      is_benchmark_mode_{compiled_model->get_property(ov::nvidia_gpu::operation_benchmark.name()).as<bool>()},
-      use_cuda_graph_{use_cuda_graph} {
+      is_benchmark_mode_{compiled_model->get_property(ov::nvidia_gpu::operation_benchmark.name()).as<bool>()} {
     create_infer_request();
 }
 
@@ -154,13 +153,7 @@ void CudaInferRequest::start_pipeline(const ThreadContext& threadContext) {
                                                   profiler_,
                                                   cudaGraphContext,
                                                   is_benchmark_mode_};
-        if (use_cuda_graph_) {
-            auto& cuda_graph_topology_runner = dynamic_cast<const CudaGraphTopologyRunner&>(topology_runner);
-            if (cudaGraphContext.graphExec)
-                cuda_graph_topology_runner.UpdateCapture(inferRequestContext, memory);
-            else
-                cuda_graph_topology_runner.Capture(inferRequestContext, memory);
-        }
+        topology_runner.UpdateContext(inferRequestContext, memory);
         topology_runner.Run(inferRequestContext, memory);
         profiler_.stop_stage(Profiler::StartPipeline);
     } catch (...) {
3 changes: 1 addition & 2 deletions modules/nvidia_plugin/src/cuda_infer_request.hpp
@@ -37,7 +37,7 @@ class CudaInferRequest : public ov::ISyncInferRequest {
 public:
     using Ptr = std::shared_ptr<CudaInferRequest>;
 
-    explicit CudaInferRequest(const std::shared_ptr<const CompiledModel>& compiled_model, bool use_cuda_graph = false);
+    explicit CudaInferRequest(const std::shared_ptr<const CompiledModel>& compiled_model);
    ~CudaInferRequest() = default;
 
     void infer() override;
@@ -65,7 +65,6 @@ class CudaInferRequest : public ov::ISyncInferRequest {
     std::vector<std::shared_ptr<ov::Tensor>> input_tensors_;
     std::vector<std::shared_ptr<ov::Tensor>> output_tensors_;
     bool is_benchmark_mode_;
-    bool use_cuda_graph_;
 };
 // ! [infer_request:header]
 
