diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
index 3aa98bb020452..84c651d015cea 100644
--- a/include/onnxruntime/core/session/onnxruntime_c_api.h
+++ b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -3630,6 +3630,7 @@ struct OrtApi {
    *   "profiling_level": QNN profiling level, options: "off", "basic", "detailed". Default to off.
    *   "profiling_file_path": QNN profiling file path if ETW not enabled.
    *   "rpc_control_latency": QNN RPC control latency.
+   *   "rpc_polling_time": QNN RPC polling time, only supported by v69 or higher. Values above the QNN maximum are clamped to it. Default to 0 (not set).
    *   "vtcm_mb": QNN VTCM size in MB. default to 0(not set).
    *   "htp_performance_mode": QNN performance mode, options: "burst", "balanced", "default", "high_performance",
    *   "high_power_saver", "low_balanced", "extreme_power_saver", "low_power_saver", "power_saver", "sustained_high_performance". Default to "default".
diff --git a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
index 9942f8c656760..6596e2dc24c74 100644
--- a/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
+++ b/include/onnxruntime/core/session/onnxruntime_run_options_config_keys.h
@@ -43,6 +43,9 @@ static const char* const kOrtRunOptionsConfigQnnPerfModePostRun = "qnn.htp_perf_
 // Set RPC control latency for QNN HTP backend
 static const char* const kOrtRunOptionsConfigQnnRpcControlLatency = "qnn.rpc_control_latency";
 
+// Set RPC polling time for a single run on the QNN HTP backend, only supported by v69 or higher. The value is parsed as an unsigned integer and clamped to the QNN maximum.
+static const char* const kOrtRunOptionsConfigQnnRpcPollingTime = "qnn.rpc_polling_time";
+
 // Set graph annotation id for CUDA EP. Use with enable_cuda_graph=true.
 // The value should be an integer. If the value is not set, the default value is 0 and
 // ORT session only captures one cuda graph before another capture is requested.
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
index 0ad83dd4ba504..588147d79a648 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc
@@ -890,33 +890,48 @@ Status QnnBackendManager::SetHtpPowerConfig(uint32_t htp_power_config_client_id,
 }
 
 Status QnnBackendManager::SetRpcControlLatency(uint32_t htp_power_config_client_id,
-                                               uint32_t rpc_control_latency) {
+                                               uint32_t rpc_control_latency,
+                                               uint32_t rpc_polling_time) {
+  // A value of 0 means "not requested". Only requested options are submitted, so a call that sets
+  // just one of them does not overwrite a previously applied value of the other with 0.
+  constexpr size_t kMaxRpcPowerConfigs = 2;
+  std::vector<QnnHtpPerfInfrastructure_PowerConfig_t> rpc_power_configs;
+  rpc_power_configs.reserve(kMaxRpcPowerConfigs);
+
   if (rpc_control_latency != 0) {
-    QnnDevice_Infrastructure_t qnn_device_infra = nullptr;
-    auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra);
-    ORT_RETURN_IF(QNN_SUCCESS != status, "backendGetPerfInfrastructure failed.");
-
-    auto* htp_infra = static_cast<QnnHtpDevice_Infrastructure_t*>(qnn_device_infra);
-    ORT_RETURN_IF(QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF != htp_infra->infraType,
-                  "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type.");
-    QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra;
-
-    // Set rpc control latency here, but note that v68 doesn't support rpc polling mode.
-    constexpr int kNumRpcPollingPowerConfigs = 2;
-    std::vector<QnnHtpPerfInfrastructure_PowerConfig_t> rpc_power_configs(kNumRpcPollingPowerConfigs);
-    QnnHtpPerfInfrastructure_PowerConfig_t& rpc_control_latency_cfg = rpc_power_configs[0];
-    // v68 doesn't support this.
-    QnnHtpPerfInfrastructure_PowerConfig_t& rpc_polling_time = rpc_power_configs[1];
-    rpc_control_latency_cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY;
+    QnnHtpPerfInfrastructure_PowerConfig_t rpc_control_latency_cfg{};
+    rpc_control_latency_cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_CONTROL_LATENCY;
     rpc_control_latency_cfg.rpcControlLatencyConfig = rpc_control_latency;
-    rpc_polling_time.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
-    rpc_polling_time.rpcPollingTimeConfig = 9999;
-    std::vector<const QnnHtpPerfInfrastructure_PowerConfig_t*> perf_power_configs_ptr =
-        ObtainNullTermPtrVector(rpc_power_configs);
-    status = htp_perf_infra.setPowerConfig(htp_power_config_client_id, perf_power_configs_ptr.data());
-    ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for RPC control latency.");
+    rpc_power_configs.push_back(rpc_control_latency_cfg);
   }
 
+  // Only v69 or higher support rpc polling mode.
+  if (rpc_polling_time != 0) {
+    QnnHtpPerfInfrastructure_PowerConfig_t rpc_polling_time_cfg{};
+    rpc_polling_time_cfg.option = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIGOPTION_RPC_POLLING_TIME;
+    rpc_polling_time_cfg.rpcPollingTimeConfig = rpc_polling_time;
+    rpc_power_configs.push_back(rpc_polling_time_cfg);
+  }
+
+  // Nothing was requested: skip the QNN infrastructure lookup and setPowerConfig call entirely.
+  if (rpc_power_configs.empty()) {
+    return Status::OK();
+  }
+
+  QnnDevice_Infrastructure_t qnn_device_infra = nullptr;
+  auto status = qnn_interface_.deviceGetInfrastructure(&qnn_device_infra);
+  ORT_RETURN_IF(QNN_SUCCESS != status, "deviceGetInfrastructure failed.");
+
+  auto* htp_infra = static_cast<QnnHtpDevice_Infrastructure_t*>(qnn_device_infra);
+  ORT_RETURN_IF(QNN_HTP_DEVICE_INFRASTRUCTURE_TYPE_PERF != htp_infra->infraType,
+                "HTP infra type = ", htp_infra->infraType, ", which is not perf infra type.");
+  QnnHtpDevice_PerfInfrastructure_t& htp_perf_infra = htp_infra->perfInfra;
+
+  std::vector<const QnnHtpPerfInfrastructure_PowerConfig_t*> perf_power_configs_ptr =
+      ObtainNullTermPtrVector(rpc_power_configs);
+  status = htp_perf_infra.setPowerConfig(htp_power_config_client_id, perf_power_configs_ptr.data());
+  ORT_RETURN_IF(QNN_SUCCESS != status, "setPowerConfig failed for RPC control latency or polling time.");
+
   return Status::OK();
 }
 
diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
index b80f1374fcdc7..7c2f11f3d6337 100644
--- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
+++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.h
@@ -101,7 +101,8 @@ class QnnBackendManager {
                            HtpPerformanceMode htp_performance_mode);
 
   Status SetRpcControlLatency(uint32_t htp_power_config_client_id,
-                              uint32_t rpc_control_latency);
+                              uint32_t rpc_control_latency,
+                              uint32_t rpc_polling_time);
 
   const QNN_INTERFACE_VER_TYPE& GetQnnInterface() { return qnn_interface_; }
 
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
index 24132b98e3757..3c36a9ceebd43 100644
--- a/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.cc
@@ -303,6 +303,17 @@ QNNExecutionProvider::QNNExecutionProvider(const ProviderOptions& provider_optio
     LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << default_rpc_control_latency_;
   }
 
+  static const std::string RPC_POLLING_TIME = "rpc_polling_time";
+  auto polling_time_pos = provider_options_map.find(RPC_POLLING_TIME);
+  if (polling_time_pos != provider_options_map.end()) {
+    default_rpc_polling_time_ = static_cast<uint32_t>(std::stoul(polling_time_pos->second));
+    if
(default_rpc_polling_time_ > QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME) {
+      LOGS_DEFAULT(WARNING) << "rpc_polling_time exceed the max limit, use the limit instead: " << QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME;
+      default_rpc_polling_time_ = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME;
+    }
+    LOGS_DEFAULT(VERBOSE) << "rpc_polling_time: " << default_rpc_polling_time_;
+  }
+
   // default_htp_performance_mode from QNN EP option.
   // set it once only for each thread as default so user don't need to set it for every session run
   static const std::string HTP_PERFORMANCE_MODE = "htp_performance_mode";
@@ -984,7 +995,8 @@ QNNExecutionProvider::PerThreadContext::PerThreadContext(qnn::QnnBackendManager*
                                                          uint32_t device_id,
                                                          uint32_t core_id,
                                                          qnn::HtpPerformanceMode default_htp_performance_mode,
-                                                         uint32_t default_rpc_control_latency)
+                                                         uint32_t default_rpc_control_latency,
+                                                         uint32_t default_rpc_polling_time)
     : qnn_backend_manager_(qnn_backend_manager) {
   Status rt = qnn_backend_manager_->CreateHtpPowerCfgId(device_id, core_id, htp_power_config_id_);
   is_htp_power_config_id_valid_ = rt.IsOK();
@@ -995,9 +1007,10 @@ QNNExecutionProvider::PerThreadContext::PerThreadContext(qnn::QnnBackendManager*
       ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetHtpPowerConfig(htp_power_config_id_,
                                                                       default_htp_performance_mode));
     }
-    if (default_rpc_control_latency > 0) {
+    if (default_rpc_control_latency > 0 || default_rpc_polling_time > 0) {
       ORT_IGNORE_RETURN_VALUE(qnn_backend_manager_->SetRpcControlLatency(htp_power_config_id_,
-                                                                         default_rpc_control_latency));
+                                                                         default_rpc_control_latency,
+                                                                         default_rpc_polling_time));
     }
   }
 }
@@ -1028,7 +1041,9 @@ QNNExecutionProvider::PerThreadContext& QNNExecutionProvider::GetPerThreadContex
   if (context_state_.retired_context_pool.empty()) {
     uint32_t core_id = 0;
     context = std::make_shared<PerThreadContext>(qnn_backend_manager_.get(), device_id_, core_id,
-                                                 default_htp_performance_mode_, default_rpc_control_latency_);
+                                                 default_htp_performance_mode_,
+                                                 default_rpc_control_latency_,
+                                                 default_rpc_polling_time_);
   } else {
     context = context_state_.retired_context_pool.back();
     context_state_.retired_context_pool.pop_back();
@@ -1081,7 +1096,19 @@ Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_optio
   uint32_t rpc_control_latency = 0;
   if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, rpc_latency)) {
     rpc_control_latency = static_cast<uint32_t>(std::stoul(rpc_latency));
-    LOGS_DEFAULT(VERBOSE) << "rpc_control_latency: " << rpc_control_latency;
+    LOGS_DEFAULT(VERBOSE) << kOrtRunOptionsConfigQnnRpcControlLatency << ": " << rpc_control_latency;
+  }
+
+  // Per-run RPC polling time; 0 means not set, and values above the QNN maximum are clamped to it.
+  std::string rpc_polling = "";
+  uint32_t rpc_polling_time = 0;
+  if (run_options.config_options.TryGetConfigEntry(kOrtRunOptionsConfigQnnRpcPollingTime, rpc_polling)) {
+    rpc_polling_time = static_cast<uint32_t>(std::stoul(rpc_polling));
+    if (rpc_polling_time > QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME) {
+      LOGS_DEFAULT(WARNING) << "rpc_polling_time exceed the max limit, use the limit instead: " << QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME;
+      rpc_polling_time = QNN_HTP_PERF_INFRASTRUCTURE_POWER_CONFIG_MAX_RPC_POLLING_TIME;
+    }
+    LOGS_DEFAULT(VERBOSE) << kOrtRunOptionsConfigQnnRpcPollingTime << ": " << rpc_polling_time;
   }
 
   if (GetPerThreadContext().IsHtpPowerConfigIdValid()) {
@@ -1090,9 +1117,10 @@ Status QNNExecutionProvider::OnRunStart(const onnxruntime::RunOptions& run_optio
                                                                     htp_performance_mode));
     }
 
-    if (rpc_control_latency > 0) {
+    if (rpc_control_latency > 0 || rpc_polling_time > 0) {
       ORT_RETURN_IF_ERROR(qnn_backend_manager_->SetRpcControlLatency(GetPerThreadContext().GetHtpPowerConfigId(),
-                                                                     rpc_control_latency));
+                                                                     rpc_control_latency,
+                                                                     rpc_polling_time));
     }
   }
 
diff --git a/onnxruntime/core/providers/qnn/qnn_execution_provider.h b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
index e0eaf31c94a36..cc3b21bc5aadb 100644
--- 
a/onnxruntime/core/providers/qnn/qnn_execution_provider.h
+++ b/onnxruntime/core/providers/qnn/qnn_execution_provider.h
@@ -148,6 +148,7 @@ class QNNExecutionProvider : public IExecutionProvider {
   uint32_t device_id_ = 0;
   qnn::HtpPerformanceMode default_htp_performance_mode_ = qnn::HtpPerformanceMode::kHtpDefault;
   uint32_t default_rpc_control_latency_ = 0;
+  uint32_t default_rpc_polling_time_ = 0;  // from the "rpc_polling_time" provider option; 0 means not set
   bool enable_HTP_FP16_precision_ = true;
   bool share_ep_contexts_ = false;
 #ifdef _WIN32
@@ -159,7 +160,8 @@ class QNNExecutionProvider : public IExecutionProvider {
     PerThreadContext(qnn::QnnBackendManager* qnn_backend_manager,
                      uint32_t device_id, uint32_t core_id,
                      qnn::HtpPerformanceMode default_htp_performance_mode,
-                     uint32_t default_rpc_control_latency);
+                     uint32_t default_rpc_control_latency,
+                     uint32_t default_rpc_polling_time);  // applied at construction only when > 0
     ~PerThreadContext();
     ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(PerThreadContext);
 
diff --git a/onnxruntime/test/providers/qnn/qnn_basic_test.cc b/onnxruntime/test/providers/qnn/qnn_basic_test.cc
index c4367aeb52edc..a6be6714ff5be 100644
--- a/onnxruntime/test/providers/qnn/qnn_basic_test.cc
+++ b/onnxruntime/test/providers/qnn/qnn_basic_test.cc
@@ -640,6 +640,7 @@ TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgSessionRunOption) {
 #else
   options["backend_path"] = "libQnnHtp.so";
 #endif
+  options["rpc_polling_time"] = "8888";  // provider-option default for RPC polling time
 
   auto qnn_ep = QnnExecutionProviderWithOptions(options, &session_opts);
   EXPECT_TRUE(session_obj.RegisterExecutionProvider(std::move(qnn_ep)).IsOK());
@@ -666,6 +667,10 @@ TEST_F(QnnHTPBackendTests, MultithreadHtpPowerCfgSessionRunOption) {
       ASSERT_TRUE(rt.IsOK());
       rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnPerfModePostRun, perf_modes[post_i].c_str());
       ASSERT_TRUE(rt.IsOK());
+      rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnRpcControlLatency, "1200");  // per-run RPC control latency
+      ASSERT_TRUE(rt.IsOK());
+      rt = run_opts.config_options.AddConfigEntry(kOrtRunOptionsConfigQnnRpcPollingTime, "1000");  // per-run value, differs from the "8888" provider option
+      ASSERT_TRUE(rt.IsOK());
 
       threads.push_back(std::thread(RunSessionAndVerify, std::ref(session_obj), run_opts,
                                     model->builder.feeds_, model->builder.output_names_,