Merge branch 'main' of https://github.com/microsoft/onnxruntime into …

…qgemmexc
xadupre · Nov 2, 2023 · e7e8725 · e7e8725
2 parents eeb4c77 + 178f7ca
commit e7e8725
Show file tree

Hide file tree

Showing 108 changed files with 3,599 additions and 1,197 deletions.
diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json
@@ -26,7 +26,7 @@
       "component": {
         "type": "git",
         "git": {
-          "commitHash": "0c296085f9f65f0f8ef7aec7b9eed55faf37dc40",
+          "commitHash": "b86cc54efce19530fb953e4b21f57e6b3888534c",
           "repositoryUrl": "https://github.com/onnx/onnx.git"
         },
         "comments": "git submodule at cmake/external/onnx"
@@ -192,16 +192,6 @@
         "comments": "mp11"
       }
     },
-    {
-      "component": {
-        "type": "git",
-        "git": {
-          "commitHash": "6a20ba82b439ea1fd650da4d389e96b60a1dd828",
-          "repositoryUrl": "https://github.com/onnx/onnx.git"
-        },
-        "comments": "onnx"
-      }
-    },
     {
       "component": {
         "type": "git",

diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt
@@ -1282,14 +1282,6 @@ if (onnxruntime_USE_OPENVINO)
     add_definitions(-DOPENVINO_CONFIG_CPU_FP16=1)
   endif()
 
-  if (onnxruntime_USE_OPENVINO_VPUX_FP16)
-    add_definitions(-DOPENVINO_CONFIG_VPUX_FP16=1)
-  endif()
-
-  if (onnxruntime_USE_OPENVINO_VPUX_U8)
-    add_definitions(-DOPENVINO_CONFIG_VPUX_U8=1)
-  endif()
-
   if (onnxruntime_USE_OPENVINO_GPU_FP32_NP)
     add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1)
     add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
@@ -1310,16 +1302,6 @@ if (onnxruntime_USE_OPENVINO)
     add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
   endif()
 
-  if (onnxruntime_USE_OPENVINO_VPUX_FP32_NP)
-    add_definitions(-DOPENVINO_CONFIG_VPUX_FP32=1)
-    add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
-  endif()
-
-  if (onnxruntime_USE_OPENVINO_VPUX_FP16_NP)
-    add_definitions(-DOPENVINO_CONFIG_VPUX_FP16=1)
-    add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1)
-  endif()
-
   if (onnxruntime_USE_OPENVINO_HETERO)
     add_definitions(-DOPENVINO_CONFIG_HETERO=1)
     add_definitions(-DDEVICE_NAME="${onnxruntime_USE_OPENVINO_DEVICE}")

diff --git a/cmake/deps.txt b/cmake/deps.txt
@@ -24,7 +24,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36
 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5
 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41
 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063
-onnx;https://github.com/onnx/onnx/archive/6a20ba82b439ea1fd650da4d389e96b60a1dd828.zip;179a22ad4cd67109c60031ae4b6cf2f434d8bd7e
+onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11
 #use the commit of supporting all the plugins and TRT 8.6-GA (https://github.com/onnx/onnx-tensorrt/commit/0462dc31ae78f48744b6141ae376df1f96d3f459)
 onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/a43ce67187bab219520fd80f21af8bbd4354bc8c.zip;572535aefef477050f86744dfab1fef840198035
 protobuf;https://github.com/protocolbuffers/protobuf/archive/refs/tags/v21.12.zip;7cf2733949036c7d52fda017badcab093fe73bfa

diff --git a/cmake/external/onnx b/cmake/external/onnx
diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake
@@ -40,6 +40,7 @@
         "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_slice.cc"
         "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_reshape.cc"
         "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_expand.cc"
+        "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/collective/distributed_reduce.cc"
       )
     endif()
     # add using ONNXRUNTIME_ROOT so they show up under the 'contrib_ops' folder in Visual Studio

diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake
@@ -94,6 +94,11 @@ set(contrib_ops_excluded_files
   "cuda_contrib_kernels.h"
   "inverse.cc"
   "fused_conv.cc"
+  "bert/group_query_attention_helper.h"
+  "bert/group_query_attention.h"
+  "bert/group_query_attention.cc"
+  "bert/group_query_attention_impl.h"
+  "bert/group_query_attention_impl.cu"
 )
 
 if (NOT onnxruntime_ENABLE_ATEN)
@@ -109,6 +114,7 @@ if (NOT onnxruntime_USE_NCCL)
   list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc")
   list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc")
   list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc")
+  list(APPEND contrib_ops_excluded_files "collective/distributed_reduce.cc")
 endif()
 
 set(provider_excluded_files

diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md
@@ -2422,14 +2422,14 @@ This version of the operator has been available since version 1 of the 'com.micr
 <dd>When buffered past_key and past_value is used (present_key uses same tensor as past_key), requiredto specify past_sequence_length (could be 0). Otherwise, past_sequence_length inferred from past_key.</dd>
 </dl>
 
-#### Outputs (1 - 3)
+#### Outputs
 
 <dl>
 <dt><tt>output</tt> : T</dt>
 <dd>3D output tensor with shape (batch_size, sequence_length, hidden_size)</dd>
-<dt><tt>present_key</tt> (optional) : T</dt>
+<dt><tt>present_key</tt> : T</dt>
 <dd>present state key with support for format BSNH or BNSH. When past_key uses same tensor as present_key(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +kv_sequence_length.</dd>
-<dt><tt>present_value</tt> (optional) : T</dt>
+<dt><tt>present_value</tt> : T</dt>
 <dd>present state value with support for format BSNH or BNSH. When past_value uses same tensor as present_value(k-v buffer), it is of length max_sequence_length... otherwise of length past_sequence_length +kv_sequence_length.</dd>
 </dl>
 

diff --git a/docs/python/ReadMeOV.rst b/docs/python/ReadMeOV.rst
@@ -7,7 +7,6 @@ OpenVINO™ Execution Provider for ONNX Runtime accelerates inference across man
  - Intel® CPUs
  - Intel® integrated GPUs
  - Intel® discrete GPUs
- - Intel® integrated VPUs
 
 Installation
 ------------
@@ -22,7 +21,6 @@ This package supports:
  - Intel® CPUs
  - Intel® integrated GPUs
  - Intel® discrete GPUs
- - Intel® integrated VPUs
 
 ``pip3 install onnxruntime-openvino``
 

diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h
@@ -611,7 +611,7 @@ typedef struct OrtMIGraphXProviderOptions {
 typedef struct OrtOpenVINOProviderOptions {
 #ifdef __cplusplus
   OrtOpenVINOProviderOptions() : device_type{},
-                                 enable_vpu_fast_compile{},
+                                 enable_npu_fast_compile{},
                                  device_id{},
                                  num_of_threads{},
                                  cache_dir{},
@@ -624,7 +624,7 @@ typedef struct OrtOpenVINOProviderOptions {
    * Valid settings are one of: "CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16"
    */
   const char* device_type;
-  unsigned char enable_vpu_fast_compile;  ///< 0 = disabled, nonzero = enabled
+  unsigned char enable_npu_fast_compile;  ///< 0 = disabled, nonzero = enabled
   const char* device_id;
   size_t num_of_threads;  ///< 0 = Use default number of threads
   const char* cache_dir;  // path is set to empty by default
@@ -4605,6 +4605,10 @@ struct OrtCustomOp {
   OrtStatusPtr(ORT_API_CALL* KernelComputeV2)(_In_ void* op_kernel, _In_ OrtKernelContext* context);
 
   OrtStatusPtr(ORT_API_CALL* InferOutputShapeFn)(_In_ const struct OrtCustomOp* op, _In_ OrtShapeInferContext*);
+
+  // Get start range
+  int(ORT_API_CALL* GetStartVersion)(_In_ const struct OrtCustomOp* op);
+  int(ORT_API_CALL* GetEndVersion)(_In_ const struct OrtCustomOp* op);
 };
 
 /*

diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h
@@ -2228,6 +2228,8 @@ struct ShapeInferContext {
 
 using ShapeInferFn = Ort::Status (*)(Ort::ShapeInferContext&);
 
+#define MAX_CUSTOM_OP_END_VER (1UL << 31) - 1
+
 template <typename TOp, typename TKernel, bool WithStatus = false>
 struct CustomOpBase : OrtCustomOp {
   CustomOpBase() {
@@ -2280,6 +2282,14 @@ struct CustomOpBase : OrtCustomOp {
     }
 
     SetShapeInferFn<TOp>(0);
+
+    OrtCustomOp::GetStartVersion = [](const OrtCustomOp* this_) {
+      return static_cast<const TOp*>(this_)->start_ver_;
+    };
+
+    OrtCustomOp::GetEndVersion = [](const OrtCustomOp* this_) {
+      return static_cast<const TOp*>(this_)->end_ver_;
+    };
   }
 
   // Default implementation of GetExecutionProviderType that returns nullptr to default to the CPU provider
@@ -2348,6 +2358,9 @@ struct CustomOpBase : OrtCustomOp {
  protected:
   // Helper function that returns a map of session config entries specified by CustomOpBase::GetSessionConfigKeys.
   void GetSessionConfigs(std::unordered_map<std::string, std::string>& out, ConstSessionOptions options) const;
+
+  int start_ver_ = 1;
+  int end_ver_ = MAX_CUSTOM_OP_END_VER;
 };
 
 }  // namespace Ort

diff --git a/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h b/include/onnxruntime/core/session/onnxruntime_lite_custom_op.h
@@ -773,8 +773,11 @@ struct OrtLiteCustomOp : public OrtCustomOp {
   PARSE_ARGS(Ort::Float8E5M2FNUZ_t, ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT8E5M2FNUZ)
 
   OrtLiteCustomOp(const char* op_name,
-                  const char* execution_provider) : op_name_(op_name),
-                                                    execution_provider_(execution_provider) {
+                  const char* execution_provider,
+                  int start_ver = 1, int end_ver = MAX_CUSTOM_OP_END_VER) : op_name_(op_name),
+                                                                            execution_provider_(execution_provider),
+                                                                            start_ver_(start_ver),
+                                                                            end_ver_(end_ver) {
     OrtCustomOp::version = ORT_API_VERSION;
 
     OrtCustomOp::GetName = [](const OrtCustomOp* op) { return static_cast<const OrtLiteCustomOp*>(op)->op_name_.c_str(); };
@@ -837,13 +840,26 @@ struct OrtLiteCustomOp : public OrtCustomOp {
     OrtCustomOp::KernelCompute = {};
 
     OrtCustomOp::InferOutputShapeFn = {};
+
+    OrtCustomOp::GetStartVersion = [](const OrtCustomOp* op) {
+      auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
+      return self->start_ver_;
+    };
+
+    OrtCustomOp::GetEndVersion = [](const OrtCustomOp* op) {
+      auto self = reinterpret_cast<const OrtLiteCustomOp*>(op);
+      return self->end_ver_;
+    };
   }
 
   const std::string op_name_;
   const std::string execution_provider_;
 
   std::vector<ONNXTensorElementDataType> input_types_;
   std::vector<ONNXTensorElementDataType> output_types_;
+
+  int start_ver_ = 1;
+  int end_ver_ = MAX_CUSTOM_OP_END_VER;
 };
 
 //////////////////////////// OrtLiteCustomFunc ////////////////////////////////
@@ -873,9 +889,11 @@ struct OrtLiteCustomFunc : public OrtLiteCustomOp {
   OrtLiteCustomFunc(const char* op_name,
                     const char* execution_provider,
                     ComputeFn compute_fn,
-                    ShapeInferFn shape_infer_fn = {}) : OrtLiteCustomOp(op_name, execution_provider),
-                                                        compute_fn_(compute_fn),
-                                                        shape_infer_fn_(shape_infer_fn) {
+                    ShapeInferFn shape_infer_fn = {},
+                    int start_ver = 1,
+                    int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, start_ver, end_ver),
+                                                           compute_fn_(compute_fn),
+                                                           shape_infer_fn_(shape_infer_fn) {
     ParseArgs<Args...>(input_types_, output_types_);
 
     OrtCustomOp::KernelCompute = [](void* op_kernel, OrtKernelContext* context) {
@@ -911,9 +929,11 @@ struct OrtLiteCustomFunc : public OrtLiteCustomOp {
   OrtLiteCustomFunc(const char* op_name,
                     const char* execution_provider,
                     ComputeFnReturnStatus compute_fn_return_status,
-                    ShapeInferFn shape_infer_fn = {}) : OrtLiteCustomOp(op_name, execution_provider),
-                                                        compute_fn_return_status_(compute_fn_return_status),
-                                                        shape_infer_fn_(shape_infer_fn) {
+                    ShapeInferFn shape_infer_fn = {},
+                    int start_ver = 1,
+                    int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, start_ver, end_ver),
+                                                           compute_fn_return_status_(compute_fn_return_status),
+                                                           shape_infer_fn_(shape_infer_fn) {
     ParseArgs<Args...>(input_types_, output_types_);
 
     OrtCustomOp::KernelComputeV2 = [](void* op_kernel, OrtKernelContext* context) -> OrtStatusPtr {
@@ -985,8 +1005,9 @@ struct OrtLiteCustomStruct : public OrtLiteCustomOp {
   };
 
   OrtLiteCustomStruct(const char* op_name,
-                      const char* execution_provider) : OrtLiteCustomOp(op_name,
-                                                                        execution_provider) {
+                      const char* execution_provider,
+                      int start_ver = 1,
+                      int end_ver = MAX_CUSTOM_OP_END_VER) : OrtLiteCustomOp(op_name, execution_provider, start_ver, end_ver) {
     SetCompute(&CustomOp::Compute);
 
     OrtCustomOp::CreateKernel = [](const OrtCustomOp* this_, const OrtApi* ort_api, const OrtKernelInfo* info) {
@@ -1049,25 +1070,31 @@ template <typename... Args>
 OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
                                     const char* execution_provider,
                                     void (*custom_compute_fn)(Args...),
-                                    Status (*shape_infer_fn)(ShapeInferContext&) = {}) {
+                                    Status (*shape_infer_fn)(ShapeInferContext&) = {},
+                                    int start_ver = 1,
+                                    int end_ver = MAX_CUSTOM_OP_END_VER) {
   using LiteOp = OrtLiteCustomFunc<Args...>;
-  return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn, shape_infer_fn).release();
+  return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn, shape_infer_fn, start_ver, end_ver).release();
 }
 
 template <typename... Args>
 OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
                                     const char* execution_provider,
                                     Status (*custom_compute_fn_v2)(Args...),
-                                    Status (*shape_infer_fn)(ShapeInferContext&) = {}) {
+                                    Status (*shape_infer_fn)(ShapeInferContext&) = {},
+                                    int start_ver = 1,
+                                    int end_ver = MAX_CUSTOM_OP_END_VER) {
   using LiteOp = OrtLiteCustomFunc<Args...>;
-  return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn_v2, shape_infer_fn).release();
+  return std::make_unique<LiteOp>(op_name, execution_provider, custom_compute_fn_v2, shape_infer_fn, start_ver, end_ver).release();
 }
 
 template <typename CustomOp>
 OrtLiteCustomOp* CreateLiteCustomOp(const char* op_name,
-                                    const char* execution_provider) {
+                                    const char* execution_provider,
+                                    int start_ver = 1,
+                                    int end_ver = MAX_CUSTOM_OP_END_VER) {
   using LiteOp = OrtLiteCustomStruct<CustomOp>;
-  return std::make_unique<LiteOp>(op_name, execution_provider).release();
+  return std::make_unique<LiteOp>(op_name, execution_provider, start_ver, end_ver).release();
 }
 
 }  // namespace Custom

diff --git a/js/web/docs/webgpu-operators.md b/js/web/docs/webgpu-operators.md
@@ -40,6 +40,7 @@ Do not modify directly.*
 | Expand | ai.onnx(8-12,13+) |  |
 | Flatten | ai.onnx(1-8,9-10,11-12,13+) |  |
 | Floor | ai.onnx(6-12,13+) |  |
+| FusedConv | com.microsoft(1+) |  |
 | Gather | ai.onnx(1-10,11-12,13+) |  |
 | GatherElements | ai.onnx(11-12,13+) |  |
 | Gelu | com.microsoft(1+) |  |

diff --git a/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts b/js/web/lib/wasm/jsep/webgpu/op-resolve-rules.ts
@@ -67,6 +67,7 @@ export const WEBGPU_OP_RESOLVE_RULES: Map<string, OperatorImplementation> = new
   ['Exp', [unaryOps.exp]],
   ['Expand', [expand]],
   ['Floor', [unaryOps.floor]],
+  ['FusedConv', [conv, parseConvAttributes]],
   ['Gather', [gather, parseGatherAttributes]],
   ['GatherElements', [gatherElements, parseGatherElementsAttributes]],
   ['Gelu', [unaryOps.gelu]],

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/activation_util.ts
@@ -41,12 +41,12 @@ export const activationFnSnippet =
       if (!activation) {
         return '';
       }
-
       // TODO: add implementations
       return '';
     };
 
 export const biasActivationSnippet = (hasBias: boolean, activation?: Activation): string => `
       ${hasBias ? 'value = value + getBiasByOutputCoords(coords);' : ''}
-      ${activation ? 'value = activation(value, coords);' : ''}
+      // TODO uncomment the following line when activation is supported above.
+      // ${activation ? 'value = activation(value, coords);' : ''}
       `;
diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
@@ -242,8 +242,9 @@ export const createConv2DMatMulProgramInfo =
         ${declareFunctions}
         ${
             conv2dCommonSnippet(
-                isChannelsLast, fitAOuter, fitBOuter, fitInner, hasBias, undefined, false, elementsSize[0],
-                elementsSize[1], elementsSize[2], t)}
+                isChannelsLast, fitAOuter, fitBOuter, fitInner, hasBias,
+                attributes.activation.toLowerCase() as Activation, false, elementsSize[0], elementsSize[1],
+                elementsSize[2], t)}
             ${
             isVec4 ?
                 makeMatMulPackedVec4Source(elementsPerThread, workGroupSize, t, undefined, !isChannelsLast, tileInner) :

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv_backprop_mm_webgpu.ts
@@ -236,7 +236,9 @@ export const createConv2DTransposeMatMulProgramInfo =
         const dimBOuter : i32 = ${dimBOuter};
         const dimInner : i32 = ${dimInner};
         ${declareFunctions}
-        ${conv2dTransposeCommonSnippet(isChannelsLast, hasBias, undefined, false, innerElementSize)}
+        ${
+            conv2dTransposeCommonSnippet(
+                isChannelsLast, hasBias, attributes.activation.toLowerCase() as Activation, false, innerElementSize)}
         ${
             isVec4 ? makeMatMulPackedVec4Source(
                          elementsPerThread, workGroupSize, 'f32', undefined, !isChannelsLast, tileInner) :

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts
@@ -23,7 +23,7 @@ import {TensorView} from '../../../tensor-view';
 import {ShapeUtil} from '../../../util';
 import {ProgramInfo} from '../../types';
 import {getBroadcastDims, IndicesHelper, inputVariable, outputVariable, ShaderHelper, tensorTypeToWsglStorageType} from '../common';
-import {getActicationSnippet, InternalActivationAttributes} from '../fuse-utils';
+import {getActivationSnippet, InternalActivationAttributes} from '../fuse-utils';
 
 import {typeSnippet} from './activation_util';
 
@@ -440,7 +440,7 @@ export const createMatmulProgramInfo =
       const dimInner = aShape[aShape.length - 1];
       const dimBOuter = bShape[bShape.length - 1];
       const isVec4 = dimInner % 4 === 0 && dimBOuter % 4 === 0;
-      const {activationFunction, applyActivation} = getActicationSnippet(activationAttributes);
+      const {activationFunction, applyActivation} = getActivationSnippet(activationAttributes, isVec4);
 
       // TODO: fine tune size
       const elementsPerThread = dimAOuter <= 8 ? [4, 1, 1] : [4, 4, 1];
@@ -473,8 +473,8 @@ export const createMatmulProgramInfo =
   const dimBOuter: i32 = ${dimBOuter};
   const dimInner: i32 = ${dimInner};
   ${shaderHelper.declareVariables(...inputVariables, output)}
-  ${declareFunctions}
   ${activationFunction}
+  ${declareFunctions}
   ${
           isVec4 ? makeMatMulPackedVec4Source(elementsPerThread, workgroupSize, dataType, batchDims) :
                    makeMatMulPackedSource(elementsPerThread, workgroupSize, dataType, batchDims)}