From 27a6890529b7bfc379d5bc655632ec320e309ff7 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Fri, 23 Aug 2024 13:56:07 +0800 Subject: [PATCH] [js/webgpu] Optimize conv1d by conv2d (#19388) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description Optimize conv1d to go to the conv2d path to utilize the conv2d's optimization path. See whisper-tiny-encoder model becomes 158.66 ms from 532.28 ms. Conv goes to Conv2DMatMul(8 ms) instead of GroupedConv(382 ms). Old profiling result: Kernel | Time (ms) | Percentage (%) -- | -- | -- Conv\|GroupedConv | 382.99 | 71.95 MatMul | 126.16 | 23.70 Softmax | 7.01 | 1.32 Transpose | 4.59 | 0.86 Add | 4.39 | 0.82 Mul | 2.36 | 0.44 Div | 1.44 | 0.27 ReduceMean\|ReduceMeanShared | 1.25 | 0.23 Erf | 0.85 | 0.16 Sub | 0.72 | 0.14 Pow | 0.46 | 0.09 Sqrt | 0.07 | 0.01 Sum | 532.28 |   New profiling result with this PR: Kernel | Time (ms) | Percentage (%) -- | -- | -- MatMul | 127.07 | 80.09 Conv\|Conv2DMatMul | 8.00 | 5.04 Softmax | 6.95 | 4.38 Transpose | 4.65 | 2.93 Add | 4.26 | 2.68 Mul | 2.56 | 1.61 Div | 1.51 | 0.95 ReduceMean\|ReduceMeanShared | 1.31 | 0.83 Erf | 0.85 | 0.54 Sub | 0.79 | 0.50 Pow | 0.46 | 0.29 Conv\|Transpose | 0.26 | 0.17 Sqrt | 0.00 | 0.00 Sum | 158.66 |   --------- Co-authored-by: Yulong Wang <7679871+fs-eire@users.noreply.github.com> --- .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts | 12 ++-- .../ops/3rd-party/matmul_packed_webgpu.ts | 22 +++--- .../lib/wasm/jsep/webgpu/ops/conv-grouped.ts | 8 ++- js/web/lib/wasm/jsep/webgpu/ops/conv.ts | 54 ++++++++++----- js/web/lib/wasm/jsep/webgpu/ops/matmul.ts | 12 ++-- js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 6 +- js/web/test/data/ops/conv1d.jsonc | 69 +++++++++++++++++++ js/web/test/suite-test-list.jsonc | 1 + 8 files changed, 141 insertions(+), 43 deletions(-) create mode 100644 js/web/test/data/ops/conv1d.jsonc diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts index 7884a3cd1a68..3ef5c943d562 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts @@ -182,6 +182,7 @@ export const createConv2DMatMulProgramInfo = ( dimInner: number, hasBias: boolean, sequentialAccessByThreads: boolean, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const isChannelsLast = attributes.format === 'NHWC'; const inChannels = isChannelsLast ? inputs[0].dims[3] : inputs[0].dims[1]; @@ -309,13 +310,16 @@ export const createConv2DMatMulProgramInfo = ( return { name: 'Conv2DMatMul', shaderCache: { - hint: `${attributes.cacheKey};${innerElementSize};${isVec4};${fitAOuter};${fitBOuter};${fitInner};${ - tileAOuter - };${tileBOuter};${tileInner}`, + hint: `${attributes.cacheKey};${innerElementSize};${isVec4};${fitAOuter};${fitBOuter};${fitInner};${tileAOuter};${tileBOuter};${tileInner}`, inputDependencies, }, getRunData: () => ({ - outputs: [{ dims: outputShape, dataType: inputs[0].dataType }], + outputs: [ + { + dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType, + }, + ], dispatchGroup: { x: dispatch[0], y: dispatch[1], z: dispatch[2] }, programUniforms, }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts index f9bc015055c9..f0287529ca08 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/matmul_packed_webgpu.ts @@ -110,13 +110,9 @@ export const makeMatMulPackedVec4Source = ( workPerThread[0] === 4 ) ) { - throw new Error(`If transposeA ${transposeA} is true, innerElementSize ${ - innerElementSize - } and workPerThread[1] ${workPerThread[1]} must be 4. + throw new Error(`If transposeA ${transposeA} is true, innerElementSize ${innerElementSize} and workPerThread[1] ${workPerThread[1]} must be 4. Otherwise, innerElementSize ${innerElementSize} must be 3 or 4. - tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}. tileInner ${ - tileInner - } must be divisible by workgroupSize[1] ${workgroupSize[1]}. colPerThread ${workPerThread[0]} must be 4.`); + tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}. tileInner ${tileInner} must be divisible by workgroupSize[1] ${workgroupSize[1]}. colPerThread ${workPerThread[0]} must be 4.`); } return ` var mm_Asub: array, ${tileAWidth / innerElementSize}>, ${tileAHight}>; @@ -227,11 +223,7 @@ export const makeMatMulPackedSource = ( !(tileAHight % workgroupSize[1] === 0 && tileAWidth % workgroupSize[0] === 0 && tileInner % workgroupSize[1] === 0) ) { throw new Error( - `tileAHight ${tileAHight} must be divisible by workgroupSize[1]${ - workgroupSize[1] - }, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${ - workgroupSize[0] - }, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}`, + `tileAHight ${tileAHight} must be divisible by workgroupSize[1]${workgroupSize[1]}, tileAWidth ${tileAWidth} must be divisible by workgroupSize[0]${workgroupSize[0]}, tileInner ${tileInner} must be divisible by workgroupSize[1]${workgroupSize[1]}`, ); } const rowPerThreadA = tileAHight / workgroupSize[1]; @@ -470,6 +462,7 @@ export const createMatmulProgramInfo = ( outputShape: readonly number[], reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; @@ -562,7 +555,12 @@ export const createMatmulProgramInfo = ( inputDependencies, }, getRunData: () => ({ - outputs: [{ dims: outputShape, dataType: inputs[0].dataType }], + outputs: [ + { + dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType, + }, + ], dispatchGroup: { x: dispatch[0], y: dispatch[1], z: dispatch[2] }, programUniforms, }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts index dbe0e0c9647b..1ad4149b01e0 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv-grouped.ts @@ -145,6 +145,7 @@ export const createGroupedConvVectorizeProgramInfo = ( inputs: readonly TensorView[], attributes: ConvAttributes, outputShape: readonly number[], + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const hasBias = inputs.length > 2; const components = getMaxComponents(outputShape[3]); @@ -234,7 +235,12 @@ export const createGroupedConvVectorizeProgramInfo = ( inputDependencies: hasBias ? ['rank', 'rank', 'type'] : ['rank', 'rank'], }, getRunData: () => ({ - outputs: [{ dims: outputShape, dataType: inputs[0].dataType }], + outputs: [ + { + dims: squeezeOutputShapeFunction ? squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType, + }, + ], dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) }, programUniforms, }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts index 2a32b566ba4b..241aae8c4660 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/conv.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/conv.ts @@ -152,9 +152,12 @@ export const parseConvAttributes = (attributes: Record): ConvAt }; }; -const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attributes: ConvAttributes): void => { - const adjustedAttributes = getAdjustedConvAttributes(attributes, inputs); - +const conv2d = ( + context: ComputeContext, + inputs: readonly TensorView[], + attributes: ConvAttributes, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], +): void => { // check attributes // const hasPreluActivationWeights = false; /* TODO: add support for prelu activation weights */ @@ -177,7 +180,7 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut inputs[0].dims, inputs[1].dims, attributes.dilations, - adjustedAttributes.pads, + attributes.pads, attributes.strides, isChannelsLast, ); @@ -194,11 +197,12 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut if (inputs.length === 3) { convInputs.push(inputs[2]); } - context.compute(createGroupedConvVectorizeProgramInfo(convInputs, adjustedAttributes, outputShape), { - inputs: convInputs, - }); + context.compute( + createGroupedConvVectorizeProgramInfo(convInputs, attributes, outputShape, squeezeOutputShapeFunction), + { inputs: convInputs }, + ); } else { - context.compute(createGroupedConvProgramInfo(inputs, adjustedAttributes)); + context.compute(createGroupedConvProgramInfo(inputs, attributes, squeezeOutputShapeFunction)); } return; } @@ -214,7 +218,7 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut inputs[0].dims, inputs[1].dims, attributes.dilations, - adjustedAttributes.pads, + attributes.pads, attributes.strides, isChannelsLast, ); @@ -280,12 +284,26 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut // Tune the threshold. 
if (N < 8 && K < 8) { context.compute( - createNaiveMatmulProgramInfo(matmulInputs, adjustedAttributes, outputShape, matmulOutputShape, isChannelsLast), + createNaiveMatmulProgramInfo( + matmulInputs, + attributes, + outputShape, + matmulOutputShape, + isChannelsLast, + squeezeOutputShapeFunction, + ), { inputs: matmulInputs }, ); } else { context.compute( - createMatmulProgramInfo(matmulInputs, adjustedAttributes, outputShape, matmulOutputShape, isChannelsLast), + createMatmulProgramInfo( + matmulInputs, + attributes, + outputShape, + matmulOutputShape, + isChannelsLast, + squeezeOutputShapeFunction, + ), { inputs: matmulInputs }, ); } @@ -320,13 +338,14 @@ const conv2d = (context: ComputeContext, inputs: readonly TensorView[], attribut context.compute( createConv2DMatMulProgramInfo( convInputs, - adjustedAttributes, + attributes, outputShape, dimAOuter, dimBOuter, dimInner, hasBias, sequentialAccessByThreads, + squeezeOutputShapeFunction, ), { inputs: convInputs }, ); @@ -357,12 +376,8 @@ const conv1d = (context: ComputeContext, attributes: ConvAttributes): void => { { ...attributes, pads, strides, dilations, kernelShape }, inputs, ); - context.compute( - createGroupedConvProgramInfo(inputs, adjustedAttributes, (outputShape) => - isChannelLast - ? [outputShape[0], outputShape[2], outputShape[3]] - : [outputShape[0], outputShape[1], outputShape[3]], - ), + conv2d(context, inputs, adjustedAttributes, (outputShape) => + isChannelLast ? [outputShape[0], outputShape[2], outputShape[3]] : [outputShape[0], outputShape[1], outputShape[3]], ); }; @@ -398,6 +413,7 @@ export const conv = (context: ComputeContext, attributes: ConvAttributes): void } else if (context.inputs[0].dims.length === 5) { conv3d(context, context.inputs, attributes); } else { - conv2d(context, context.inputs, attributes); + const adjustedAttributes = getAdjustedConvAttributes(attributes, context.inputs); + conv2d(context, context.inputs, adjustedAttributes); } }; diff --git a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts index d2a6b2d352e2..7605e67c972b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/matmul.ts @@ -32,6 +32,7 @@ export const createNaiveMatmulProgramInfo = ( outputShape: readonly number[], reshapedOutputShape?: readonly number[], isChannelsLast = false /* only used for conv2dByMatMul*/, + squeezeOutputShapeFunction?: (shape: readonly number[]) => number[], ): ProgramInfo => { const aShape = inputs[0].dims; const bShape = inputs[1].dims; @@ -120,9 +121,7 @@ export const createNaiveMatmulProgramInfo = ( for (let j = 0; j < aComponents; j++) { calcStr += ` - values[${i}] = fma(${b.type.value}(a_data${aComponents === 1 ? '' : `[${j}]`}), b_data${j}, values[${ - i - }]);\n`; + values[${i}] = fma(${b.type.value}(a_data${aComponents === 1 ? '' : `[${j}]`}), b_data${j}, values[${i}]);\n`; } } return calcStr; @@ -168,7 +167,12 @@ export const createNaiveMatmulProgramInfo = ( inputDependencies: hasBias ? ['rank', 'rank', 'rank'] : ['rank', 'rank'], }, getRunData: () => ({ - outputs: [{ dims: outputShape, dataType: inputs[0].dataType }], + outputs: [ + { + dims: squeezeOutputShapeFunction ? 
squeezeOutputShapeFunction(outputShape) : outputShape, + dataType: inputs[0].dataType, + }, + ], dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) }, programUniforms, }), diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 4c1131477cd0..3c08580128e0 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -83,14 +83,14 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu return { name: 'Transpose', shaderCache: { hint: `${permAttr}`, inputDependencies: ['rank'] }, - getRunData: (inputs) => { + getRunData: () => { const outputSize = ShapeUtil.size(outputShape); return { - outputs: [{ dims: outputShape, dataType: inputs[0].dataType }], + outputs: [{ dims: outputShape, dataType: inputTensor.dataType }], dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) }, programUniforms: [ { type: DataType.uint32, data: outputSize }, - ...createTensorShapeVariables(inputs[0].dims, outputShape), + ...createTensorShapeVariables(inputTensor.dims, outputShape), ], }; }, diff --git a/js/web/test/data/ops/conv1d.jsonc b/js/web/test/data/ops/conv1d.jsonc new file mode 100644 index 000000000000..a387f0de324a --- /dev/null +++ b/js/web/test/data/ops/conv1d.jsonc @@ -0,0 +1,69 @@ +[ + { + "name": "conv 1D without bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30], + "dims": [1, 1, 3], + "type": "float32" + }, + { + "data": [1, 2], + "dims": [1, 1, 2], + "type": "float32" + } + ], + "outputs": [ + { + "data": [50, 80], + "dims": [1, 1, 2], + "type": "float32" + } + ] + } + ] + }, + { + "name": "conv 1D with bias addition A", + "operator": "Conv", + "inputShapeDefinitions": "rankOnly", + "opset": { "domain": "", "version": 17 }, + "attributes": [{ "name": "kernel_shape", "data": [2], "type": "ints" }], + "cases": [ + { + "name": "T[0]", + "inputs": [ + { + "data": [10, 20, 30, 40], + "dims": [1, 2, 2], + "type": "float32" + }, + { + "data": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], + "dims": [4, 2, 2], + "type": "float32" + }, + { + "data": [0.1, 0.2, 0.3, 0.4], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [100.1, 100.2, 100.3, 100.4], + "dims": [1, 4, 1], + "type": "float32" + } + ] + } + ] + } +] diff --git a/js/web/test/suite-test-list.jsonc b/js/web/test/suite-test-list.jsonc index 44b89142790a..edbaeb6f4095 100644 --- a/js/web/test/suite-test-list.jsonc +++ b/js/web/test/suite-test-list.jsonc @@ -1347,6 +1347,7 @@ "concat_zero-sized.jsonc", "cast.jsonc", "conv.jsonc", + "conv1d.jsonc", "conv3dncdhw.jsonc", "cos.jsonc", "div.jsonc",
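For readers skimming the diff, the core idea is that a 1-D convolution can be lowered to a 2-D convolution by inserting a singleton spatial dimension and squeezing it back out of the result; the PR threads a `squeezeOutputShapeFunction` through the conv2d program-info builders so the reported output shape is the squeezed 3-D shape. Below is a minimal standalone sketch of that idea in plain TypeScript, assuming stride 1, no padding, no dilation, no groups, and NCW/NCHW layout; `conv2dReference` and `conv1dViaConv2d` are hypothetical names and this is not the jsep/WebGPU implementation.

```ts
// conv1d-via-conv2d sketch (illustrative only; not the jsep/WebGPU code).
// Assumes NCW/NCHW layout, stride 1, no padding, no dilation, no groups.

type Tensor3D = number[][][]; // [N, C, W]
type Tensor4D = number[][][][]; // [N, C, H, W]

// Naive 2-D convolution reference.
function conv2dReference(x: Tensor4D, w: Tensor4D, bias?: number[]): Tensor4D {
  const kH = w[0][0].length;
  const kW = w[0][0][0].length;
  const outH = x[0][0].length - kH + 1;
  const outW = x[0][0][0].length - kW + 1;
  return x.map((image) =>
    w.map((filter, oc) =>
      Array.from({ length: outH }, (_, oh) =>
        Array.from({ length: outW }, (_, ow) => {
          let acc = bias ? bias[oc] : 0;
          for (let ic = 0; ic < image.length; ic++) {
            for (let r = 0; r < kH; r++) {
              for (let s = 0; s < kW; s++) {
                acc += image[ic][oh + r][ow + s] * filter[ic][r][s];
              }
            }
          }
          return acc;
        }),
      ),
    ),
  );
}

// 1-D convolution expressed as a 2-D convolution: insert a singleton H
// dimension into the input and the kernel, run conv2d, then squeeze the
// singleton dimension back out of the output (the role played by
// `squeezeOutputShapeFunction` in this PR).
function conv1dViaConv2d(x: Tensor3D, w: Tensor3D, bias?: number[]): Tensor3D {
  const x4: Tensor4D = x.map((image) => image.map((channel) => [channel])); // [N, C, 1, W]
  const w4: Tensor4D = w.map((filter) => filter.map((channel) => [channel])); // [M, C, 1, kW]
  const y4 = conv2dReference(x4, w4, bias); // [N, M, 1, outW]
  return y4.map((image) => image.map((channel) => channel[0])); // squeeze -> [N, M, outW]
}
```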
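As a rough sanity check, the sketch above reproduces the expected outputs of the two cases added in `js/web/test/data/ops/conv1d.jsonc`; the `ones` helper below is illustrative only.

```ts
// Case "conv 1D without bias addition A": x dims [1, 1, 3], w dims [1, 1, 2].
console.log(conv1dViaConv2d([[[10, 20, 30]]], [[[1, 2]]]));
// -> [[[50, 80]]]   (1*10 + 2*20 = 50, 1*20 + 2*30 = 80)

// Case "conv 1D with bias addition A": x dims [1, 2, 2], w dims [4, 2, 2] (all ones),
// bias [0.1, 0.2, 0.3, 0.4].
const ones = (): number[][] => [
  [1, 1],
  [1, 1],
];
console.log(
  conv1dViaConv2d([[[10, 20], [30, 40]]], [ones(), ones(), ones(), ones()], [0.1, 0.2, 0.3, 0.4]),
);
// -> [[[100.1], [100.2], [100.3], [100.4]]]   (10 + 20 + 30 + 40 = 100, plus bias; up to float rounding)
```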