Merge branch 'master' into use-intrinsics-in-core-convert

openvinotoolkit · Oct 3, 2024 · 5f52d43 · 5f52d43
2 parents 58e7f17 + 1b892bf
commit 5f52d43
Show file tree

Hide file tree

Showing 13 changed files with 102 additions and 69 deletions.
diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml
@@ -7,10 +7,6 @@ on:
         description: 'Machine on which the tests would run'
         type: string
         required: true
-      shell:
-        description: "shell to override the default shell settings in the runner's operating system."
-        type: string
-        required: true
       container:
         description: 'JSON to be converted to the value of the "container" configuration for the job'
         type: string
@@ -20,12 +16,15 @@ on:
         description: 'Components that are affected by changes in the commit defined by the Smart CI Action'
         type: string
         required: true
+      python-version:
+        description: 'Python version to setup. E.g., "3.11"'
+        type: string
+        required: true
 
 permissions: read-all
 
 env:
   PIP_CACHE_PATH: /mount/caches/pip/linux
-  PYTHON_VERSION: '3.11'
 
 jobs:
   PyTorch_Layer_Tests:
@@ -35,7 +34,7 @@ jobs:
     container: ${{ fromJSON(inputs.container) }}
     defaults:
       run:
-        shell: ${{ inputs.shell }}
+        shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }}
     env:
       DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input
       OPENVINO_REPO: ${{ github.workspace }}/openvino
@@ -55,12 +54,6 @@ jobs:
           name: openvino_tests
           path: ${{ env.INSTALL_TEST_DIR }}
 
-      - name: Download OpenVINO tokenizers extension
-        uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
-        with:
-          name: openvino_tokenizers_wheel
-          path: ${{ env.INSTALL_DIR }}
-
       # Needed as ${{ github.workspace }} is not working correctly when using Docker
       - name: Setup Variables
         if: runner.os != 'Windows'
@@ -98,10 +91,10 @@ jobs:
           sparse-checkout-cone-mode: false
           path: 'openvino'
 
-      - name: Setup Python ${{ env.PYTHON_VERSION }}
+      - name: Setup Python ${{ inputs.python-version }}
         uses: ./openvino/.github/actions/setup_python
         with:
-          version: ${{ env.PYTHON_VERSION }}
+          version: ${{ inputs.python-version }}
           pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }}
           should-setup-pip-paths: ${{ runner.os == 'Linux' }}
           self-hosted-runner: ${{ runner.os == 'Linux' }}
@@ -112,43 +105,39 @@ jobs:
           # Install the core OV wheel
           python3 -m pip install ${INSTALL_DIR}/tools/openvino-*.whl
 
-          # Install the core OV Tokenizers wheel
-          python3 -m pip install ${INSTALL_DIR}/openvino_tokenizers-*.whl
-
       - name: Install OpenVINO Python wheels (Windows)
         if: runner.os == 'Windows'
         run: |
           # Find and install the core OV wheel
           $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }}\tools -Filter openvino-*.whl | % { $_.FullName }
           python3 -m pip install "$ovCoreWheelPath"
 
-          # Find and install the core OV Tokenizers wheel
-          $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }} -Filter openvino_tokenizers-*.whl | % { $_.FullName }
-          python3 -m pip install "$ovCoreWheelPath"
-
       - name: Install Pytorch Layer tests dependencies
         run: |
           # pytorch test requirements
           python3 -m pip install -r ${{ env.INSTALL_TEST_DIR }}/requirements_pytorch
 
       - name: PyTorch Layer Tests
         if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196
-        run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
+        # Due to CVS-152795, the tests cannot run in parallel on Windows, so '-n logical' is passed only on other platforms
+        run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
         env:
           TEST_DEVICE: CPU
           TEST_PRECISION: FP32
+          PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}}
 
       - name: PyTorch torch.export Layer Tests
-        if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287
+        if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287
         run: |
-          python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
+          python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
         env:
           TEST_DEVICE: CPU
           TEST_PRECISION: FP32
           PYTORCH_TRACING_MODE: EXPORT
+          PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}}
 
       - name: PyTorch torch.compile TORCHFX Layer Tests
-        if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' }} # Ticket: 126287
+        if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287
         run: |
           python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -m precommit_fx_backend --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
         env:

diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml
@@ -7,10 +7,6 @@ on:
         description: 'Machine on which the tests would run'
         type: string
         required: true
-      shell:
-        description: "shell to override the default shell settings in the runner's operating system."
-        type: string
-        required: true
       container:
         description: 'JSON to be converted to the value of the "container" configuration for the job'
         type: string
@@ -20,12 +16,15 @@ on:
         description: 'Components that are affected by changes in the commit defined by the Smart CI Action'
         type: string
         required: true
+      python-version:
+        description: 'Python version to setup. E.g., "3.11"'
+        type: string
+        required: true
 
 permissions: read-all
 
 env:
   PIP_CACHE_PATH: /mount/caches/pip/linux
-  PYTHON_VERSION: '3.11'
 
 jobs:
   TensorFlow_Layer_Tests:
@@ -35,7 +34,7 @@ jobs:
     container: ${{ fromJSON(inputs.container) }}
     defaults:
       run:
-        shell: ${{ inputs.shell }}
+        shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }}
     env:
       DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input
       OPENVINO_REPO: ${{ github.workspace }}/openvino
@@ -98,10 +97,10 @@ jobs:
           sparse-checkout-cone-mode: false
           path: 'openvino'
 
-      - name: Setup Python ${{ env.PYTHON_VERSION }}
+      - name: Setup Python ${{ inputs.python-version }}
         uses: ./openvino/.github/actions/setup_python
         with:
-          version: ${{ env.PYTHON_VERSION }}
+          version: ${{ inputs.python-version }}
           pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }}
           should-setup-pip-paths: ${{ runner.os == 'Linux' }}
           self-hosted-runner: ${{ runner.os == 'Linux' }}

diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml
@@ -173,19 +173,19 @@ jobs:
     uses: ./.github/workflows/job_tensorflow_layer_tests.yml
     with:
       runner: 'aks-linux-16-cores-arm'
-      shell: bash
       container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}'
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   Pytorch_Layer_Tests:
     name: Pytorch Layer Tests
-    needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ]
+    needs: [ Build, Docker, Smart_CI ]
     uses: ./.github/workflows/job_pytorch_layer_tests.yml
     with:
       runner: 'aks-linux-16-cores-arm'
-      shell: bash
       container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}'
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   CPU_Functional_Tests:
     name: CPU functional tests

diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml
@@ -276,17 +276,17 @@ jobs:
     uses: ./.github/workflows/job_tensorflow_layer_tests.yml
     with:
       runner: 'macos-13'
-      shell: bash
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   Pytorch_Layer_Tests:
     name: Pytorch Layer Tests
-    needs: [ Build, Smart_CI, Openvino_tokenizers ]
+    needs: [ Build, Smart_CI ]
     uses: ./.github/workflows/job_pytorch_layer_tests.yml
     with:
       runner: 'macos-13'
-      shell: bash
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   CPU_Functional_Tests:
     name: CPU functional tests

diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml
@@ -275,17 +275,17 @@ jobs:
     uses: ./.github/workflows/job_tensorflow_layer_tests.yml
     with:
       runner: 'macos-13-xlarge'
-      shell: bash
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   Pytorch_Layer_Tests:
     name: Pytorch Layer Tests
-    needs: [ Build, Smart_CI, Openvino_tokenizers ]
+    needs: [ Build, Smart_CI ]
     uses: ./.github/workflows/job_pytorch_layer_tests.yml
     with:
       runner: 'macos-13-xlarge'
-      shell: bash
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   CPU_Functional_Tests:
     name: CPU functional tests

diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml
@@ -305,19 +305,19 @@ jobs:
     uses: ./.github/workflows/job_tensorflow_layer_tests.yml
     with:
       runner: 'aks-linux-4-cores-16gb'
-      shell: bash
       container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}'
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   Pytorch_Layer_Tests:
     name: Pytorch Layer Tests
-    needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ]
+    needs: [ Docker, Build, Smart_CI ]
     uses: ./.github/workflows/job_pytorch_layer_tests.yml
     with:
       runner: 'aks-linux-4-cores-16gb'
-      shell: bash
       container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}'
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   CPU_Functional_Tests:
     name: CPU functional tests

diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml
@@ -133,6 +133,16 @@ jobs:
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
       python-version: '3.12'
 
+  Pytorch_Layer_Tests:
+    name: Pytorch Layer Tests
+    needs: [ Docker, Build, Smart_CI ]
+    uses: ./.github/workflows/job_pytorch_layer_tests.yml
+    with:
+      runner: 'aks-linux-4-cores-16gb'
+      container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}'
+      affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.12'
+
   Overall_Status:
     name: ci/gha_overall_status_ubuntu_24
     needs: [Smart_CI, Build, Debian_Packages, Samples, Python_Unit_Tests]

diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml
@@ -404,17 +404,17 @@ jobs:
     uses: ./.github/workflows/job_tensorflow_layer_tests.yml
     with:
       runner: 'aks-win-8-cores-16gb'
-      shell: pwsh
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   Pytorch_Layer_Tests:
     name: Pytorch Layer Tests
-    needs: [ Build, Smart_CI, Openvino_tokenizers ]
+    needs: [ Build, Smart_CI ]
     uses: ./.github/workflows/job_pytorch_layer_tests.yml
     with:
       runner: 'aks-win-8-cores-16gb'
-      shell: pwsh
       affected-components: ${{ needs.smart_ci.outputs.affected_components }}
+      python-version: '3.11'
 
   CXX_Unit_Tests:
     name: C++ unit tests

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -809,7 +809,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
     uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
 #endif
 
+#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
+    const int power_of_two_for_simd = 5;
+    const int power_of_two_for_osv = 6;
+    const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv);
+    const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1);
+    const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd;
+    // out_f(32)  : 0  * osv_weight_stride + 32;
+    // out_f(64)  : 64 * osv_weight_stride + 0;
+    // out_f(96)  : 64 * osv_weight_stride + 32;
+    // ...
+    uint weights_offset =  osv64_weight_base * osv_weight_stride + out_f_offset;
+#else
     uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2);
+#endif
 
     ACCUMULATOR_VEC_TYPE    acc[TILE_B] = { };
 
@@ -905,7 +918,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
 
         __local int* char_slm_weight = (__local int*)wei_local_mem;
 
+        #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
+        uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2;
+        #else
         uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE;
+        #endif
         uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2;
 
         // DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE
@@ -917,6 +934,17 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
                 // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. So no need to transpose while unpacking
                 dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
                 dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
+            #elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
+                SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx);
+                SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD)));
+                DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked;
+                DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp;
+                dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
+                dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
+                dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01;
+                dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45;
+                dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23;
+                dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67;
             #else
                 SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx);
                 DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed));
@@ -996,11 +1024,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
                 acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight);
             }
 
-            #if FILTER_LAYOUT_OS_IYX_OSV16 && TILE_OFM == 2
-                weights_offset += (TILE_K_OFM_PACKED/2) * SIMD;
-            #else
-                weights_offset += TILE_K_OFM_PACKED * SIMD;
-            #endif
+            weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;
 
             #if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
                 unroll_for (uint bi = 0; bi < TILE_B; ++bi) {

diff --git a/...intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/...intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -534,6 +534,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
     size_t tile_k_ofm_packed = tile_k_ofm;
     size_t quantize_grp_size = get_dynamic_quantize_group_size(params);
 
+    bool add_decompress_scale_post_op = false;
     WeightsType weights_dt = params.weights.GetDType();
     if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) {
         tile_k_ofm_packed /= 2;
@@ -542,7 +543,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
         const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v;
         // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance
         if (scale_group_size % simd == 0 && !dispatchData.use_slm)
-            jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
+            add_decompress_scale_post_op = true;
     }
     if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
         jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii"));
@@ -619,6 +620,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
         jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
     } else {
+        if (add_decompress_scale_post_op)
+            jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size));
     }
@@ -781,8 +784,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
     auto output_f = get_output_aligned_bf_size(fc_params, false).second;
 
     WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16;
-    // TODO: Update may also be required to fc_bf_tiled_kernel_dyn_quan kernel to support os_is_yx_osv64_isv2 format as needed
-    if (!should_dynamic_quantize(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
+    if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
         && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2)
         && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4)
         && is_weight_horizontal(fc_params, output_f)) {