From 0ae44841dc164c9abd7a731bbfd35b95644cd19c Mon Sep 17 00:00:00 2001 From: Roman Kazantsev Date: Thu, 3 Oct 2024 15:44:09 +0400 Subject: [PATCH 1/4] [PT FE][GHA] Run PT FE layer tests on Ubuntu 24.04 with Python 3.12 and NumPy 2.X (#26886) **Details:** Run PT FE layer tests on Ubuntu 24.04 with Python 3.12 and NumPy 2.X Also, this PR contains fixes: - WA sporadic bug on Windows in case parallel run - support PT FE and TF FE layer tests on MacOS x86 - leftovers from code-review **Tickets:** 154003, 153800 --------- Signed-off-by: Kazantsev, Roman --- .github/workflows/job_pytorch_layer_tests.yml | 39 +++++++------------ .../workflows/job_tensorflow_layer_tests.yml | 15 ++++--- .github/workflows/linux_arm64.yml | 6 +-- .github/workflows/mac.yml | 6 +-- .github/workflows/mac_arm64.yml | 6 +-- .github/workflows/ubuntu_22.yml | 6 +-- .github/workflows/ubuntu_24.yml | 10 +++++ .github/workflows/windows_vs2019_release.yml | 6 +-- .../pytorch_tests/test_bitwise_ops.py | 6 ++- tests/requirements_pytorch | 26 +++++++------ tests/requirements_tensorflow | 3 +- 11 files changed, 68 insertions(+), 61 deletions(-) diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index 88b41f983f7094..50942cf331ab72 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -7,10 +7,6 @@ on: description: 'Machine on which the tests would run' type: string required: true - shell: - description: "shell to override the default shell settings in the runner's operating system." - type: string - required: true container: description: 'JSON to be converted to the value of the "container" configuration for the job' type: string @@ -20,12 +16,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' jobs: PyTorch_Layer_Tests: @@ -35,7 +34,7 @@ jobs: container: ${{ fromJSON(inputs.container) }} defaults: run: - shell: ${{ inputs.shell }} + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} env: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input OPENVINO_REPO: ${{ github.workspace }}/openvino @@ -55,12 +54,6 @@ jobs: name: openvino_tests path: ${{ env.INSTALL_TEST_DIR }} - - name: Download OpenVINO tokenizers extension - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 - with: - name: openvino_tokenizers_wheel - path: ${{ env.INSTALL_DIR }} - # Needed as ${{ github.workspace }} is not working correctly when using Docker - name: Setup Variables if: runner.os != 'Windows' @@ -98,10 +91,10 @@ jobs: sparse-checkout-cone-mode: false path: 'openvino' - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./openvino/.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} @@ -112,9 +105,6 @@ jobs: # Install the core OV wheel python3 -m pip install ${INSTALL_DIR}/tools/openvino-*.whl - # Install the core OV Tokenizers wheel - python3 -m pip install ${INSTALL_DIR}/openvino_tokenizers-*.whl - - name: Install OpenVINO Python wheels (Windows) if: runner.os == 'Windows' run: | @@ -122,10 +112,6 @@ jobs: $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }}\tools -Filter openvino-*.whl | % { $_.FullName } python3 -m pip install "$ovCoreWheelPath" - # Find and install the core OV Tokenizers wheel - $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }} -Filter openvino_tokenizers-*.whl 
| % { $_.FullName } - python3 -m pip install "$ovCoreWheelPath" - - name: Install Pytorch Layer tests dependencies run: | # pytorch test requirements @@ -133,22 +119,25 @@ jobs: - name: PyTorch Layer Tests if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196 - run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + # due to CVS-152795, parallel run is not possible on Windows + run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} - name: PyTorch torch.export Layer Tests - if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287 + if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | - python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 PYTORCH_TRACING_MODE: EXPORT + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} - name: PyTorch torch.compile TORCHFX Layer Tests - if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' }} # Ticket: 126287 + if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -m precommit_fx_backend 
--junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index 0801010b86bde3..e8d7b51e14c02f 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ b/.github/workflows/job_tensorflow_layer_tests.yml @@ -7,10 +7,6 @@ on: description: 'Machine on which the tests would run' type: string required: true - shell: - description: "shell to override the default shell settings in the runner's operating system." - type: string - required: true container: description: 'JSON to be converted to the value of the "container" configuration for the job' type: string @@ -20,12 +16,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' jobs: TensorFlow_Layer_Tests: @@ -35,7 +34,7 @@ jobs: container: ${{ fromJSON(inputs.container) }} defaults: run: - shell: ${{ inputs.shell }} + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} env: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input OPENVINO_REPO: ${{ github.workspace }}/openvino @@ -98,10 +97,10 @@ jobs: sparse-checkout-cone-mode: false path: 'openvino' - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./openvino/.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 3506ca49846f45..e4e608f3aca6d4 100644 --- 
a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -173,19 +173,19 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-linux-16-cores-arm' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-linux-16-cores-arm' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index da3224fa483ad1..20db9de1776015 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -276,17 +276,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'macos-13' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'macos-13' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 331afc7266cd6a..a38179f71fb60c 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -275,17 +275,17 @@ jobs: uses: 
./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'macos-13-xlarge' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'macos-13-xlarge' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 8f461391f20a9f..2c20e5136cfc4e 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -305,19 +305,19 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-linux-4-cores-16gb' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] + needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-linux-4-cores-16gb' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index 6409b417a0731b..295a4dd0e2c61a 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -133,6 +133,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + Pytorch_Layer_Tests: + name: 
Pytorch Layer Tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_pytorch_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + Overall_Status: name: ci/gha_overall_status_ubuntu_24 needs: [Smart_CI, Build, Debian_Packages, Samples, Python_Unit_Tests] diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 39cf2161525513..122fcc3c1c5021 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -404,17 +404,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-win-8-cores-16gb' - shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-win-8-cores-16gb' - shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CXX_Unit_Tests: name: C++ unit tests diff --git a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py index 1cf458500bcc71..e55a86f279de21 100644 --- a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py +++ b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py @@ -4,6 +4,8 @@ import numpy as np import pytest import torch +from packaging import version + from pytorch_layer_test_class import PytorchLayerTest, skip_if_export @@ -69,10 +71,12 @@ def forward_not_out(self, tensor_a, out): ) @pytest.mark.parametrize("out", [False, skip_if_export(True)]) def test_bitwise_mixed_dtypes( - self, op_type, out, lhs_dtype, 
rhs_dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version + self, op_type, out, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version ): if ie_device == "GPU" and (lhs_dtype != "bool" or rhs_dtype != "bool"): pytest.xfail(reason="bitwise ops are not supported on GPU") + if out and version.parse(np.__version__) >= version.parse("2.0.0"): + pytest.xfail(reason="CVS-154082: incorrect handling out type") self._test( *self.create_model(op_type, out), ie_device, diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index b82e0c76409057..0d5ac61903b104 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -1,10 +1,14 @@ +# test ovc with NumPy 2.x on Ubuntu 24 with default Python 3.12 +# test against NumPy 1.x with older Python versions # optimum still requires numpy<2.0.0 -numpy==1.26.4 +numpy==1.26.4; python_version < "3.12" +numpy==2.1.1; python_version >= "3.12" torch==2.4.1; platform_system != "Darwin" or platform_machine != "x86_64" -torch==2.2.0; platform_system == "Darwin" and platform_machine == "x86_64" +torch==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" --extra-index-url https://download.pytorch.org/whl/cpu -torchvision==0.19.1 +torchvision==0.19.1; platform_system != "Darwin" or platform_machine != "x86_64" +torchvision==0.17.2; platform_system == "Darwin" and platform_machine == "x86_64" # transformers 4.45.1 is available # but optimum still requires <4.45.0 transformers==4.44.2 @@ -13,22 +17,22 @@ pytest-html==4.1.1 pytest-xdist[psutil]==3.6.1 defusedxml==0.7.1 -auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64" +auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12" av==13.0.0 -basicsr==1.4.2 +basicsr==1.4.2; python_version < "3.12" datasets==3.0.1 easyocr==1.7.2 -facexlib==0.3.0 -librosa==0.10.2 -optimum==1.22.0 +facexlib==0.3.0; python_version < "3.12" +librosa==0.10.2; python_version < 
"3.12" +optimum==1.22.0; python_version < "3.12" packaging==24.1 pandas==2.2.3 protobuf==5.28.2 -pyctcdecode==0.5.0 +pyctcdecode==0.5.0; python_version < "3.12" sacremoses==0.1.1 sentencepiece==0.2.0 soundfile==0.12.1 -super-image==0.1.7 +super-image==0.1.7; python_version < "3.12" timm==1.0.8 torchaudio==2.4.1 wheel==0.44.0 @@ -36,7 +40,7 @@ PyYAML==6.0.2 kornia==0.7.3 # use latest released version once it's available -git+https://github.com/huggingface/optimum-intel.git@main +git+https://github.com/huggingface/optimum-intel.git@main; python_version < "3.12" # set 'export HF_HUB_ENABLE_HF_TRANSFER=1' to benefits from hf_transfer hf_transfer==0.1.8 diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow index 9d025397ed1fbd..6042eb8a46a9c3 100644 --- a/tests/requirements_tensorflow +++ b/tests/requirements_tensorflow @@ -4,7 +4,8 @@ pytest==7.0.1 pytest-xdist[psutil]==3.6.1 pytest-html==4.1.1 transformers==4.45.1 -tensorflow==2.17.0 +tensorflow==2.17.0; platform_system != "Darwin" or platform_machine != "x86_64" +tensorflow==2.16.2; platform_system == "Darwin" and platform_machine == "x86_64" # tensorflow-text is not available for both Windows and ARM platforms tensorflow-text==2.17.0; platform_system == "Linux" and platform_machine == "x86_64" tensorflow-hub==0.16.1 From 1b892bfb00fcbccec8db96f66a86e3b1e01f6262 Mon Sep 17 00:00:00 2001 From: Pavel Durandin Date: Thu, 3 Oct 2024 13:43:02 +0400 Subject: [PATCH 2/4] [GPU] Fix double jit constants (#26893) ### Details: - Fix double constant definition --- .../fully_connected/fully_connected_kernel_bf_tiled.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 24641f3eb6aab0..c4115d74f54a92 100644 --- 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -534,6 +534,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para size_t tile_k_ofm_packed = tile_k_ofm; size_t quantize_grp_size = get_dynamic_quantize_group_size(params); + bool add_decompress_scale_post_op = false; WeightsType weights_dt = params.weights.GetDType(); if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) { tile_k_ofm_packed /= 2; @@ -542,7 +543,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v; // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance if (scale_group_size % simd == 0 && !dispatchData.use_slm) - jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); + add_decompress_scale_post_op = true; } if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii")); @@ -619,6 +620,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size)); } else { + if (add_decompress_scale_post_op) + jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size)); } From 4254c13364ac212e47590184d82c6746bd36aae5 Mon Sep 17 00:00:00 2001 From: "Anastasiya(Asya) Pronina" Date: Thu, 3 Oct 2024 12:27:03 +0200 Subject: [PATCH 3/4] NPUW: Bring back unpack and partitioning unit tests for NPUW (#26885) This PR adds unit tests on 1. unpack routines within NPUW 2. 
main online partitioning functionality (smaller unit tests on Graph, Group, Repeated, etc will be added separately) Brings back https://github.com/openvinotoolkit/openvino/pull/25780 Local run: ``` [----------] Global test environment tear-down [==========] 334 tests from 6 test suites ran. (3379 ms total) [ PASSED ] 334 tests. ``` --------- Co-authored-by: Alexey Smirnov Co-authored-by: Dmitry Matveev --- .../npuw/partitioning/online/snapshot.hpp | 16 +- src/plugins/intel_npu/tests/CMakeLists.txt | 1 + .../intel_npu/tests/unit/CMakeLists.txt | 46 ++ .../tests/unit/npuw/online_partitioning.cpp | 692 ++++++++++++++++++ .../intel_npu/tests/unit/npuw/unpack.cpp | 103 +++ .../intel_npu/tests/unit/npuw/unpack.hpp | 628 ++++++++++++++++ 6 files changed, 1478 insertions(+), 8 deletions(-) create mode 100644 src/plugins/intel_npu/tests/unit/CMakeLists.txt create mode 100644 src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp create mode 100644 src/plugins/intel_npu/tests/unit/npuw/unpack.cpp create mode 100644 src/plugins/intel_npu/tests/unit/npuw/unpack.hpp diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp index 72a62781580cda..e7e5121b1240e7 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/online/snapshot.hpp @@ -16,8 +16,6 @@ namespace ov { namespace npuw { namespace online { -class Group; // forward declaration - namespace detail { // At partitioning level we exclude some "non-Ops" to not interfere with the passes. 
// We include some of them back to properly link everything at plugin level @@ -33,6 +31,8 @@ class Snapshot : public std::enable_shared_from_this { m_node_to_prod_cons(std::make_shared()), m_node_to_gr(std::make_shared()) {} + friend class Group; // forward declaration + // Simple passes void singleGroup(); @@ -49,27 +49,27 @@ class Snapshot : public std::enable_shared_from_this { void repeatedBlocks(); void earlyAvoids(); void earlyRegroup(); - void markInternalCompute(); - void resetExcludedRep(); // Utility std::shared_ptr getGraph() const; - size_t graphSize() const; - const detail::OVNodeSet& getNodeProducers(const detail::OVNodePtr& node) const; - const detail::OVNodeSet& getNodeConsumers(const detail::OVNodePtr& node) const; const detail::OVPortsMap& getPortsMap() const; const detail::OVNodeToGroupMapPtr& getNodeToGroupMap() const; const std::map>>& getMatches() const; - detail::GPtrSet getRepGroups(const std::shared_ptr& group) const; void repeat(detail::Pass&& pass); void setCtx(const PassContext& ctx); + size_t graphSize() const; private: + detail::GPtrSet getRepGroups(const std::shared_ptr& group) const; + const detail::OVNodeSet& getNodeProducers(const detail::OVNodePtr& node) const; + const detail::OVNodeSet& getNodeConsumers(const detail::OVNodePtr& node) const; void identifyUniques(); void mergeUniques(); void mergeTriangles(); void cleanUpUniques(); void afterUniques(); + void markInternalCompute(); + void resetExcludedRep(); bool cleanUpUniquesImpl(const detail::GPtrSet& gset); std::shared_ptr tryGrowRepeatingGroups(const detail::GPtrSet& repeating_groups); std::shared_ptr tryMergeTriangles(const detail::GPtrSet& repeating_groups); diff --git a/src/plugins/intel_npu/tests/CMakeLists.txt b/src/plugins/intel_npu/tests/CMakeLists.txt index 4c41f008eb7f81..0f5bd7a6b093b2 100644 --- a/src/plugins/intel_npu/tests/CMakeLists.txt +++ b/src/plugins/intel_npu/tests/CMakeLists.txt @@ -8,3 +8,4 @@ if (MSVC) ov_add_compiler_flags(/wd5105) endif() 
add_subdirectory(functional) +add_subdirectory(unit) diff --git a/src/plugins/intel_npu/tests/unit/CMakeLists.txt b/src/plugins/intel_npu/tests/unit/CMakeLists.txt new file mode 100644 index 00000000000000..861a0ff6a47076 --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/CMakeLists.txt @@ -0,0 +1,46 @@ +# Copyright (C) 2024 Intel Corporation +# SPDX-License-Identifier: Apache-2.0 +# + +set(TARGET_NAME "ov_npu_unit_tests") + +set(MANDATORY_UNIT_TESTS_LIBS + "openvino::commonTestUtils" + "openvino::gmock" + "openvino::gtest" + "openvino::gtest_main" + "openvino::runtime" + "openvino::npu_al" + "openvino::npu_logger_utils" +) + +ov_add_test_target( + NAME ${TARGET_NAME} + ROOT ${CMAKE_CURRENT_SOURCE_DIR} + ADDITIONAL_SOURCE_DIRS + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/npuw/ + DEPENDENCIES + openvino::runtime + INCLUDES + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/npuw + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/npuw + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/utils/include + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/plugin/include + ${OpenVINO_SOURCE_DIR}/src/plugins/intel_npu/src/al/include + LINK_LIBRARIES + ${MANDATORY_UNIT_TESTS_LIBS} + LABELS + NPUW +) + +if(ENABLE_AVX2) + ov_avx2_optimization_flags(avx2_flags) + target_compile_options(${TARGET_NAME} PRIVATE "${avx2_flags}") +endif() + +install(TARGETS ${TARGET_NAME} + RUNTIME DESTINATION tests + COMPONENT tests + EXCLUDE_FROM_ALL +) diff --git a/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp b/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp new file mode 100644 index 00000000000000..af1fc5de8e92c7 --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/online_partitioning.cpp @@ -0,0 +1,692 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include + +#include "partitioning/online/compiler.hpp" +#include "partitioning/online/snapshot.hpp" +#include 
"partitioning/online/group.hpp" + +#include "intel_npu/al/config/config.hpp" +#include "intel_npu/al/config/npuw.hpp" + +#include "openvino/openvino.hpp" +#include "openvino/op/ops.hpp" +#include "openvino/op/util/op_types.hpp" + +bool isEqualEns(ov::npuw::Ensemble& ens1, ov::npuw::Ensemble& ens2); +bool isEqualEns(ov::npuw::Ensemble& ens1, ov::npuw::Ensemble& ens2) { + if (ens1.groups.size() != ens2.groups.size()) { + return false; + } + + for (auto& g : ens1.groups) { + std::sort(g.input_layers.begin(), g.input_layers.end()); + std::sort(g.output_layers.begin(), g.output_layers.end()); + std::sort(g.all_layers.begin(), g.all_layers.end()); + } + + for (auto& g : ens2.groups) { + std::sort(g.input_layers.begin(), g.input_layers.end()); + std::sort(g.output_layers.begin(), g.output_layers.end()); + std::sort(g.all_layers.begin(), g.all_layers.end()); + } + + std::sort(ens1.groups.begin(), ens1.groups.end(), [](const ov::npuw::Group& g1, + const ov::npuw::Group& g2){ + return g1.all_layers.front() < g2.all_layers.front(); + }); + + std::sort(ens2.groups.begin(), ens2.groups.end(), [](const ov::npuw::Group& g1, + const ov::npuw::Group& g2){ + return g1.all_layers.front() < g2.all_layers.front(); + }); + + for (size_t i = 0; i < ens1.groups.size(); ++i) { + const auto& g1 = ens1.groups.at(i); + const auto& g2 = ens2.groups.at(i); + + if (g1.avoid_list != g2.avoid_list || + g1.input_layers != g2.input_layers || + g1.output_layers != g2.output_layers || + g1.all_layers != g2.all_layers) { + return false; + } + + // Can't compare them directly since they are random, but they don't affect the structure + if ((g1.repeated_id.empty() && !g2.repeated_id.empty()) || + (!g1.repeated_id.empty() && g2.repeated_id.empty())) { + return false; + } + } + + if (ens1.repeated.size() != ens2.repeated.size()) { + return false; + } + + auto get_sorted_rep = [](const std::map& rep) { + std::vector>> sorted_rep; + + std::transform(rep.begin(), rep.end(), std::back_inserter(sorted_rep),
[](const auto& v) { + return v.second.matches; + }); + + for (auto& g : sorted_rep) { + std::sort(g.begin(), g.end(), + [](const auto& a, const auto& b) {return *a.begin() < *b.begin();}); + } + + std::sort(sorted_rep.begin(), sorted_rep.end(), + [](const auto& a, const auto& b) {return *a.front().begin() < *b.front().begin();}); + + return sorted_rep; + }; + + + if (get_sorted_rep(ens1.repeated) != get_sorted_rep(ens2.repeated)) { + return false; + } + + return true; +} + +class ModelGenerator { +public: + ModelGenerator() = default; + + std::shared_ptr get_model_without_repeated_blocks() { + std::shared_ptr input = std::make_shared(ov::element::i32, ov::Shape{1, 1, 40}); + m_nodes.push_back(input); + set_name(input); + + std::shared_ptr res = get_block(input); + + auto result = std::make_shared(res); + m_nodes.push_back(result); + set_name(result); + + ov::ParameterVector params = {input}; + ov::ResultVector results = {result}; + + return std::make_shared(results, params); + } + + std::shared_ptr get_model_with_repeated_blocks() { + // Generate head + std::shared_ptr input = std::make_shared(ov::element::i32, ov::Shape{1, 1, 40}); + m_nodes.push_back(input); + set_name(input); + + std::vector> head(7, nullptr); + head[0] = std::make_shared(input, input); + head[1] = std::make_shared(ov::element::i32, ov::Shape{1}, std::vector{2}); + head[2] = std::make_shared(head[0], head[1], true); + head[3] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 4, 10}); + head[4] = std::make_shared(ov::element::i64, ov::Shape{3}, std::vector{1, 1, 40}); + head[5] = std::make_shared(head[2], head[3], false); + head[6] = std::make_shared(head[5], head[4], false); + + for (const auto& h : head) { + m_nodes.push_back(h); + set_name(h); + } + + // Generate repeated blocks + std::shared_ptr output = get_block(head[6]); + std::vector> outputs; + outputs.push_back(output); + + for (size_t i = 0; i < 9; ++i) { + output = get_block(output); + outputs.push_back(output); + } 
+ + // Generate tail + std::vector> tail(6, nullptr); + tail[0] = std::make_shared(outputs, -1); + tail[1] = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 20, 20}); + tail[2] = std::make_shared(tail[0], tail[1], false); + tail[3] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1}); + tail[4] = std::make_shared(tail[2], tail[3]); + tail[5] = std::make_shared(tail[4], tail[4]); + + for (const auto& t : tail) { + m_nodes.push_back(t); + set_name(t); + } + + // Create model + auto result = std::make_shared(tail[5]); + m_nodes.push_back(result); + set_name(result); + + ov::ParameterVector params = {input}; + ov::ResultVector results = {result}; + + return std::make_shared(results, params); + } + + std::shared_ptr get_block(const std::shared_ptr& input) { + // Parameters + // input + + // Constants + std::vector> model_c(18, nullptr); + model_c[0] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{0, 2, 1, 3}); + model_c[1] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{1}); + model_c[2] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[3] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{2}); + model_c[4] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[5] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 1, 1}); + model_c[6] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{1}); + model_c[7] = std::make_shared(ov::element::i64, ov::Shape{1}, std::vector{0}); + model_c[8] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 1, 1}); + model_c[9] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 2}); + model_c[10] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 1}); + model_c[11] = std::make_shared(ov::element::i32, ov::Shape{4}, std::vector{1, 1, 1, 2}); + model_c[12] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 1}); + model_c[13] = 
std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 1}); + model_c[14] = std::make_shared(ov::element::i32, ov::Shape{1, 1, 1, 1}); + model_c[15] = std::make_shared(ov::element::f32, ov::Shape{40, 40}); + model_c[16] = std::make_shared(ov::element::i64, ov::Shape{4}, std::vector{1, 1, 4, 10}); + model_c[17] = std::make_shared(ov::element::i32, ov::Shape{3}, std::vector{1, 1, 40}); + + for (const auto& c : model_c) { + m_nodes.push_back(c); + set_name(c); + } + + // Converts + std::vector> convert(3, nullptr); + convert[0] = std::make_shared(model_c[15], ov::element::f16); + convert[1] = std::make_shared(convert[0], ov::element::i32); + convert[2] = std::make_shared(model_c[12], ov::element::i32); + + for (const auto& c : convert) { + m_nodes.push_back(c); + set_name(c); + } + + // Ops + std::vector> op(16, nullptr); + op[0] = std::make_shared(input, convert[1], false, true); + op[1] = std::make_shared(op[0], model_c[16], false); + op[2] = std::make_shared(op[1], model_c[0]); + op[3] = std::make_shared(op[2]); + op[4] = std::make_shared(op[3], model_c[1], model_c[2]); + op[5] = std::make_shared(op[4], model_c[3], true); + op[6] = std::make_shared(op[5]); + op[7] = std::make_shared(model_c[5], model_c[6], op[6], model_c[7]); + op[8] = std::make_shared(op[2], + model_c[8], + op[7], + model_c[9], + std::vector{1, 1, 1, 1}, + std::vector{1, 1, 1, 1}); + op[9] = std::make_shared(op[2], + op[7], + model_c[10], + model_c[11], + std::vector{1, 1, 1, 1}, + std::vector{1, 1, 1, 1}); + op[10] = std::make_shared(op[9], convert[2]); + op[11] = std::make_shared(std::vector>{op[10], op[8]}, -1); + op[12] = std::make_shared(model_c[13], op[11]); + op[13] = std::make_shared(model_c[14], op[2]); + op[14] = std::make_shared(op[13], op[12]); + op[15] = std::make_shared(op[14], model_c[17], false); + + for (const auto& o : op) { + m_nodes.push_back(o); + set_name(o); + } + + return op[15]; + } + +private: + void set_name(const std::shared_ptr& node) { + node->set_friendly_name("node_" 
+ std::to_string(m_name_idx++)); + } + + std::vector> m_nodes; + size_t m_name_idx; +}; + +TEST(OnlinePartitioningTest, Partitioning_IsTheSame_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto opt_desc = std::make_shared<::intel_npu::OptionsDesc>(); + auto cfg = ::intel_npu::Config(opt_desc); + ::intel_npu::registerNPUWOptions(*opt_desc); + std::map cfg_map = {{ "NPUW_ONLINE_KEEP_BLOCK_SIZE", "9" }}; + cfg.update(cfg_map); + + auto ens = ov::npuw::online::buildPartitioning(model, cfg); + + for (size_t i = 0; i < 100; ++i) { + auto ens_again = ov::npuw::online::buildPartitioning(model, cfg); + EXPECT_TRUE(isEqualEns(ens, ens_again)); + } +} + +TEST(OnlinePartitioningTest, Partitioning_IsTheSame_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto opt_desc = std::make_shared<::intel_npu::OptionsDesc>(); + auto cfg = ::intel_npu::Config(opt_desc); + ::intel_npu::registerNPUWOptions(*opt_desc); + std::map cfg_map = {{ "NPUW_ONLINE_KEEP_BLOCK_SIZE", "9" }}; + cfg.update(cfg_map); + + auto ens = ov::npuw::online::buildPartitioning(model, cfg); + + for (size_t i = 0; i < 100; ++i) { + auto ens_again = ov::npuw::online::buildPartitioning(model, cfg); + EXPECT_TRUE(isEqualEns(ens, ens_again)); + } +} + +TEST(OnlinePartitioningTest, Partitioning_SingleGroup_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->singleGroup(); + EXPECT_EQ(snap->graphSize(), 1); +} + +TEST(OnlinePartitioningTest, Partitioning_SingleGroup_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->singleGroup(); + EXPECT_EQ(snap->graphSize(), 1); +} + +TEST(OnlinePartitioningTest, Partitioning_buildGraph_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + 
snap->buildGraph(); + auto g = snap->getGraph(); + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + } + EXPECT_EQ(snap->getNodeToGroupMap()->size(), snap->graphSize()); +} + +TEST(OnlinePartitioningTest, Partitioning_buildGraph_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + auto g = snap->getGraph(); + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + } + EXPECT_EQ(snap->getNodeToGroupMap()->size(), snap->graphSize()); +} + +TEST(OnlinePartitioningTest, Partitioning_earlyAvoids_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + ov::npuw::online::PassContext ctx; + ctx.avoids = {{ov::npuw::online::PatternType::OP, "Gather", "mydevice"}, {ov::npuw::online::PatternType::OP, "MatMul", "mydevice"}}; + snap->setCtx(ctx); + snap->buildGraph(); + snap->earlyAvoids(); + auto g = snap->getGraph(); + size_t count = 0; + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + if (group->avoidedTargets().size() == 1 && *(group->avoidedTargets().begin()) == "mydevice") { + ++count; + } + } + EXPECT_EQ(count, 2); +} + +TEST(OnlinePartitioningTest, Partitioning_earlyAvoids_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + ov::npuw::online::PassContext ctx; + ctx.avoids = {{ov::npuw::online::PatternType::OP, "Gather", "mydevice"}, {ov::npuw::online::PatternType::OP, "MatMul", "mydevice"}}; + snap->setCtx(ctx); + snap->buildGraph(); + snap->earlyAvoids(); + auto g = snap->getGraph(); + size_t count = 0; + for (const auto& nh : g->sorted()) { + ov::npuw::online::Group::GPtr group = 
g->meta(nh).get(); + EXPECT_EQ(group->size(), 1); + if (group->avoidedTargets().size() == 1 && *(group->avoidedTargets().begin()) == "mydevice") { + ++count; + } + } + EXPECT_EQ(count, 20); +} + +TEST(OnlinePartitioningTest, Partitioning_collectLHF_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->collectLHF(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_collectLHF_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {82, 82}; + size_t iter = 0; + + snap->repeat([&]{ + snap->collectLHF(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnants_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnants(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnants_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {75, 38, 19, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnants(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnantsExtended_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + 
snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnantsExtended(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseRemnantsExtended_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {10, 10}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseRemnantsExtended(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseInputs_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {15, 14, 14}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseInputs(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_fuseInputs_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes = {148, 138, 138}; + size_t iter = 0; + + snap->repeat([&]{ + snap->fuseInputs(); + EXPECT_LT(iter, sizes.size()); + EXPECT_EQ(snap->graphSize(), sizes[iter++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Just_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes_lhf = {10, 10}; + size_t iter_lhf = 0; + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + snap->repeat([&] { + snap->collectLHF(); + EXPECT_LT(iter_lhf, sizes_lhf.size()); + EXPECT_EQ(snap->graphSize(), sizes_lhf[iter_lhf++]); + }); + snap->repeat([&] { + snap->fuseRemnants(); + EXPECT_LT(iter_fr, sizes_fr.size()); + 
EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Just_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + std::vector sizes_lhf = {82, 82}; + size_t iter_lhf = 0; + + std::vector sizes_fr = {41, 21, 11, 10, 10}; + size_t iter_fr = 0; + + snap->repeat([&] { + snap->collectLHF(); + EXPECT_LT(iter_lhf, sizes_lhf.size()); + EXPECT_EQ(snap->graphSize(), sizes_lhf[iter_lhf++]); + }); + snap->repeat([&] { + snap->fuseRemnants(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_RepeatedBlocks_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + snap->earlyAvoids(); + snap->earlyRegroup(); + snap->repeatedBlocks(); + EXPECT_EQ(snap->graphSize(), 17); + + auto matches = snap->getMatches(); + EXPECT_EQ(matches.size(), 0); + + snap->repeat([&] { + snap->fuseRemnantsExtended(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_RepeatedBlocks_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + snap->buildGraph(); + + + std::vector sizes_fr = {12, 12}; + size_t iter_fr = 0; + + snap->earlyAvoids(); + snap->earlyRegroup(); + snap->repeatedBlocks(); + EXPECT_EQ(snap->graphSize(), 18); + + auto matches = snap->getMatches(); + EXPECT_EQ(matches.size(), 1); + + for (const auto& m : matches) { + EXPECT_EQ(m.second.size(), 17); + for (const auto& layers : m.second) { + EXPECT_EQ(layers.size(), 10); + } + } + + snap->repeat([&] { + snap->fuseRemnantsExtended(); + 
EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Compute_SmallModel) { + ModelGenerator mg; + auto model = mg.get_model_without_repeated_blocks(); + + auto snap = std::make_shared(model); + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + ov::npuw::online::PassContext ctx; + ctx.isolates = {{ov::npuw::online::PatternType::OP, "Transpose", "test_compute"}, {ov::npuw::online::PatternType::OP, "ScatterUpdate", "test_compute"}}; + ctx.nofolds = {"test_compute"}; + snap->setCtx(ctx); + + snap->buildGraph(); + snap->earlyAvoids(); + snap->earlyRegroup(); + snap->repeatedBlocks(); + EXPECT_EQ(snap->graphSize(), 17); + + auto matches = snap->getMatches(); + EXPECT_EQ(matches.size(), 0); + + snap->repeat([&] { + snap->fuseRemnantsExtended(); + EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} + +TEST(OnlinePartitioningTest, Partitioning_Compiler_Compute_RepeatedModel) { + ModelGenerator mg; + auto model = mg.get_model_with_repeated_blocks(); + + auto snap = std::make_shared(model); + + std::vector sizes_fr = {10, 10}; + size_t iter_fr = 0; + + ov::npuw::online::PassContext ctx; + ctx.isolates = {{ov::npuw::online::PatternType::OP, "Gather", "test_compute"}, + {ov::npuw::online::PatternType::OP, "ScatterUpdate", "test_compute"}, + {ov::npuw::online::PatternType::OP, "ShapeOf", "test_compute"}, + {ov::npuw::online::PatternType::OP, "Divide", "test_compute"}, + {ov::npuw::online::PatternType::OP, "Floor", "test_compute"}}; + ctx.nofolds = {"test_compute"}; + snap->setCtx(ctx); + + snap->buildGraph(); + snap->earlyAvoids(); + snap->earlyRegroup(); + snap->repeatedBlocks(); + EXPECT_EQ(snap->graphSize(), 29); + + // FIXME: create a config in which there will be repeated blocks + auto matches = snap->getMatches(); + EXPECT_EQ(matches.size(), 0); + + snap->repeat([&] { + snap->fuseRemnantsExtended(); + 
EXPECT_LT(iter_fr, sizes_fr.size()); + EXPECT_EQ(snap->graphSize(), sizes_fr[iter_fr++]); + }); +} diff --git a/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp new file mode 100644 index 00000000000000..1049832f6ead7c --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp @@ -0,0 +1,103 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#ifdef HAVE_AVX2 +#include "unpack.hpp" + +namespace { + +const auto TestCases = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::i4}), + ::testing::ValuesIn({ov::element::Type_t::i8, ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::undefined}), // no used in this test + ::testing::ValuesIn({ov::element::Type_t::undefined}), // no used in this test + ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={1, 1, 1, 32};}, + Tensors{input={1,1,1, 128};}, + Tensors{input={1,1,1, 390};}, + Tensors{input={1,1,1, 82};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + +INSTANTIATE_TEST_SUITE_P(UnpackTests, UnpackTests, + TestCases, + UnpackTests::getTestCaseName); + +const auto TestCasesScale = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::i4}), // TODO: add i8 as input for test + ::testing::ValuesIn({ov::element::Type_t::f16, ov::element::Type_t::f32}), + ::testing::ValuesIn({ov::element::Type_t::f16, ov::element::Type_t::f32}), + ::testing::ValuesIn({ov::element::Type_t::undefined}), // no used in this test + ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={1,32, 128}; scale = {1, 32, 1};}, + Tensors{input={32, 128}; scale = {32, 1};}, + Tensors{input={64, 160}; scale = {64, 1};}, + Tensors{input={1024, 4}; scale = {64, 1};}, + Tensors{input={1, 1, 1024, 4}; scale = {1, 1, 64, 1};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + 
+INSTANTIATE_TEST_SUITE_P(UnpackWithScaleTests, UnpackWithScaleTests, + TestCasesScale, + UnpackWithScaleTests::getTestCaseName); + + +const auto TestCasesScaleAndZeroPoints = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::u4}), + ::testing::ValuesIn({ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::u4}), + ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={1,32, 128}; scale = {1, 32, 1};}, + Tensors{input={1,64, 160}; scale = {1, 64, 1};}, + Tensors{input={1,1024, 4}; scale = {1, 64, 1};}, + Tensors{input={1,1, 1024, 4}; scale = {1, 1, 64, 1};}, + Tensors{input={64, 1}; scale = {64, 1};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + +INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPoint, UnpackTestsWithScaleAndZeroPoint, + TestCasesScaleAndZeroPoints, + UnpackTestsWithScaleAndZeroPoint::getTestCaseName); + +const auto TestCasesScaleAndZeroPoints2 = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::u4}), + ::testing::ValuesIn({ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::f32}), + ::testing::ValuesIn({ov::element::Type_t::f32}), + ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={32, 32, 64}; scale = {32, 1, 64};}, + Tensors{input={64, 64, 128}; scale = {64, 1, 128};}, + Tensors{input={64, 32, 32}; scale = {64, 1, 32};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + +INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPointTest2, UnpackTestsWithScaleAndZeroPointTest2, + TestCasesScaleAndZeroPoints2, + UnpackTestsWithScaleAndZeroPointTest2::getTestCaseName); + +const auto TestCasesScaleAndZeroPoints3 = ::testing::Combine( + ::testing::ValuesIn({ov::element::Type_t::u4}), + ::testing::ValuesIn({ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::f16}), + ::testing::ValuesIn({ov::element::Type_t::u4}), 
+ ::testing::ValuesIn({3lu, 0lu}), + ::details::ShapesIn({Tensors{input={1, 32, 128}; scale = {1, 32, 1}; zerop = {1, 32, 1};}, + Tensors{input={16, 64, 64}; scale = {16, 64, 1}; zerop = {16, 64, 1};}, + Tensors{input={1, 1024, 4}; scale = {1, 64, 1}; zerop = {1, 32, 1};}}), + ::testing::ValuesIn({true, false}), + ::testing::ValuesIn({true, false}) +); + +INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPointTest3, UnpackTestsWithScaleAndZeroPointTest3, + TestCasesScaleAndZeroPoints3, + UnpackTestsWithScaleAndZeroPointTest3::getTestCaseName); + +} // anonymous namespace + +#endif // __AVX2__ diff --git a/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp b/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp new file mode 100644 index 00000000000000..da5bb4e4720f3e --- /dev/null +++ b/src/plugins/intel_npu/tests/unit/npuw/unpack.hpp @@ -0,0 +1,628 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include +#include +#include +#include +#include +#include +#include + +#include "openvino/runtime/make_tensor.hpp" + +#include "util.hpp" + +namespace { + +#define ASSERT_NO_THROW_WITH_MESSAGE(code) do{ \ + try {\ + code;\ + }catch (const std::exception &ex ) {\ + FAIL()<> 4) | ((x & (1 << 6)) >> 4) | ((x & (1 << 5)) >> 4) | ((x & (1 << 4)) >> 4); +} + +inline int8_t lo4(int8_t x) { + return (x & (1 << 3)) | (x & (1 << 2)) | (x & (1 << 1)) | (x & (1 << 0)); +} + +inline uint8_t hi4(uint8_t x) { + return x >> 4; +} + +inline uint8_t lo4(uint8_t x) { + return x & 0x0F; +} + +inline int8_t upc(int8_t h) { + return h | (-((h & (1 << 3)) >> 3) & (-8)); +} + +typedef unsigned short ushort; +typedef unsigned int uint; + +float half_to_float(const ushort x) { + + __m128i halfVector = _mm_cvtsi32_si128(x); + __m128 floatVector = _mm_cvtph_ps(halfVector); + return _mm_cvtss_f32(floatVector); +} + +ushort float_to_half(const float x) { + __m128 floatVector = _mm_set_ss(x); + __m128i halfVector = _mm_cvtps_ph(floatVector, 
_MM_FROUND_TO_NEAREST_INT); + return _mm_extract_epi16(halfVector, 0); +} + +inline uint16_t int2hfloat(int8_t x) +{ + float inputFl32 = static_cast(x); + float* inputFl32_ptr = &inputFl32; + unsigned int* fltInt32Ptr = reinterpret_cast(inputFl32_ptr); + unsigned int fltInt32 = *fltInt32Ptr; + unsigned short fltInt16; + + fltInt16 = (fltInt32 >> 31) << 5; + unsigned short tmp = (fltInt32 >> 23) & 0xff; + tmp = (tmp - 0x70) & ((unsigned int)((int)(0x70 - tmp) >> 4) >> 27); + fltInt16 = (fltInt16 | tmp) << 10; + fltInt16 |= (fltInt32 >> 13) & 0x3ff; + + return fltInt16; +} + + +void unpack(const int8_t* in, int8_t* out, int size) { + for (int i = 0; i < size / 2; i++) { + *(out++) = upc(lo4(*in)); + *(out++) = upc(hi4(*in)); + in++; + } +} + +void unpack_i4f16(const int8_t* in, int8_t* out, int size) { + uint16_t *hFloatOut = reinterpret_cast(out); + + for (int i = 0; i < size / 2; i++) { + *(hFloatOut++) = int2hfloat(upc(lo4(*in))); + *(hFloatOut++) = int2hfloat(upc(hi4(*in))); + in++; + } +} + +/*u4 order*/ +void unpack_u4f32(const int8_t* in, float* out, int size) { + for (int i = 0; i < size / 2; i++) { + *(out++) = static_cast(lo4(*in)); + *(out++) = static_cast(hi4(*in)); + in++; + } +} + +template +::testing::AssertionResult fp16ArraysMatch(const T &actual, + const T &expected, + const T &i4Input, + bool int4 = 1 /*i4 or u4*/){ + for (size_t i = 0; i < expected.size() / 2; ++i) { + + int int8Input[] ={ + details::lo4(i4Input[i / 2]), + details::hi4(i4Input[i / 2]) + }; + + if (int4) { + int8Input[0] = details::upc(int8Input[1]); + int8Input[1] = details::upc(int8Input[0]); + }; + + auto fp16ref = int{*((uint16_t*)expected.data() + i)}; + auto fp16out = int{*((uint16_t*)actual.data() + i)}; + +#define _P(x) std::dec << std::setw(5) << (x) << '(' << std::setw(4) << std::hex << (x) << ')' + if (fp16ref != fp16out) { + return ::testing::AssertionFailure() << std::dec << std::setw(4) << i << ", i4:" + << std::setw(2) << int8Input[i % 2] + << " | ref " << 
_P(fp16ref) + << ", test " << _P(fp16out) << "\n"; + } +#undef _P + + } + + return ::testing::AssertionSuccess(); +} + +} // namespace details + +using ShapesInitializer = std::function&, std::vector&, std::vector&)>; + + +using UnpackTestsParams = std::tuple< + ov::element::Type_t, // fromPrecision + ov::element::Type_t, // toPrecision + ov::element::Type_t, // scalePrecision + ov::element::Type_t, // zeroPointPrecision + unsigned long, // nPartitions + ShapesInitializer, // input_shape , scale_shape, zerop initializer + bool, // use parallel_for + bool // strict partitioning + >; + +class UnpackTestsBase { +protected: + ov::element::Type fromType; + ov::element::Type toType; + ov::element::Type scaleType; + ov::element::Type zeropType; + std::shared_ptr from, to, scale, zerop; + + std::vector input; + std::vector output; + std::vector ref_output; + std::vector scalesStorage; + std::vector zeropStorage; + float zeropValue; + ov::Shape input_shape; + ov::Shape scale_shape; + ov::Shape zerop_shape; + + size_t nPartitions; + bool useParallelFor = false; + bool strictPartitions = false; + + void make_zeropoints() { + if (zeropType == ov::element::undefined) { + return; + } + + const std::vector zeropValues = {15.0f, 12.0f, 0.0f, 31.0f}; + const size_t nElements = shape_size(zerop_shape); + + // Set zeropValue if there's only one element + if (nElements == 1) { + zeropValue = zeropValues.front(); + } + + // Determine the size of the storage based on the type and resize the storage vector + if (zeropType == ov::element::Type_t::u4) { + zeropStorage.resize((nElements + 1) / 2, 0); // Each u4 zeropoint is 4 bits, so two zeropoints fit in one byte + } else if (zeropType == ov::element::Type_t::f32) { + zeropStorage.resize(nElements * sizeof(float), 0); + } else { + ASSERT_TRUE(zeropType == ov::element::u4 || zeropType == ov::element::f32); + } + + // Fill the storage with the appropriate values + if (zeropType == ov::element::Type_t::u4) { + for (size_t i = 0; i < 
nElements; ++i) { + uint8_t zeropValueU4 = static_cast(zeropValues[i % zeropValues.size()]) & 0x0F; + size_t byteIndex = i / 2; + if (i % 2 == 0) { + zeropStorage[byteIndex] = zeropValueU4; + } else { + zeropStorage[byteIndex] = (zeropValueU4 << 4); + } + } + } else if (zeropType == ov::element::Type_t::f32) { + float* ptrWork = reinterpret_cast(zeropStorage.data()); + for (size_t i = 0; i < nElements; ++i) { + ptrWork[i] = zeropValues[i % zeropValues.size()]; + } + } + + // Create the tensor + zerop = ov::make_tensor(zeropType, zerop_shape, zeropStorage.data()); + } + + void make_scales() { + if (scaleType == ov::element::undefined) { + return; + } + ASSERT_TRUE(scaleType == ov::element::f16 || scaleType == ov::element::f32); + size_t nElements = shape_size(scale_shape); + + // creating custom scale factors + const size_t nScaleBytes = scaleType.bitwidth() * nElements / 8; + + std::vector sc(nElements); + float coeffTable[] = { + 0.1f, + 0.5f, + 1.f, + 2.f + }; + for (size_t i = 0; i != nElements; i++) { + sc[i] = coeffTable[i % (sizeof (coeffTable) / sizeof(*coeffTable))]; + } + scalesStorage.resize(nScaleBytes); + + if (scaleType == ov::element::f16) { + uint16_t * ptrWork = reinterpret_cast(scalesStorage.data()); + for (size_t i = 0; i != nElements; i++) { + ptrWork[i] = details::float_to_half(sc[i]); + } + } + if (scaleType == ov::element::f32) { + float* ptrWork = reinterpret_cast(scalesStorage.data()); + for (size_t i = 0; i != nElements; i++) { + ptrWork[i] = sc[i]; + } + } + scale = ov::make_tensor(scaleType, scale_shape, scalesStorage.data()); + } + + void make_input() { + + size_t nElements = shape_size(input_shape); + + ASSERT_EQ((fromType.bitwidth() * nElements) % 8, 0) << "Input len has to be byte boundary aligned, but was " + << fromType.bitwidth() * nElements << " bits"; + ASSERT_EQ((toType.bitwidth() * nElements) % 8, 0) << "Output len has to be byte boundary aligned"; + + const size_t nInputBytes = fromType.bitwidth() * nElements / 8; + const 
size_t nOutputBytes = toType.bitwidth() * nElements / 8; + + input.resize(nInputBytes); + ref_output.resize(nOutputBytes); + output.resize(nOutputBytes); + std::fill(ref_output.begin(), ref_output.end(), 0); + std::fill(output.begin(), output.end(), 0); + + std::array input_local = { + 0x0A, 0x0B, 0x1C, 0x1D, 0x2E, 0x2F, 0x35, 0x36, + 0x4A, 0x4B, 0x5A, 0x5B, 0x6A, 0x6B, 0x7A, 0x7B, + 0x0C, 0x0D, 0x1C, 0x1D, 0x2C, 0x2D, 0x3C, 0x3D, + 0x4C, 0x4D, 0x5C, 0x5D, 0x6C, 0x6D, 0x7C, 0x7D, + }; + + for (size_t idx = 0, k = 0; k < nInputBytes; k++, idx = (idx + 1) % input_local.size()) { + input[k] = input_local[idx]; + } + + from = ov::make_tensor(fromType, input_shape, input.data()); + to = ov::make_tensor(toType, input_shape, output.data()); + } +public: + void SetUp(const UnpackTestsParams & getParam) { + ShapesInitializer shapeInit; + + std::tie(fromType, toType, scaleType, zeropType, nPartitions, shapeInit, useParallelFor, strictPartitions) = getParam; + + std::vector input, scale, zerop; + shapeInit(input, scale, zerop); + + input_shape = ov::Shape{input.begin(), input.end()}; + scale_shape = ov::Shape{scale.begin(), scale.end()}; + if (zerop.empty()) { + zerop_shape = ov::Shape({1}); + } else { + zerop_shape = ov::Shape{zerop.begin(), zerop.end()}; + } + + make_input(); + make_scales(); + make_zeropoints(); + + make_ref_output(); + } + std::string ToString() const { + std::ostringstream result; + result << (isNegative() ? "NEGATIVE_" : "") + <<"["; + + for (size_t i = 0; i != input_shape.size(); i++) { + result << input_shape[i] << ((i + 1 == input_shape.size()) ? "" : "x"); + } + result <<"]" + << "_p" << nPartitions + << (strictPartitions ? "_SP" : "") + << (useParallelFor ? 
"_parallel" : "_serial") + << "_from_" << fromType + << "_to_" << toType; + if (scaleType != ov::element::Type_t::undefined) + result << "_scale_" << scaleType; + if (zeropType != ov::element::Type_t::undefined) + result << "_zerop_" << zeropType; + + return result.str(); + } + + /** + * Negative test cases has to be carefully reviewed, to still remain positive runs at some points + * @return + */ + virtual bool isNegative() const { + return false; + } + + virtual void make_ref_output() { + size_t nElements = 1; + for (size_t dim : input_shape) { + nElements *= dim; + } + if (toType == ov::element::i8) { + details::unpack(input.data(), ref_output.data(), static_cast(nElements)); + } else if (toType == ov::element::f16) { + details::unpack_i4f16(input.data(), ref_output.data(), static_cast(nElements)); + } + } +}; + +template +class UnpackTestsTmpl : + public ::testing::Test, + public T, + public ::testing::WithParamInterface { +protected: + + void SetUp() override { + T::SetUp(GetParam()); + } +public: + static std::string getTestCaseName(const testing::TestParamInfo& obj) { + T _bt; + _bt.SetUp(obj.param); + return _bt.ToString(); + } +}; + +using UnpackTests = UnpackTestsTmpl; +class UnpackTestsRef : public UnpackTests {}; + +TEST_P(UnpackTests, i4) { + ASSERT_NO_THROW_WITH_MESSAGE(ov::npuw::util::unpack(from, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input)); +} + +class UnpackWithScaleTestsBase : public UnpackTestsBase { +protected: + bool isNegative() const override { + if (scale_shape.size() != 3 && scale_shape.size() != 2) return true; + if (input_shape.back() % 64) return true; + if ((from->get_size() / scale->get_size()) % 64) return true; + if (toType != ov::element::f16) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + + const size_t nOutputElementsPerScale = 
ref_output.size() / (toType.bitwidth() / 8) / scale->get_size(); + + details::unpack_i4f16(input.data(), ref_output.data(), static_cast(nElements)); + + // lets apply per channel scale + uint16_t * pRef = reinterpret_cast(ref_output.data()); + uint16_t * pScale_f16 = reinterpret_cast(scale->data()); + float * pScale_f32 = reinterpret_cast(scale->data()); + + for (size_t i = 0; i < scale->get_size(); i++) { + for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) { + float ref_scaled = details::half_to_float(pRef[0]); + if (scaleType == ov::element::f32) { + ref_scaled *= pScale_f32[0]; + } else if (scaleType == ov::element::f16) { + ref_scaled *= details::half_to_float(pScale_f16[0]); + } + *pRef = details::float_to_half(ref_scaled); + pRef++; + } + pScale_f32++; + pScale_f16++; + } + } + +}; + +using UnpackWithScaleTests = UnpackTestsTmpl; + + +TEST_P(UnpackWithScaleTests, i4_scale) { + ASSERT_NO_THROW_IF(!isNegative(), + ov::npuw::util::unpack(from, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + if (!isNegative()) { + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input)); + } +} + + +class UnpackTestsWithScaleAndZeroPointBase : public UnpackTestsBase { +protected: + bool isNegative() const override { + if (scale_shape.size() != 3 && scale_shape.size() != 2) return true; + if (input_shape.back() % 64) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + + const size_t nOutputElementsPerScale = ref_output.size() / (toType.bitwidth() / 8) / scale->get_size(); + + std::vector floatRef(nElements); + details::unpack_u4f32(input.data(), floatRef.data(), static_cast(nElements)); + + + // lets apply per channel scale + uint16_t * pRef = reinterpret_cast(ref_output.data()); + float * pFloatRef = reinterpret_cast(floatRef.data()); + const uint16_t * pScale_f16 = reinterpret_cast(scale->data()); + const float * pScale_f32 = 
reinterpret_cast(scale->data()); + + for (size_t i = 0; i < scale->get_size(); i++) { + for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) { + // applying zeropoint + float ref_scaled = *pFloatRef - zeropValue; + + if (scaleType == ov::element::f32) { + ref_scaled *= pScale_f32[0]; + } else if (scaleType == ov::element::f16) { + ref_scaled *= details::half_to_float(pScale_f16[0]); + } + *pRef = details::float_to_half(ref_scaled); + + pFloatRef++; + pRef++; + } + pScale_f32++; + pScale_f16++; + } + } +}; + +using UnpackTestsWithScaleAndZeroPoint = UnpackTestsTmpl; + +TEST_P(UnpackTestsWithScaleAndZeroPoint, u4) { + ASSERT_NO_THROW_IF(!isNegative(), + ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + if (!isNegative()) { + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false)); + } +} + +class UnpackTestsWithScaleAndZeroPoint2 : public UnpackTestsWithScaleAndZeroPointBase { +protected: + bool isNegative() const override { + if (input_shape.back() % 64 || input_shape.size() != 3) return true; + if (scale_shape.back() % 64 || scale_shape.size() != 3) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + const auto from_shape = from->get_shape(); + + const size_t C = from_shape[from_shape.size() - 3]; + const size_t H = from_shape[from_shape.size() - 2]; + const size_t W = from_shape[from_shape.size() - 1]; + + std::vector floatRef(nElements); + details::unpack_u4f32(input.data(), floatRef.data(), static_cast(nElements)); + + uint16_t * pRef = reinterpret_cast(ref_output.data()); + float * pFloatRef = reinterpret_cast(floatRef.data()); + const uint16_t * pScale_f16 = reinterpret_cast(scale->data()); + const float * pScale_f32 = reinterpret_cast(scale->data()); + + for (size_t c = 0; c < C; ++c) { + for (size_t h = 0; h < H; ++h) { + for (size_t w = 0; w < W; ++w) { + size_t 
input_index = w + W * h + W * H * c; + size_t scale_index = w + W * c; + float ref_scaled = pFloatRef[input_index] - zeropValue; + if (scaleType == ov::element::f32) { + ref_scaled *= pScale_f32[scale_index]; + } else if (scaleType == ov::element::f16) { + ref_scaled *= details::half_to_float(pScale_f16[scale_index]); + } + pRef[w + W * h + c * W * H] = details::float_to_half(ref_scaled); + } + } + } + } +}; + +using UnpackTestsWithScaleAndZeroPointTest2 = UnpackTestsTmpl; + +TEST_P(UnpackTestsWithScaleAndZeroPointTest2, u4) { + ASSERT_NO_THROW_IF(!isNegative(), + ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + if (!isNegative()) { + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false)); + } +} + +class UnpackTestsWithScaleAndZeroPoint3 : public UnpackTestsWithScaleAndZeroPointBase { +protected: + bool isNegative() const override { + if (scale_shape.size() != 3 || zerop_shape.size() != 3) return true; + if (input_shape[2] % 64 || input_shape.size() != 3) return true; + + return false; + } + + void make_ref_output() override { + if (isNegative()) return; + + size_t nElements = from->get_size(); + + const size_t nOutputElementsPerScale = ref_output.size() / (toType.bitwidth() / 8) / scale->get_size(); + + std::vector floatRef(nElements); + details::unpack_u4f32(input.data(), floatRef.data(), static_cast(nElements)); + + + // lets apply per channel scale + uint16_t * pRef = reinterpret_cast(ref_output.data()); + const uint8_t* pZer = static_cast(zerop->data()); + float * pFloatRef = reinterpret_cast(floatRef.data()); + const uint16_t * pScale_f16 = reinterpret_cast(scale->data()); + const float * pScale_f32 = reinterpret_cast(scale->data()); + + for (size_t i = 0; i < scale->get_size(); i++) { + float zeroPointValue = static_cast((i % 2 == 0) ? 
details::lo4(pZer[i / 2]) : details::hi4(pZer[i / 2])); + for (size_t sc = 0; sc != nOutputElementsPerScale; sc++) { + // applying zeropoint + float ref_scaled = *pFloatRef - zeroPointValue; + + if (scaleType == ov::element::f32) { + ref_scaled *= pScale_f32[0]; + } else if (scaleType == ov::element::f16) { + ref_scaled *= details::half_to_float(pScale_f16[0]); + } + *pRef = details::float_to_half(ref_scaled); + + pFloatRef++; + pRef++; + } + pScale_f32++; + pScale_f16++; + } + } +}; + +using UnpackTestsWithScaleAndZeroPointTest3 = UnpackTestsTmpl; + +TEST_P(UnpackTestsWithScaleAndZeroPointTest3, u4) { + ASSERT_NO_THROW_IF(!isNegative(), + ov::npuw::util::unpack(from, zerop, scale, to, ov::npuw::util::UnpackOptions{useParallelFor, nPartitions, strictPartitions})); + if (!isNegative()) { + ASSERT_TRUE(details::fp16ArraysMatch(output, ref_output, input, false)); + } +} + +#define Tensors [](std::vector& input, std::vector&scale, std::vector&zerop) + + +namespace details { +::testing::internal::ParamGenerator::value_type> ShapesIn( + const std::vector& container) { + return ::testing::ValuesIn(container.begin(), container.end()); +} + +} // namespace details +} // anonymous namespace From 2fc0faedfa69caf2af5b5cd27c2f3cf5ad2203bf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hubert=20B=C5=82aszczyk?= <56601011+hub-bla@users.noreply.github.com> Date: Thu, 3 Oct 2024 14:21:01 +0200 Subject: [PATCH 4/4] [TF FE]: Support complex tensors for ExpandDims operation (#26892) ### Details: - Support complex tensors for `ExpandDims` operation + tests ### Tickets: - [None](https://github.com/openvinotoolkit/openvino/issues/22950) --- .../tensorflow_common/src/op/expand_dims.cpp | 30 ++++++++++- .../tensorflow_tests/test_tf_ExpandDims.py | 52 +++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/src/frontends/tensorflow_common/src/op/expand_dims.cpp b/src/frontends/tensorflow_common/src/op/expand_dims.cpp index b3b37ad38cc302..a40e5c9b1bc6df 100644 --- 
a/src/frontends/tensorflow_common/src/op/expand_dims.cpp +++ b/src/frontends/tensorflow_common/src/op/expand_dims.cpp @@ -3,7 +3,13 @@ // #include "common_op_table.hpp" +#include "helper_ops/complex_type_mark.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/less.hpp" +#include "openvino/op/select.hpp" +#include "openvino/op/subtract.hpp" #include "openvino/op/unsqueeze.hpp" +#include "utils.hpp" using namespace std; using namespace ov::op; @@ -14,9 +20,31 @@ namespace tensorflow { namespace op { OutputVector translate_expand_dims_op(const NodeContext& node) { - default_op_checks(node, 2, {"ExpandDims", "EXPAND_DIMS"}); + default_op_checks(node, 2, {"ExpandDims", "EXPAND_DIMS"}, true); auto input = node.get_input(0); auto axis = node.get_input(1); + auto complex_type_mark = as_type_ptr(input.get_node_shared_ptr()); + + if (complex_type_mark) { + element::Type complex_part_type = complex_type_mark->get_complex_part_type(); + input = complex_type_mark->input_value(0); + + auto const_zero = create_same_type_const_scalar(axis, 0); + + auto is_axis_neg = make_shared(axis, const_zero); + + auto const_one = create_same_type_const_scalar(axis, 1); + auto axis_min_one = make_shared(axis, const_one); + + auto new_axis = make_shared(is_axis_neg, axis_min_one, axis); + + auto unsqueeze = make_shared(input, new_axis); + + set_node_name(node.get_name(), unsqueeze); + auto complex_result = make_shared(unsqueeze, complex_part_type); + return {complex_result}; + } + auto unsqueeze = make_shared(input, axis); set_node_name(node.get_name(), unsqueeze); return {unsqueeze}; diff --git a/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py b/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py index f0f9085d32ba2f..e982867c9ac08d 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py @@ -6,6 +6,7 @@ import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = 
np.random.default_rng(62362) class TestExpandDims(CommonTFLayerTest): def _prepare_input(self, inputs_info): @@ -40,3 +41,54 @@ def test_expand_dims_basic(self, params, ie_device, precision, ir_version, temp_ self._test(*self.create_expand_dims_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) + + +class TestExpandDimsComplex(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + # generate elements so that the input tensor may contain repeating elements + assert 'param_real:0' in inputs_info + assert 'param_imag:0' in inputs_info + + input_shape = inputs_info['param_real:0'] + + inputs_data = {} + inputs_data['param_real:0'] = rng.integers(-10.0, 10.0, input_shape).astype(np.float32) + inputs_data['param_imag:0'] = rng.integers(-10.0, 10.0, input_shape).astype(np.float32) + + return inputs_data + + def create_expand_dims_complex_net(self, axis_dtype, input_shape, axis): + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + param_real = tf.compat.v1.placeholder(np.float32, input_shape, 'param_real') + param_imag = tf.compat.v1.placeholder(np.float32, input_shape, 'param_imag') + + complex = tf.raw_ops.Complex(real=param_real, imag=param_imag) + + axis = tf.constant(axis, dtype=axis_dtype) + + result = tf.raw_ops.ExpandDims(input=complex, axis=axis) + + tf.raw_ops.Real(input=result) + tf.raw_ops.Imag(input=result) + + tf.compat.v1.global_variables_initializer() + tf_net = sess.graph_def + + return tf_net, None + + test_basic = [ + dict(input_shape=[], axis=0), + dict(input_shape=[2, 3], axis=1), + dict(input_shape=[2, 3, 4], axis=-1), + dict(input_shape=[2, 6, 5], axis=-2), + ] + + @pytest.mark.parametrize("axis_dtype", [np.int32, np.int64]) + @pytest.mark.parametrize("op_args", test_basic) + @pytest.mark.nightly + @pytest.mark.precommit + def test_expand_dims_basic_complex(self, axis_dtype, op_args, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): + 
self._test(*self.create_expand_dims_complex_net(axis_dtype, **op_args), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend)