diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index 88b41f983f7094..50942cf331ab72 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -7,10 +7,6 @@ on: description: 'Machine on which the tests would run' type: string required: true - shell: - description: "shell to override the default shell settings in the runner's operating system." - type: string - required: true container: description: 'JSON to be converted to the value of the "container" configuration for the job' type: string @@ -20,12 +16,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' jobs: PyTorch_Layer_Tests: @@ -35,7 +34,7 @@ jobs: container: ${{ fromJSON(inputs.container) }} defaults: run: - shell: ${{ inputs.shell }} + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} env: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input OPENVINO_REPO: ${{ github.workspace }}/openvino @@ -55,12 +54,6 @@ jobs: name: openvino_tests path: ${{ env.INSTALL_TEST_DIR }} - - name: Download OpenVINO tokenizers extension - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 - with: - name: openvino_tokenizers_wheel - path: ${{ env.INSTALL_DIR }} - # Needed as ${{ github.workspace }} is not working correctly when using Docker - name: Setup Variables if: runner.os != 'Windows' @@ -98,10 +91,10 @@ jobs: sparse-checkout-cone-mode: false path: 'openvino' - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./openvino/.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} @@ -112,9 +105,6 @@ jobs: # Install the core OV wheel python3 -m pip install ${INSTALL_DIR}/tools/openvino-*.whl - # Install the core OV Tokenizers wheel - python3 -m pip install ${INSTALL_DIR}/openvino_tokenizers-*.whl - - name: Install OpenVINO Python wheels (Windows) if: runner.os == 'Windows' run: | @@ -122,10 +112,6 @@ jobs: $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }}\tools -Filter openvino-*.whl | % { $_.FullName } python3 -m pip install "$ovCoreWheelPath" - # Find and install the core OV Tokenizers wheel - $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }} -Filter openvino_tokenizers-*.whl | % { $_.FullName } - python3 -m pip install "$ovCoreWheelPath" - - name: Install Pytorch Layer tests dependencies run: | # pytorch test requirements @@ -133,22 +119,25 @@ jobs: - name: PyTorch Layer Tests if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196 - run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + # due to CVS-152795, parallel run is not possible on Windows + run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: 
FP32 + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} - name: PyTorch torch.export Layer Tests - if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287 + if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | - python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 PYTORCH_TRACING_MODE: EXPORT + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} - name: PyTorch torch.compile TORCHFX Layer Tests - if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' }} # Ticket: 126287 + if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -m precommit_fx_backend --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index 0801010b86bde3..e8d7b51e14c02f 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ b/.github/workflows/job_tensorflow_layer_tests.yml @@ -7,10 +7,6 @@ on: description: 'Machine on which the tests would run' type: string required: true - shell: - description: "shell to override the default shell settings in the runner's operating system." - type: string - required: true container: description: 'JSON to be converted to the value of the "container" configuration for the job' type: string @@ -20,12 +16,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' jobs: TensorFlow_Layer_Tests: @@ -35,7 +34,7 @@ jobs: container: ${{ fromJSON(inputs.container) }} defaults: run: - shell: ${{ inputs.shell }} + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} env: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input OPENVINO_REPO: ${{ github.workspace }}/openvino @@ -98,10 +97,10 @@ jobs: sparse-checkout-cone-mode: false path: 'openvino' - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./openvino/.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 3506ca49846f45..e4e608f3aca6d4 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -173,19 +173,19 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-linux-16-cores-arm' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-linux-16-cores-arm' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index da3224fa483ad1..20db9de1776015 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -276,17 +276,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'macos-13' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'macos-13' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 331afc7266cd6a..a38179f71fb60c 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -275,17 +275,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'macos-13-xlarge' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'macos-13-xlarge' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' 
CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 8f461391f20a9f..2c20e5136cfc4e 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -305,19 +305,19 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-linux-4-cores-16gb' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] + needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-linux-4-cores-16gb' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index 6409b417a0731b..295a4dd0e2c61a 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -133,6 +133,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + Pytorch_Layer_Tests: + name: Pytorch Layer Tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_pytorch_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + Overall_Status: name: ci/gha_overall_status_ubuntu_24 needs: [Smart_CI, Build, Debian_Packages, Samples, Python_Unit_Tests] diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 39cf2161525513..122fcc3c1c5021 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -404,17 +404,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-win-8-cores-16gb' - shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-win-8-cores-16gb' - shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CXX_Unit_Tests: name: C++ unit tests diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index d9f5e25c332984..7b135fa7ff0b14 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -11,6 +11,7 @@ NPU Device :hidden: npu-device/remote-tensor-api-npu-plugin + npu-device/batching-on-npu-plugin The Neural Processing Unit is a low-power hardware solution, introduced with the diff --git 
a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst new file mode 100644 index 00000000000000..379822e327c8cd --- /dev/null +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst @@ -0,0 +1,37 @@ +NPU Plugin Batching +=============================== + + +.. meta:: + :description: OpenVINO™ NPU plugin supports batching + either by executing concurrent inferences or by + relying on native compiler support for batching. + +OpenVINO™ NPU plugin supports batching either by executing concurrent inferences or by relying on native compiler support for batching. + +First, the NPU plugin checks if the following conditions are met: + +* The batch size is on the first axis. +* All inputs and outputs have the same batch size. +* The model does not contain states. + +**If the conditions are met**, the NPU plugin attempts to compile and execute the original model with batch_size forced to 1. This approach works around current compiler limitations, while work to improve performance for batch sizes greater than one is ongoing. +If the compilation is successful, the plugin detects a difference in batch size between the original model layout (with a batch size set to N) +and the transformed/compiled layout (with a batch size set to 1). Then it executes the following steps: + +1. Internally constructs multiple command lists, one for each input. +2. Executes each command list for the proper offsets of input/output buffers. +3. Notifies the user of the completion of the inference request after all command lists have been executed. + +This concurrency-based batching mode is transparent to the application. A single inference request handles all inputs from the batch. +While performance may be lower compared to regular batching (based on native compiler support), this mode provides basic batching functionality for use either with older drivers +or when the model cannot yet be compiled with a batch size larger than one. + +**If the conditions are not met**, the NPU plugin tries to compile and execute the original model with the given +batch size (N), as it would any other regular model. + +.. note:: + + With future performance improvements and support for compiling multiple models with a batch size larger + than one, the default order will change. NPU will try first to compile and execute the original model with the + given batch size and fall back to concurrent batching if compilation fails. diff --git a/src/bindings/js/node/lib/addon.ts b/src/bindings/js/node/lib/addon.ts index 060af2cfec92e8..24c9d780aa9f7e 100644 --- a/src/bindings/js/node/lib/addon.ts +++ b/src/bindings/js/node/lib/addon.ts @@ -21,6 +21,8 @@ type elementTypeString = | 'f32' | 'string'; +type OVAny = string | number | boolean; + /** * Core represents an OpenVINO runtime Core entity. * @@ -48,7 +50,7 @@ interface Core { compileModel( model: Model, deviceName: string, - config?: { [propertyName: string]: string }, + config?: Record<string, OVAny>, ): Promise<CompiledModel>; /** * Asynchronously reads a model and creates a compiled model * @@ -67,7 +69,7 @@ interface Core { compileModel( modelPath: string, deviceName: string, - config?: { [propertyName: string]: string }, + config?: Record<string, OVAny>, ): Promise<CompiledModel>; /** * A synchronous version of {@link Core.compileModel}. 
@@ -76,7 +78,7 @@ interface Core { compileModelSync( model: Model, deviceName: string, - config?: { [propertyName: string]: string }, + config?: Record<string, OVAny>, ): CompiledModel; /** * A synchronous version of {@link Core.compileModel}. @@ -85,7 +87,7 @@ interface Core { compileModelSync( modelPath: string, deviceName: string, - config?: { [propertyName: string]: string }, + config?: Record<string, OVAny>, ): CompiledModel; /** * It returns a list of available inference devices. @@ -101,7 +103,7 @@ interface Core { * It gets the properties dedicated to device behaviour. * @param propertyName A property name. */ - getProperty(propertyName: string): string | number | boolean; + getProperty(propertyName: string): OVAny; /** * It gets the properties dedicated to device behaviour. @@ -111,7 +113,7 @@ interface Core { getProperty( deviceName: string, propertyName: string, - ): string | number | boolean; + ): OVAny; /** * It returns information on the version of device plugins. * @param deviceName A device name to identify a plugin. @@ -135,7 +137,7 @@ interface Core { importModel( modelStream: Buffer, device: string, - config?: { [key: string]: string | number | boolean }, + config?: Record<string, OVAny>, ): Promise<CompiledModel>; /** * A synchronous version of {@link Core.importModel}. @@ -144,7 +146,7 @@ interface Core { importModelSync( modelStream: Buffer, device: string, - config?: { [key: string]: string | number | boolean }, + config?: Record<string, OVAny>, ): CompiledModel; /** * It reads models from the IR / ONNX / PDPD / TF and TFLite formats. @@ -197,16 +199,13 @@ interface Core { * It sets the properties. * @param properties An object with the property name - property value pairs. */ - setProperty(properties: { [key: string]: string | number | boolean }): void; + setProperty(properties: Record<string, OVAny>): void; /** * It sets the properties for a device. * @param deviceName The name of a device. * @param properties An object with the property name - property value pairs. */ - setProperty( - deviceName: string, - properties: { [key: string]: string | number | boolean }, - ): void; + setProperty(deviceName: string, properties: Record<string, OVAny>): void; /** * It queries the device if it supports specified model with the specified * properties. @@ -218,8 +217,8 @@ interface Core { queryModel( model: Model, deviceName: string, - properties?: {[key: string]: string | number | boolean}, - ): {[key: string]: string | number | boolean}; + properties?: Record<string, OVAny>, + ): { [key: string]: string }; } interface CoreConstructor { new (): Core; } @@ -325,7 +324,7 @@ interface CompiledModel { * @param propertyName A string to get the property value. * @returns The property value. */ - getProperty(propertyName: string): string | number | boolean; + getProperty(propertyName: string): OVAny; /** * It creates an inference request object used to infer the compiled model. * @return {InferRequest} @@ -380,9 +379,7 @@ interface CompiledModel { * @param property An object with the key-value pairs. 
* (property name, property value) */ - setProperty(properties: { - [propertyName: string]: string | number | boolean; - }): void; + setProperty(properties: Record<string, OVAny>): void; } /** diff --git a/src/bindings/js/node/tests/unit/core.test.js b/src/bindings/js/node/tests/unit/core.test.js index 6cf431a38b5030..f62adda9f90f9c 100644 --- a/src/bindings/js/node/tests/unit/core.test.js +++ b/src/bindings/js/node/tests/unit/core.test.js @@ -12,11 +12,11 @@ describe('ov.Core tests', () => { before(async () => { await isModelAvailable(testModels.testModelFP32); }); - + beforeEach(() => { core = new ov.Core(); }); - + it('Core.setProperty()', () => { const tmpDir = '/tmp'; @@ -83,29 +83,29 @@ describe('ov.Core tests', () => { it('Core.queryModel() with empty parameters should throw an error', () => { assert.throws( () => core.queryModel().then(), - /'queryModel' method called with incorrect parameters./ - ) + /'queryModel' method called with incorrect parameters./, + ); }); it('Core.queryModel() with less arguments should throw an error', () => { assert.throws( - () => core.queryModel("Unexpected Argument").then(), - /'queryModel' method called with incorrect parameters./ - ) + () => core.queryModel('Unexpected Argument').then(), + /'queryModel' method called with incorrect parameters./, + ); }); it('Core.queryModel() with incorrect arguments should throw an error', () => { const model = core.readModelSync(getModelPath().xml); assert.throws( - () => core.queryModel(model, "arg1", "arg2").then(), - /'queryModel' method called with incorrect parameters./ - ) + () => core.queryModel(model, 'arg1', 'arg2').then(), + /'queryModel' method called with incorrect parameters./, + ); }); it('Core.queryModel() should have device in the result values', () => { const model = core.readModelSync(getModelPath().xml); const device = 'CPU'; - const query_model = core.queryModel(model, device); - assert(Object.values(query_model).includes(device)); + const queryModel = core.queryModel(model, device); + assert(Object.values(queryModel).includes(device)); }); }); diff --git a/src/core/src/bound_evaluate.cpp b/src/core/src/bound_evaluate.cpp index 22b91a15e3dcee..f1c6a0601eea90 100644 --- a/src/core/src/bound_evaluate.cpp +++ b/src/core/src/bound_evaluate.cpp @@ -494,14 +494,12 @@ bool ov::interval_bound_evaluator(const Node* node, vector_of_output_variants.emplace_back(output.get_element_type(), output.get_shape()); } - node->evaluate(vector_of_output_variants, input_variant); + if (!node->evaluate(vector_of_output_variants, input_variant)) { + return false; + }; TensorVector vector_of_unsqueezed_output_variants; for (const auto& output : vector_of_output_variants) { - if (!output) { - return false; - } - auto unsqueezed_shape = output.get_shape(); unsqueezed_shape.insert(unsqueezed_shape.begin(), 1); diff --git a/src/frontends/tensorflow_common/src/op/expand_dims.cpp b/src/frontends/tensorflow_common/src/op/expand_dims.cpp index b3b37ad38cc302..a40e5c9b1bc6df 100644 --- a/src/frontends/tensorflow_common/src/op/expand_dims.cpp +++ b/src/frontends/tensorflow_common/src/op/expand_dims.cpp @@ -3,7 +3,13 @@ // #include "common_op_table.hpp" +#include "helper_ops/complex_type_mark.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/less.hpp" +#include "openvino/op/select.hpp" +#include "openvino/op/subtract.hpp" #include "openvino/op/unsqueeze.hpp" +#include "utils.hpp" using namespace std; using namespace ov::op; @@ -14,9 +20,31 @@ namespace tensorflow { namespace op { OutputVector translate_expand_dims_op(const 
NodeContext& node) { - default_op_checks(node, 2, {"ExpandDims", "EXPAND_DIMS"}); + default_op_checks(node, 2, {"ExpandDims", "EXPAND_DIMS"}, true); auto input = node.get_input(0); auto axis = node.get_input(1); + auto complex_type_mark = as_type_ptr(input.get_node_shared_ptr()); + + if (complex_type_mark) { + element::Type complex_part_type = complex_type_mark->get_complex_part_type(); + input = complex_type_mark->input_value(0); + + auto const_zero = create_same_type_const_scalar(axis, 0); + + auto is_axis_neg = make_shared(axis, const_zero); + + auto const_one = create_same_type_const_scalar(axis, 1); + auto axis_min_one = make_shared(axis, const_one); + + auto new_axis = make_shared(is_axis_neg, axis_min_one, axis); + + auto unsqueeze = make_shared(input, new_axis); + + set_node_name(node.get_name(), unsqueeze); + auto complex_result = make_shared(unsqueeze, complex_part_type); + return {complex_result}; + } + auto unsqueeze = make_shared(input, axis); set_node_name(node.get_name(), unsqueeze); return {unsqueeze}; diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 71623f32843eac..63adae28ddabf3 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -4,17 +4,15 @@ #pragma once -#include "openvino/runtime/threading/cpu_streams_executor.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" #include "intel_gpu/graph/topology.hpp" #include "intel_gpu/graph/program.hpp" #include "intel_gpu/graph/serialization/binary_buffer.hpp" -#include "intel_gpu/runtime/compounds.hpp" #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/event.hpp" #include "intel_gpu/runtime/stream.hpp" -#include "intel_gpu/runtime/lru_cache.hpp" #include "intel_gpu/runtime/shape_predictor.hpp" #include "intel_gpu/plugin/variable_state.hpp" @@ -211,7 +209,7 @@ struct network { bool is_dynamic() const { return _is_dynamic; } size_t get_weights_cache_capacity() const { return _weights_cache_capacity; } - memory_pool& get_memory_pool() { + memory_pool& get_memory_pool() const { return *_memory_pool; } @@ -284,7 +282,9 @@ struct network { void dump_memory_pool(std::string dump_path, int64_t curr_iter); #ifdef GPU_DEBUG_CONFIG - int64_t iteration = 0; + mutable int64_t iteration = 0; + friend class NetworkDebugHelper; + friend class NodeDebugHelper; #endif }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.cpp b/src/plugins/intel_gpu/src/graph/debug_helper.cpp new file mode 100644 index 00000000000000..7f7071e704683e --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/debug_helper.cpp @@ -0,0 +1,526 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "debug_helper.hpp" +#include "openvino/util/file_util.hpp" + +#ifdef GPU_DEBUG_CONFIG + +#include "to_string_utils.h" +#include "loop_inst.h" +#include "condition_inst.h" +#include "program_dump_graph.h" + +#include +#include +#include + +namespace cldnn { + +namespace { + +float convert_element(int64_t i) { return static_cast(i); } +float convert_element(int32_t i) { return static_cast(i); } + +float convert_element(float f) { return f; } + +float convert_element(ov::float16 h) { return static_cast(h); } + +size_t get_x_pitch(const layout& layout) { + try { + auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0)); + auto tensor_x1 = tensor(batch(0), 
feature(0), spatial(1, 0, 0, 0)); + auto x0 = layout.get_linear_offset(tensor_x0); + auto x1 = layout.get_linear_offset(tensor_x1); + return (x1 - x0); + } catch (...) { + // When spatial size of x=0, x_pitch is meaningless + return 0; + } +} + +template +void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { + auto&& size = mem->get_layout().get_tensor(); + + GPU_DEBUG_GET_INSTANCE(debug_config); + auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); + tensor tmp_size(size); + tmp_size.batch[0] = batch_size; + if (tmp_size == size) { + file_stream << "shape: " << size.to_string() << " "; + file_stream << "(count: " << size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" + << (dump_raw ? " raw data" : "") << std::endl; + } else { + file_stream << "shape: " << tmp_size.to_string() << " "; + file_stream << "(count: " << tmp_size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) + << ", original shape: " << size.to_string() << ")" + << (dump_raw ? " raw data" : "") << std::endl; + } + + if (size.count() == 0) { + file_stream << "Empty buffer" << std::endl; + return; + } + + mem_lock lock(mem, stream); + auto mem_ptr = lock.data(); + auto x_pitch = get_x_pitch(mem->get_layout()); + std::stringstream buffer; + + if (!dump_raw) { + for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) { + for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) { + for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) { + for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) { + for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) { + for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) { + cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w)); + size_t input_it = mem->get_layout().get_linear_offset(t); + + for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) { + buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl; + } + } + } + } + } + } + } + } else { + for (size_t i = 0; i < lock.size(); ++i) { + buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl; + } + } + file_stream << buffer.str(); +} + +void unpack(cldnn::data_types type, uint8_t input, int8_t &v0, int8_t &v1) { + if (type == cldnn::data_types::i4) { + char s_bit = (input & 0x08); + char mask = s_bit > 0 ? 0xF0 : 0x00; + v0 = (input & 0x0F) | mask; + + input >>= 4; + s_bit = (input & 0x08); + mask = s_bit > 0 ? 0xF0 : 0x00; + v1 = (input & 0x0F) | mask; + } else if (type == cldnn::data_types::u4) { + v0 = input & 0x0F; + v1 = input >> 4; + } else { + OPENVINO_ASSERT(false, "not supported unpacking"); + } +} + +void dump_i4u4(cldnn::data_types type, memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { + auto&& size = mem->get_layout().get_tensor(); + + GPU_DEBUG_GET_INSTANCE(debug_config); + auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); + tensor tmp_size(size); + tmp_size.batch[0] = batch_size; + if (tmp_size == size) { + file_stream << "shape: " << size.to_string() << " "; + file_stream << "(count: " << size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" + << (dump_raw ? 
" raw data" : "") << std::endl; + } else { + file_stream << "shape: " << tmp_size.to_string() << " "; + file_stream << "(count: " << tmp_size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) + << ", original shape: " << size.to_string() << ")" + << (dump_raw ? " raw data" : "") << std::endl; + } + + if (size.count() == 0) { + file_stream << "Empty buffer" << std::endl; + return; + } + + mem_lock lock(mem, stream); + auto mem_ptr = lock.data(); + std::stringstream buffer; + + if (dump_raw) { + for (size_t i = 0; i < lock.size(); ++i) { + int8_t v0, v1; + unpack(type, mem_ptr[i], v0, v1); + buffer << std::fixed << std::setprecision(6) << static_cast(v0) << std::endl; + buffer << std::fixed << std::setprecision(6) << static_cast(v1) << std::endl; + } + } else { + std::cout << __func__ << " supports raw dump only" << std::endl; + } + file_stream << buffer.str(); +} + +void log_memory_to_file(memory::ptr mem, layout data_layout, stream& stream, std::string layerName, bool dump_raw) { + std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl; + GPU_DEBUG_GET_INSTANCE(debug_config); + std::string filename = debug_config->get_name_for_dump(layerName); + filename = debug_config->dump_layers_path + filename + ".txt"; + std::ofstream file_stream(filename); + if (!mem) { + file_stream << "Empty" << std::endl; + return; + } + + // Reinterpret buffer to represent actual data layout + auto actual_mem = mem->get_engine()->reinterpret_buffer(*mem, data_layout); + + auto mem_dt = actual_mem->get_layout().data_type; + if (mem_dt == cldnn::data_types::f32) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::f16) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i64) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i32) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::u8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::u8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i4 || mem_dt == cldnn::data_types::u4) + dump_i4u4(mem_dt, actual_mem, stream, file_stream, dump_raw); + else + std::cout << "Dump for this data type is not supported: " << dt_to_str(mem_dt) << std::endl; +} + +} // namespace + +static std::string get_file_path_for_binary_dump(cldnn::layout layout, std::string name) { + std::string filename; + std::string data_type = ov::element::Type(layout.data_type).get_type_name(); + std::string format = layout.format.to_string(); + std::string tensor; + auto dims = layout.get_dims(); + for (size_t r = 0 ; r < layout.get_rank() ; r++) { + tensor += ("_" + to_string(dims[r])); + } + +#ifdef GPU_DEBUG_CONFIG + GPU_DEBUG_GET_INSTANCE(debug_config); + std::string layer_name = debug_config->get_name_for_dump(name); + filename = debug_config->dump_layers_path + layer_name + + "__" + data_type + "_" + tensor + "__" + format + ".bin"; +#endif + return filename; +} + +NodeDebugHelper::NodeDebugHelper(const primitive_inst& inst) + : m_inst(inst) + , m_stream(inst.get_network().get_stream()) + , m_network(inst.get_network()) + , m_program(inst.get_network().get_program().get()) + , m_iter(m_network.iteration) { + // Load binary dump for input layers + if (!debug_config->load_layers_raw_dump.empty()) { + const std::string 
layer_name = m_inst.id(); + auto files = debug_config->get_filenames_for_matched_layer_loading_binaries(layer_name); + if (!files.empty()) { + if (m_inst.is_input()) { + // Loading binary dumps for output tensors of input-layers : only one output exists or index(dstN) exists + auto dump_file = debug_config->get_matched_from_filelist(files, "_dst0__"); + OPENVINO_ASSERT((files.size() == 1 || dump_file.length() != 0), "Unexpected binary dump for input layer"); + + OPENVINO_ASSERT(files.size() == m_inst.outputs_memory_count(), "Mis-match dump file count"); + + for (size_t i = 0; i < m_inst.outputs_memory_count(); i++) { + auto dump_file = files[0]; + if (files.size() > 1 || m_inst.outputs_memory_count() != 1) { + std::string pattern = "_dst" + std::to_string(i) + "__"; + dump_file = debug_config->get_matched_from_filelist(files, pattern); + } + OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_dst[N]__' for binary dump"); + GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for " << layer_name << std::endl; + + std::vector bin = ov::util::load_binary(dump_file); + OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); + + auto output_mem = m_inst.output_memory_ptr(i); + OPENVINO_ASSERT(output_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name + + "\n Expected size : " + to_string(output_mem->size()) + ", Binary : " + to_string(bin.size())); + + output_mem->copy_from(m_stream, static_cast(&bin[0]), true); + } + } else { + auto check_dst = debug_config->get_matched_from_filelist(files, "_dst0__"); + OPENVINO_ASSERT(check_dst.length() == 0, "Expected to load binaries for inputs of " + layer_name); + + // Loading input tensors for any layer + auto dump_file = debug_config->get_matched_from_filelist(files, "_src0__"); + OPENVINO_ASSERT(dump_file.length() != 0, "Could not find expected pattern '_src[N]__' for binary dump input : " + layer_name); + + for (size_t i = 0; i < m_inst.dependencies().size(); i++) { + auto dump_file = files[0]; + if (files.size() > 1 || m_inst.dependencies().size() != 1) { + std::string pattern = "_src" + std::to_string(i) + "__"; + dump_file = debug_config->get_matched_from_filelist(files, pattern); + } + if (dump_file.length() == 0) { + GPU_DEBUG_COUT << " Skip loading for input(" << i << ") of " << layer_name << std::endl; + continue; + } + OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_src[N]__' for binary dump input"); + GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for input(" << i << ") of " << layer_name << std::endl; + + std::vector bin = ov::util::load_binary(dump_file); + OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); + + auto input_mem = m_inst.dep_memory_ptr(i); + if (input_mem->size() != bin.size()) { + std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name + << " " << input_mem->size() << " / " << bin.size() << std::endl; + bin.resize(input_mem->size()); + } + + input_mem->copy_from(m_stream, static_cast(&bin[0]), true); + } + } + } + } + + // Dump input buffers of 'inst' + if (debug_config->dump_layers_path.length() > 0) { + const std::string layer_name = inst.id(); + + if (debug_config->is_target_iteration(m_iter) && + debug_config->dump_layers_dst_only == 0 && debug_config->is_layer_for_dumping(layer_name)) { + std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" 
+ layer_name + ":"; + for (size_t i = 0; i < m_inst.dependencies().size(); i++) { + std::string name = get_file_prefix() + layer_name + "_src" + std::to_string(i); + auto input_mem = m_inst.dep_memory_ptr(i); + if (input_mem == nullptr) { + GPU_DEBUG_COUT << " input_mem_" << i << " is nullptr. Nothing to dump." << std::endl; + continue; + } + + auto dep = m_inst.dependencies().at(i); + auto input_layout = dep.first->get_output_layout(dep.second); + GPU_DEBUG_IF(debug_config->dump_layers_binary) { + // Binary dump : raw + auto filename = get_file_path_for_binary_dump(input_layout, name); + + mem_lock lock(input_mem, m_stream); + ov::util::save_binary(filename, lock.data(), input_mem->size()); + GPU_DEBUG_COUT << " Dump layer src : " << layer_name << " to " << filename << std::endl; + debug_str_for_bin_load += (filename + ","); + } else { + log_memory_to_file(input_mem, + input_layout, + m_stream, + name, + debug_config->dump_layers_raw); + } + } + + if (debug_config->dump_layers_binary && !inst.is_input()) { + debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; + GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl; + } + } + } +} + + +NodeDebugHelper::~NodeDebugHelper() { + // Dump output buffers of 'inst' + if (debug_config->dump_layers_path.length() > 0) { + m_stream.finish(); + const std::string layer_name = m_inst.id(); + + GPU_DEBUG_IF(debug_config->is_target_iteration(m_iter) && + debug_config->is_layer_for_dumping(layer_name, m_inst.is_output(), m_inst.is_input())) { + std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + + layer_name + ":"; + for (size_t i = 0; i < m_inst.outputs_memory_count(); i++) { + std::string name = get_file_prefix() + "_dst" + std::to_string(i); + auto output_mem = m_inst.output_memory_ptr(i); + if (output_mem == nullptr) { + GPU_DEBUG_COUT << " output_mem is nullptr. Nothing to dump." 
<< std::endl; + continue; + } + + GPU_DEBUG_IF(debug_config->dump_layers_binary) { + // Binary dump : raw + auto output_layout = m_inst.get_output_layout(i); + auto filename = get_file_path_for_binary_dump(output_layout, name); + + mem_lock lock(output_mem, m_stream); + ov::util::save_binary(filename, lock.data(), output_mem->size()); + GPU_DEBUG_COUT << " Dump layer dst : " << layer_name << " to " << filename << std::endl; + debug_str_for_bin_load += (filename + ","); + } else { + // Text dump + log_memory_to_file(output_mem, m_inst.get_output_layout(i), m_stream, name, debug_config->dump_layers_raw); + } + } + + GPU_DEBUG_IF(debug_config->dump_layers_binary && m_inst.is_input()) { + debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; + GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;; + } + } + } +} + +NetworkDebugHelper::NetworkDebugHelper(const network& net) + : m_network(net) + , m_iter(net.iteration) { + auto net_id = m_network.get_id(); + GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) { + auto& iters = debug_config->dump_memory_pool_iters; + if (iters.empty() || iters.find(m_iter) != iters.end()) { + GPU_DEBUG_COUT << "============================================================================" << std::endl; + GPU_DEBUG_COUT << "Start network execution (net_id : " << net_id << ", iter :" << m_iter << ")" << std::endl; + if (m_iter == 0 && net_id > 0) { + dump_memory_pool(debug_config->dump_memory_pool_path, m_iter); + GPU_DEBUG_COUT << "============================================================================" << std::endl; + } + } + } else { + GPU_DEBUG_TRACE << "============================================================================" << std::endl; + GPU_DEBUG_TRACE << "Start network execution (net_id : " << net_id << ", iter :" << m_iter << ")" << std::endl; + } + + if (debug_config->list_layers == 1) { + for (auto& inst : m_network._exec_order) { + GPU_DEBUG_COUT << inst->id() << std::endl; + if (inst->get_node().is_type()) { + auto& loop_node = inst->get_node().as(); + for (auto& prim : loop_node.get_body_program()->get_processing_order()) { + GPU_DEBUG_COUT << "\t" << prim->id() << std::endl; + } + } else if (inst->get_node().is_type()) { + auto& cond_node = inst->get_node().as(); + GPU_DEBUG_COUT << "* Branch_True" << std::endl; + for (auto& prim : cond_node.get_branch_true().inner_program->get_processing_order()) { + GPU_DEBUG_COUT << "\t" << prim->id() << std::endl; + } + GPU_DEBUG_COUT << "* Branch_False" << std::endl; + for (auto& prim : cond_node.get_branch_false().inner_program->get_processing_order()) { + GPU_DEBUG_COUT << "\t" << prim->id() << std::endl; + } + } + } + + if (!m_network.is_internal()) + exit(0); + } +} + +NetworkDebugHelper::~NetworkDebugHelper() { + auto prog = m_network.get_program().get(); + auto net_id = m_network.get_id(); + // print '-data_shape' option for benchmark_app + if (debug_config->print_input_data_shapes == 1) { + std::stringstream data_shape_str; + auto add_string = [&data_shape_str](std::string str) { + data_shape_str << ((data_shape_str.rdbuf()->in_avail() == 0) ? " -data_shape " : ",") << str; + }; + + for (auto& inst : m_network._exec_order) { + auto name = inst->id(); + auto pos = name.find(':'); + auto type = name.substr(0, pos); + name.erase(0, pos + 1); + if (inst->is_input() && type == "parameter") { + add_string(name + inst->get_output_layout().get_partial_shape().to_string()); + } + } + + GPU_DEBUG_COUT << "[program:" << std::setw(2) << ((prog != nullptr) ? 
prog->get_id() : 0) + << "|network:" << std::setw(2) << net_id << "|iter:" << std::setw(4) << m_iter << "] benchmark_app cmd: " + << data_shape_str.str() << std::endl; + } + + if (!debug_config->dump_graphs.empty() && debug_config->is_target_iteration(m_iter)) { + auto get_fixed_str = [](int value, int length = 2) -> std::string { + std::ostringstream ss; + ss << std::setw(length) << std::setfill('0') << std::to_string(value); + return ss.str(); + }; + std::string path = get_dir_path(m_network.get_config()); + if (!path.empty()) { + std::ofstream ofs(path + "cldnn_program_exec_p" + get_fixed_str(prog->get_id()) + "_n" + get_fixed_str(net_id) + + "_" + get_fixed_str(m_iter, 5) + ".graph"); + dump_graph_init(ofs, *prog, [this](const primitive_id& id) -> std::shared_ptr { + return m_network.get_primitive(id); + }); + } + } + + if (debug_config->dump_memory_pool > 0) { + auto& iters = debug_config->dump_memory_pool_iters; + if (iters.empty() || iters.find(m_iter) != iters.end()) { + dump_memory_pool(debug_config->dump_memory_pool_path, m_iter); + GPU_DEBUG_COUT << "============================================================================" << std::endl; + } + } + + m_network.iteration++; +} + +void NetworkDebugHelper::dump_memory_pool(std::string dump_path, int64_t curr_iter) const { + m_network.get_memory_pool().dump(m_network.get_id(), curr_iter, dump_path); + auto get_constants_mem_size = [&](allocation_type type) -> size_t { + size_t mem_size = 0; + for (auto& prim : m_network._primitives) { + if (prim.second->get_node().is_constant()) { + for (size_t i = 0; i < prim.second->outputs_memory_count(); i++) { + if (prim.second->output_memory_ptr(i)->get_allocation_type() == type) + mem_size += prim.second->output_memory_ptr(i)->size(); + } + } + } + return mem_size; + }; + auto get_variables_mem_size = [&](allocation_type type) -> size_t { + size_t mem_size = 0; + for (auto& var : m_network.get_variables()) { + if (var.second->get_memory() && var.second->get_memory()->get_allocation_type() == type) + mem_size += var.second->get_actual_mem_size(); + } + return mem_size; + }; + auto get_mb_size = [&](int64_t size) -> std::string { + if (size == 0) return "0 MB"; + return std::to_string(static_cast(size) / (1024 * 1024)) + " MB"; + }; + int64_t usm_host_const_mem_size = get_constants_mem_size(allocation_type::usm_host); + int64_t usm_device_const_mem_size = get_constants_mem_size(allocation_type::usm_device); + int64_t usm_host_var_mem_size = get_variables_mem_size(allocation_type::usm_host); + int64_t usm_device_var_mem_size = get_variables_mem_size(allocation_type::usm_device); + int64_t host_mem_size = m_network.get_engine().get_used_device_memory(allocation_type::usm_host); + int64_t device_mem_size = m_network.get_engine().get_used_device_memory(allocation_type::usm_device); + int64_t usm_host_mem_pool_size = m_network.get_memory_pool().get_total_mem_pool_size(allocation_type::usm_host); + int64_t usm_host_etc_size = host_mem_size - usm_host_mem_pool_size + - usm_host_const_mem_size - usm_host_var_mem_size; + int64_t usm_device_mem_pool_size = m_network.get_memory_pool().get_total_mem_pool_size(allocation_type::usm_device); + int64_t usm_device_etc_size = device_mem_size - usm_device_mem_pool_size + - usm_device_const_mem_size - usm_device_var_mem_size; + GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; + GPU_DEBUG_COUT << "Memory statistics for (net_id:" << m_network.get_id() << ", iter:" << curr_iter << ")" << std::endl; + 
GPU_DEBUG_COUT << " Total host mem size : " << get_mb_size(host_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_host_mem_pool_size) << std::endl; + GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_host_const_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_host_var_mem_size) << std::endl; + GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_host_etc_size) << std::endl; + GPU_DEBUG_COUT << " Total device mem size : " << get_mb_size(device_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_device_mem_pool_size) << std::endl; + GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_device_const_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_device_var_mem_size) << std::endl; + GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_device_etc_size) << std::endl; + GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; +} + +} // namespace cldnn + +#endif // GPU_DEBUG_CONFIG diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.hpp b/src/plugins/intel_gpu/src/graph/debug_helper.hpp new file mode 100644 index 00000000000000..c7c6bd006af1db --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/debug_helper.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/graph/program.hpp" +#include "intel_gpu/runtime/stream.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" +#include "primitive_inst.h" + +namespace cldnn { + +#ifdef GPU_DEBUG_CONFIG + +class NodeDebugHelper { +public: + NodeDebugHelper(const primitive_inst& inst); + ~NodeDebugHelper(); + +private: + std::string get_iteration_prefix() { + if (m_iter < 0) + return std::string(""); + return std::to_string(m_iter) + "_"; + } + + std::string get_file_prefix() { + auto prog_id = ((m_program != nullptr) ? m_program->get_id() : 0); + auto net_id = m_network.get_id(); + + return "program" + std::to_string(prog_id) + "_network" + std::to_string(net_id) + "_" + get_iteration_prefix() + m_inst.id(); + } + + + const primitive_inst& m_inst; + stream& m_stream; + const network& m_network; + const program* m_program; + const size_t m_iter; + + const debug_configuration* debug_config = cldnn ::debug_configuration ::get_instance(); +}; + +class NetworkDebugHelper { +public: + NetworkDebugHelper(const network& net); + ~NetworkDebugHelper(); + +private: + void dump_memory_pool(std::string dump_path, int64_t curr_iter) const; + const network& m_network; + const size_t m_iter; + + const debug_configuration* debug_config = cldnn ::debug_configuration ::get_instance(); +}; + +#define NETWORK_DEBUG(net) NetworkDebugHelper __network_debug_helper(net) +#define NODE_DEBUG(inst) NodeDebugHelper __node_debug_helper(inst) + +#else + +#define NETWORK_DEBUG(...) +#define NODE_DEBUG(...) 
+ +#endif // GPU_DEBUG_CONFIG + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h b/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h index 075422a4196b38..cf5111de6b247e 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h +++ b/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h @@ -14,6 +14,6 @@ std::string get_dir_path(const ExecutionConfig& config); void dump_graph_optimized(std::ofstream&, const program&); void dump_graph_processing_order(std::ofstream&, const program&); void dump_graph_init(std::ofstream&, const program&, - std::function(const primitive_id&)> get_primitive_inst = nullptr); + std::function(const primitive_id&)> get_primitive_inst = nullptr); void dump_graph_info(std::ofstream&, const program&); } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index ae64846a0c9b5e..57f2fb41c7cc06 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1089,11 +1089,13 @@ format layout_optimizer::get_expected_format(quantize_node const& node) { auto use_onednn_impls = _optimization_attributes.use_onednn_impls; if (use_onednn_impls) { - auto& user = node.get_users().front(); - if (user != nullptr && user->get_preferred_input_fmt(user->get_dependency_index(node)) != format::any) { - expected = user->get_preferred_input_fmt(user->get_dependency_index(node)); - } else { - expected = format::any; + expected = format::any; + auto& users = node.get_users(); + if (users.size() != 0) { + auto& user = users.front(); + if (user != nullptr && user->get_preferred_input_fmt(user->get_dependency_index(node)) != format::any) { + expected = user->get_preferred_input_fmt(user->get_dependency_index(node)); + } } } else if (only_gemm_users(node)) { // TODO: Gemm is not supporting fsv layouts diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 92d62782828d78..8f0e97dd51ee12 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -4,7 +4,6 @@ #include "intel_gpu/plugin/variable_state.hpp" #include "intel_gpu/primitives/read_value.hpp" -#include "openvino/util/file_util.hpp" #include "intel_gpu/primitives/data.hpp" #include "intel_gpu/primitives/mutable_data.hpp" @@ -31,13 +30,10 @@ #include "deconvolution_inst.h" #include "mutable_data_inst.h" #include "condition_inst.h" -#include "loop_inst.h" -#include "assign_inst.h" #include "read_value_inst.h" #include "reshape_inst.h" #include "kv_cache_inst.h" #include "program_helpers.h" -#include "to_string_utils.h" #include "program_dump_graph.h" #include @@ -51,8 +47,8 @@ #include #include +#include "debug_helper.hpp" #ifdef GPU_DEBUG_CONFIG -#include #include #include #include @@ -60,7 +56,6 @@ #endif namespace cldnn { - namespace { #ifdef GPU_DEBUG_CONFIG @@ -143,179 +138,6 @@ void dump_perf_data_raw(std::string dump_path, const std::list(i); } -float convert_element(int32_t i) { return static_cast(i); } - -float convert_element(float f) { return f; } - -float convert_element(ov::float16 h) { return static_cast(h); } - -size_t get_x_pitch(const layout& layout) { - try { - auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0)); - auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0)); - auto x0 = layout.get_linear_offset(tensor_x0); - auto x1 = layout.get_linear_offset(tensor_x1); 
- return (x1 - x0); - } catch (...) { - // When spatial size of x=0, x_pitch is meaningless - return 0; - } -} - -template -void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { - auto&& size = mem->get_layout().get_tensor(); - - GPU_DEBUG_GET_INSTANCE(debug_config); - auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); - tensor tmp_size(size); - tmp_size.batch[0] = batch_size; - if (tmp_size == size) { - file_stream << "shape: " << size.to_string() << " "; - file_stream << "(count: " << size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" - << (dump_raw ? " raw data" : "") << std::endl; - } else { - file_stream << "shape: " << tmp_size.to_string() << " "; - file_stream << "(count: " << tmp_size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) - << ", original shape: " << size.to_string() << ")" - << (dump_raw ? " raw data" : "") << std::endl; - } - - if (size.count() == 0) { - file_stream << "Empty buffer" << std::endl; - return; - } - - mem_lock lock(mem, stream); - auto mem_ptr = lock.data(); - auto x_pitch = get_x_pitch(mem->get_layout()); - std::stringstream buffer; - - if (!dump_raw) { - for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) { - for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) { - for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) { - for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) { - for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) { - for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) { - cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w)); - size_t input_it = mem->get_layout().get_linear_offset(t); - - for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) { - buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl; - } - } - } - } - } - } - } - } else { - for (size_t i = 0; i < lock.size(); ++i) { - buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl; - } - } - file_stream << buffer.str(); -} - -void unpack(cldnn::data_types type, uint8_t input, int8_t &v0, int8_t &v1) { - if (type == cldnn::data_types::i4) { - char s_bit = (input & 0x08); - char mask = s_bit > 0 ? 0xF0 : 0x00; - v0 = (input & 0x0F) | mask; - - input >>= 4; - s_bit = (input & 0x08); - mask = s_bit > 0 ? 0xF0 : 0x00; - v1 = (input & 0x0F) | mask; - } else if (type == cldnn::data_types::u4) { - v0 = input & 0x0F; - v1 = input >> 4; - } else { - OPENVINO_ASSERT(false, "not supported unpacking"); - } -} - -void dump_i4u4(cldnn::data_types type, memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { - auto&& size = mem->get_layout().get_tensor(); - - GPU_DEBUG_GET_INSTANCE(debug_config); - auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); - tensor tmp_size(size); - tmp_size.batch[0] = batch_size; - if (tmp_size == size) { - file_stream << "shape: " << size.to_string() << " "; - file_stream << "(count: " << size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" - << (dump_raw ? 
" raw data" : "") << std::endl; - } else { - file_stream << "shape: " << tmp_size.to_string() << " "; - file_stream << "(count: " << tmp_size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) - << ", original shape: " << size.to_string() << ")" - << (dump_raw ? " raw data" : "") << std::endl; - } - - if (size.count() == 0) { - file_stream << "Empty buffer" << std::endl; - return; - } - - mem_lock lock(mem, stream); - auto mem_ptr = lock.data(); - std::stringstream buffer; - - if (dump_raw) { - for (size_t i = 0; i < lock.size(); ++i) { - int8_t v0, v1; - unpack(type, mem_ptr[i], v0, v1); - buffer << std::fixed << std::setprecision(6) << static_cast(v0) << std::endl; - buffer << std::fixed << std::setprecision(6) << static_cast(v1) << std::endl; - } - } else { - std::cout << __func__ << " supports raw dump only" << std::endl; - } - file_stream << buffer.str(); -} - -void log_memory_to_file(memory::ptr mem, layout data_layout, stream& stream, std::string layerName, bool dump_raw) { - std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl; - GPU_DEBUG_GET_INSTANCE(debug_config); - std::string filename = debug_config->get_name_for_dump(layerName); - filename = debug_config->dump_layers_path + filename + ".txt"; - std::ofstream file_stream(filename); - if (!mem) { - file_stream << "Empty" << std::endl; - return; - } - - // Reinterpret buffer to represent actual data layout - auto actual_mem = mem->get_engine()->reinterpret_buffer(*mem, data_layout); - - auto mem_dt = actual_mem->get_layout().data_type; - if (mem_dt == cldnn::data_types::f32) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::f16) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i64) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i32) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::u8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::u8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i4 || mem_dt == cldnn::data_types::u4) - dump_i4u4(mem_dt, actual_mem, stream, file_stream, dump_raw); - else - std::cout << "Dump for this data type is not supported: " << dt_to_str(mem_dt) << std::endl; -} - void wait_for_the_turn() { GPU_DEBUG_GET_INSTANCE(debug_config); bool need_to_wait; @@ -336,7 +158,6 @@ void wait_for_the_turn() { #else void dump_perf_data_raw(std::string, const std::list>&) {} -void log_memory_to_file(memory::ptr, layout, stream&, std::string, bool dump_raw) {} void wait_for_the_turn() {} #endif } // namespace @@ -346,25 +167,6 @@ static uint32_t get_unique_net_id() { return ++id_gen; } -static std::string get_file_path_for_binary_dump(cldnn::layout layout, std::string name) { - std::string filename; - std::string data_type = ov::element::Type(layout.data_type).get_type_name(); - std::string format = layout.format.to_string(); - std::string tensor; - auto dims = layout.get_dims(); - for (size_t r = 0 ; r < layout.get_rank() ; r++) { - tensor += ("_" + to_string(dims[r])); - } - -#ifdef GPU_DEBUG_CONFIG - GPU_DEBUG_GET_INSTANCE(debug_config); - std::string layer_name = debug_config->get_name_for_dump(name); - filename = debug_config->dump_layers_path + layer_name - + "__" + data_type + "_" + tensor + "__" + format + ".bin"; 
-#endif
-    return filename;
-}
-
 /*
 Network will always have net_id = 0 when it will be cldnn internal micronetwork (created i.e by propagate_constants
 opt pass).
@@ -939,28 +741,10 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event::ptr>& events) {
     OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "NetworkImpl::Execute");
-    int64_t curr_iter = -1;
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-#ifdef GPU_DEBUG_CONFIG
-    curr_iter = iteration;
-#endif
+    NETWORK_DEBUG(*this);
     // Wait for previous execution completion
     reset_execution(false);
-    GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) {
-        auto& iters = debug_config->dump_memory_pool_iters;
-        if (iters.empty() || iters.find(curr_iter) != iters.end()) {
-            GPU_DEBUG_COUT << "============================================================================" << std::endl;
-            GPU_DEBUG_COUT << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
-            if (curr_iter == 0 && get_id() > 0) {
-                dump_memory_pool(debug_config->dump_memory_pool_path, curr_iter);
-                GPU_DEBUG_COUT << "============================================================================" << std::endl;
-            }
-        }
-    } else {
-        GPU_DEBUG_TRACE << "============================================================================" << std::endl;
-        GPU_DEBUG_TRACE << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
-    }
     std::vector<memory::ptr> in_out_mem;
     auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
@@ -996,33 +780,6 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
     auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream());
     set_arguments();
-    GPU_DEBUG_IF(debug_config->list_layers == 1) {
-        for (auto& inst : _exec_order) {
-            GPU_DEBUG_COUT << inst->id() << std::endl;
-            if (inst->get_node().is_type<loop>()) {
-                auto& loop_node = inst->get_node().as<loop>();
-                for (auto& prim : loop_node.get_body_program()->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-            } else if (inst->get_node().is_type<condition>()) {
-                auto& cond_node = inst->get_node().as<condition>();
-                GPU_DEBUG_COUT << "* Branch_True" << std::endl;
-                for (auto& prim : cond_node.get_branch_true().inner_program->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-                GPU_DEBUG_COUT << "* Branch_False" << std::endl;
-                for (auto& prim : cond_node.get_branch_false().inner_program->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-            }
-        }
-        if (!is_internal()) exit(0);
-    }
-    auto get_iteration_prefix = [](int64_t iter) {
-        if (iter < 0)
-            return std::string("");
-        return std::to_string(iter) + "_";
-    };
     // This extra flush command is needed for dynamic models in both cases of out_of_order / in_order operating mode
     // since it reduces `bubbles` number in pipeline and GPU's idle time by timely flushing new kernels to device.
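
Note: the scattered GPU_DEBUG_IF blocks removed in this file are folded into the new NETWORK_DEBUG(*this) / NODE_DEBUG(*inst) helpers. The snippet below is only an illustrative sketch of how such a macro can be backed by a RAII object; the class name, members, and output format are assumptions for illustration, not the actual helper added by this patch.

// Hypothetical sketch only: class name, members and messages are illustrative,
// not the actual debug helper introduced by this patch.
#include <cstdint>
#include <iostream>

// Minimal stand-in for cldnn::network with just the state the helper touches.
struct network_stub {
    uint32_t net_id = 0;
    int64_t iteration = 0;
    void dump_memory_pool() const { std::cout << "  (memory pool dump would go here)\n"; }
};

// RAII debug scope: the constructor prints the "start execution" banner that the removed
// GPU_DEBUG_IF block used to print, and the destructor performs the end-of-iteration
// bookkeeping (memory-pool dump, iteration counter) formerly done inline in execute_impl().
class NetworkDebugHelper {
public:
    explicit NetworkDebugHelper(network_stub& net) : m_net(net) {
        std::cout << "============================================================\n"
                  << "Start network execution (net_id : " << m_net.net_id
                  << ", iter : " << m_net.iteration << ")\n";
    }
    ~NetworkDebugHelper() {
        m_net.dump_memory_pool();
        ++m_net.iteration;
    }
private:
    network_stub& m_net;
};

// The macro used in execute() can then expand to a scoped helper instance.
#define NETWORK_DEBUG(net) NetworkDebugHelper network_debug_helper_(net)

int main() {
    network_stub net;
    {
        NETWORK_DEBUG(net);  // banner printed here; dump + iteration bump when the scope ends
    }
    return net.iteration == 1 ? 0 : 1;
}

With this shape, execute() only needs the single NETWORK_DEBUG(*this) statement shown in the hunk above, and the banner printing and per-iteration bookkeeping live in one place instead of being spread through execute() and execute_impl().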
@@ -1033,233 +790,43 @@ void network::execute_impl(const std::vector& events) { size_t executed_prims = 0; for (auto& inst : _exec_order) { - // Load binary dump for input layers - GPU_DEBUG_IF(!debug_config->load_layers_raw_dump.empty()) { - const std::string layer_name = inst->id(); - auto files = debug_config->get_filenames_for_matched_layer_loading_binaries(layer_name); - if (!files.empty()) { - if (inst->is_input()) { - // Loading binary dumps for output tensors of input-layers : only one output exists or index(dstN) exists - auto dump_file = debug_config->get_matched_from_filelist(files, "_dst0__"); - OPENVINO_ASSERT((files.size() == 1 || dump_file.length() != 0), "Unexpected binary dump for input layer"); - - OPENVINO_ASSERT(files.size() == get_primitive(inst->id())->outputs_memory_count(), "Mis-match dump file count"); - - for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) { - auto dump_file = files[0]; - if (files.size() > 1 || get_primitive(inst->id())->outputs_memory_count() != 1) { - std::string pattern = "_dst" + std::to_string(i) + "__"; - dump_file = debug_config->get_matched_from_filelist(files, pattern); - } - OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_dst[N]__' for binary dump"); - GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for " << layer_name << std::endl; - - std::vector bin = ov::util::load_binary(dump_file); - OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); - - auto output_mem = get_primitive(layer_name)->output_memory_ptr(i); - OPENVINO_ASSERT(output_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name - + "\n Expected size : " + to_string(output_mem->size()) + ", Binary : " + to_string(bin.size())); - - output_mem->copy_from(get_stream(), static_cast(&bin[0]), true); - } - } else { - auto check_dst = debug_config->get_matched_from_filelist(files, "_dst0__"); - OPENVINO_ASSERT(check_dst.length() == 0, "Expected to load binaries for inputs of " + layer_name); - - // Loading input tensors for any layer - auto dump_file = debug_config->get_matched_from_filelist(files, "_src0__"); - OPENVINO_ASSERT(dump_file.length() != 0, "Could not find expected pattern '_src[N]__' for binary dump input : " + layer_name); - - for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) { - auto dump_file = files[0]; - if (files.size() > 1 || get_primitive(inst->id())->dependencies().size() != 1) { - std::string pattern = "_src" + std::to_string(i) + "__"; - dump_file = debug_config->get_matched_from_filelist(files, pattern); - } - if (dump_file.length() == 0) { - GPU_DEBUG_COUT << " Skip loading for input(" << i << ") of " << layer_name << std::endl; - continue; - } - OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_src[N]__' for binary dump input"); - GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for input(" << i << ") of " << layer_name << std::endl; - - std::vector bin = ov::util::load_binary(dump_file); - OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); - - auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i); - if (input_mem->size() != bin.size()) { - std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name - << " " << input_mem->size() << " / " << bin.size() << std::endl; - bin.resize(input_mem->size()); - } - - input_mem->copy_from(get_stream(), 
static_cast(&bin[0]), true); - } - } - } - } - - // Dump input buffers of 'inst' - GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) { - const std::string layer_name = inst->id(); - - GPU_DEBUG_IF(debug_config->is_target_iteration(curr_iter) && - debug_config->dump_layers_dst_only == 0 && debug_config->is_layer_for_dumping(layer_name)) { - std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + layer_name + ":"; - for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) { - std::string name = "program" + std::to_string((get_program() != nullptr) ? get_program()->get_id() : 0) + - "_network" + std::to_string(get_id()) + - "_" + get_iteration_prefix(curr_iter) + - layer_name + "_src" + std::to_string(i); - auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i); - if (input_mem == nullptr) { - GPU_DEBUG_COUT << " input_mem_" << i << " is nullptr. Nothing to dump." << std::endl; - continue; - } - - auto dep = inst->dependencies().at(i); - auto input_layout = dep.first->get_output_layout(dep.second); - GPU_DEBUG_IF(debug_config->dump_layers_binary) { - // Binary dump : raw - auto filename = get_file_path_for_binary_dump(input_layout, name); - - mem_lock lock(input_mem, get_stream()); - ov::util::save_binary(filename, lock.data(), input_mem->size()); - GPU_DEBUG_COUT << " Dump layer src : " << layer_name << " to " << filename << std::endl; - debug_str_for_bin_load += (filename + ","); - } else { - log_memory_to_file(input_mem, - input_layout, - get_stream(), - name, - debug_config->dump_layers_raw); - } - } - - GPU_DEBUG_IF(debug_config->dump_layers_binary && !inst->is_input()) { - debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; - GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;; - } - } - } + NODE_DEBUG(*inst); execute_primitive(inst, events); executed_prims++; if (needs_flushing && executed_prims % flush_frequency == 0) get_stream().flush(); - - // Dump output buffers of 'inst' - GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) { - get_stream().finish(); - const std::string layer_name = inst->id(); - auto prog_id = ((get_program() != nullptr) ? get_program()->get_id() : 0); - auto net_id = get_id(); - GPU_DEBUG_IF(debug_config->is_target_iteration(curr_iter) && - debug_config->is_layer_for_dumping(layer_name, inst->is_output(), inst->is_input())) { - std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" - + layer_name + ":"; - for (size_t i = 0; i < get_primitive(layer_name)->outputs_memory_count(); i++) { - std::string name = "program" + std::to_string(prog_id) + - "_network" + std::to_string(net_id) + - "_" + get_iteration_prefix(curr_iter) + - layer_name + "_dst" + std::to_string(i); - auto output_mem = get_primitive(layer_name)->output_memory_ptr(i); - if (output_mem == nullptr) { - GPU_DEBUG_COUT << " output_mem is nullptr. Nothing to dump." 
<< std::endl; - continue; - } - - GPU_DEBUG_IF(debug_config->dump_layers_binary) { - // Binary dump : raw - auto output_layout = inst->get_output_layout(i); - auto filename = get_file_path_for_binary_dump(output_layout, name); - - mem_lock lock(output_mem, get_stream()); - ov::util::save_binary(filename, lock.data(), output_mem->size()); - GPU_DEBUG_COUT << " Dump layer dst : " << layer_name << " to " << filename << std::endl; - debug_str_for_bin_load += (filename + ","); - } else { - // Text dump - log_memory_to_file(output_mem, inst->get_output_layout(i), get_stream(), name, debug_config->dump_layers_raw); - } - } - - GPU_DEBUG_IF(debug_config->dump_layers_binary && inst->is_input()) { - debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; - GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;; - } - } - } - } - - // print '-data_shape' option for benchmark_app - GPU_DEBUG_IF(debug_config->print_input_data_shapes == 1) { - std::stringstream data_shape_str; - auto add_string = [&data_shape_str](std::string str) { - data_shape_str << ((data_shape_str.rdbuf()->in_avail() == 0) ? " -data_shape " : ",") << str; - }; - - for (auto& inst : _exec_order) { - auto name = inst->id(); - auto pos = name.find(':'); - auto type = name.substr(0, pos); - name.erase(0, pos + 1); - if (inst->is_input() && type == "parameter") { - add_string(name + inst->get_output_layout().get_partial_shape().to_string()); - } - } - - GPU_DEBUG_COUT << "[program:" << std::setw(2) << ((get_program() != nullptr) ? get_program()->get_id() : 0) - << "|network:" << std::setw(2) << get_id() << "|iter:" << std::setw(4) << curr_iter << "] benchmark_app cmd: " - << data_shape_str.str() << std::endl; - } - - GPU_DEBUG_IF(!debug_config->dump_graphs.empty() && debug_config->is_target_iteration(curr_iter)) { - auto get_fixed_str = [](int value, int length = 2) -> std::string { - std::ostringstream ss; - ss << std::setw(length) << std::setfill('0') << std::to_string(value); - return ss.str(); - }; - std::string path = get_dir_path(get_config()); - if (!path.empty()) { - std::ofstream ofs(path + "cldnn_program_exec_p" + get_fixed_str(get_program()->get_id()) + "_n" + get_fixed_str(get_id()) - + "_" + get_fixed_str(curr_iter, 5) + ".graph"); - dump_graph_init(ofs, *get_program(), [&](const primitive_id& id) -> std::shared_ptr { - return get_primitive(id); - }); - } } // Store events only in case of OOO queue or enabled Profiling auto store_events = is_out_of_order_queue || _enable_profiling; if (store_events) { if (_program != nullptr) { - for (auto& inst : _program->get_processing_order()) { - // Special handling for mutable data. The event should be the same as the user or dependency with highest - // processing_num as the mutable_data can be updated when is both user or dependency. - if (inst->is_type()) { - decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0; - for (auto& user : inst->get_users()) { - auto user_proc_num = _program->get_processing_order().get_processing_number(user); - if (user_proc_num > proc_num) { - _events[inst->id()] = _events[user->id()]; - proc_num = user_proc_num; + for (auto& inst : _program->get_processing_order()) { + // Special handling for mutable data. The event should be the same as the user or dependency with highest + // processing_num as the mutable_data can be updated when is both user or dependency. 
+ if (inst->is_type()) { + decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0; + for (auto& user : inst->get_users()) { + auto user_proc_num = _program->get_processing_order().get_processing_number(user); + if (user_proc_num > proc_num) { + _events[inst->id()] = _events[user->id()]; + proc_num = user_proc_num; + } } - } - if (!inst->get_dependencies().empty()) { - for (auto& dep : inst->get_dependencies()) { - auto dep_proc_num = _program->get_processing_order().get_processing_number(dep.first); - if (dep_proc_num > proc_num) { - _events[inst->id()] = _events[dep.first->id()]; - proc_num = dep_proc_num; + if (!inst->get_dependencies().empty()) { + for (auto& dep : inst->get_dependencies()) { + auto dep_proc_num = _program->get_processing_order().get_processing_number(dep.first); + if (dep_proc_num > proc_num) { + _events[inst->id()] = _events[dep.first->id()]; + proc_num = dep_proc_num; + } } } } } } - } for (auto& dout : _data_outputs) { // data primitives are not executed so if they are marked as output we need to add // them valid events manually @@ -1278,73 +845,6 @@ void network::execute_impl(const std::vector& events) { // Deallocate events from the previos iteration _old_events.clear(); - - GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) { - auto& iters = debug_config->dump_memory_pool_iters; - if (iters.empty() || iters.find(curr_iter) != iters.end()) { - dump_memory_pool(debug_config->dump_memory_pool_path, curr_iter); - GPU_DEBUG_COUT << "============================================================================" << std::endl; - } - } - -#ifdef GPU_DEBUG_CONFIG - iteration++; -#endif -} - -void network::dump_memory_pool(std::string dump_path, int64_t curr_iter) { -#ifdef GPU_DEBUG_CONFIG - get_memory_pool().dump(get_id(), curr_iter, dump_path); - auto get_constants_mem_size = [&](allocation_type type) -> size_t { - size_t mem_size = 0; - for (auto& prim : _primitives) { - if (prim.second->get_node().is_constant()) { - for (size_t i = 0; i < prim.second->outputs_memory_count(); i++) { - if (prim.second->output_memory_ptr(i)->get_allocation_type() == type) - mem_size += prim.second->output_memory_ptr(i)->size(); - } - } - } - return mem_size; - }; - auto get_variables_mem_size = [&](allocation_type type) -> size_t { - size_t mem_size = 0; - for (auto& var : get_variables()) { - if (var.second->get_memory() && var.second->get_memory()->get_allocation_type() == type) - mem_size += var.second->get_actual_mem_size(); - } - return mem_size; - }; - auto get_mb_size = [&](int64_t size) -> std::string { - if (size == 0) return "0 MB"; - return std::to_string(static_cast(size) / (1024 * 1024)) + " MB"; - }; - int64_t usm_host_const_mem_size = get_constants_mem_size(allocation_type::usm_host); - int64_t usm_device_const_mem_size = get_constants_mem_size(allocation_type::usm_device); - int64_t usm_host_var_mem_size = get_variables_mem_size(allocation_type::usm_host); - int64_t usm_device_var_mem_size = get_variables_mem_size(allocation_type::usm_device); - int64_t host_mem_size = get_engine().get_used_device_memory(allocation_type::usm_host); - int64_t device_mem_size = get_engine().get_used_device_memory(allocation_type::usm_device); - int64_t usm_host_mem_pool_size = get_memory_pool().get_total_mem_pool_size(allocation_type::usm_host); - int64_t usm_host_etc_size = host_mem_size - usm_host_mem_pool_size - - usm_host_const_mem_size - usm_host_var_mem_size; - int64_t usm_device_mem_pool_size = 
get_memory_pool().get_total_mem_pool_size(allocation_type::usm_device); - int64_t usm_device_etc_size = device_mem_size - usm_device_mem_pool_size - - usm_device_const_mem_size - usm_device_var_mem_size; - GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; - GPU_DEBUG_COUT << "Memory statistics for (net_id:" << get_id() << ", iter:" << curr_iter << ")" << std::endl; - GPU_DEBUG_COUT << " Total host mem size : " << get_mb_size(host_mem_size) << std::endl; - GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_host_mem_pool_size) << std::endl; - GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_host_const_mem_size) << std::endl; - GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_host_var_mem_size) << std::endl; - GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_host_etc_size) << std::endl; - GPU_DEBUG_COUT << " Total device mem size : " << get_mb_size(device_mem_size) << std::endl; - GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_device_mem_pool_size) << std::endl; - GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_device_const_mem_size) << std::endl; - GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_device_var_mem_size) << std::endl; - GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_device_etc_size) << std::endl; - GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; -#endif } std::vector network::get_input_ids() const { diff --git a/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp b/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp index bff45cd81f9900..4a2f43b28d9360 100644 --- a/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp @@ -170,7 +170,7 @@ std::string get_dir_path(const ExecutionConfig& config) { void dump_graph_init(std::ofstream& graph, const program& program, - std::function(const primitive_id&)> get_primitive_inst) { + std::function(const primitive_id&)> get_primitive_inst) { const std::string invalid_layout_msg = "(invalid layout)"; const auto dump_mem_info = [&invalid_layout_msg, &get_primitive_inst](const program_node* ptr) { diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 3c9ad0f7317a27..21ba4e656fae0d 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -611,9 +611,9 @@ bool program_node::is_padded_spatial(size_t idx) const { auto& layout = get_output_layout(idx); const auto& lower_size = layout.data_padding._lower_size; const auto& upper_size = layout.data_padding._upper_size; - return std::any_of(std::begin(lower_size) + 2, std::begin(lower_size) + layout.get_spatial_rank() - 1, + return std::any_of(std::begin(lower_size) + 2, std::begin(lower_size) + 2 + layout.get_spatial_rank(), [](const tensor::value_type& el) { return el != 0; }) || - std::any_of(std::begin(upper_size) + 2, std::begin(upper_size) + layout.get_spatial_rank() - 1, + std::any_of(std::begin(upper_size) + 2, std::begin(upper_size) + 2 + layout.get_spatial_rank(), [](const tensor::value_type& el) { return el != 0; }); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 29d322d432dd35..57545b0df37cff 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ 
b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -809,7 +809,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET; #endif +#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + const int power_of_two_for_simd = 5; + const int power_of_two_for_osv = 6; + const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv); + const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1); + const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd; + // out_f(32) : 0 * osv_weight_stride + 32; + // out_f(64) : 64 * osv_weight_stride + 0; + // out_f(128) : 64 * osv_weight_stride + 32; + // ... + uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset; +#else uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2); +#endif ACCUMULATOR_VEC_TYPE acc[TILE_B] = { }; @@ -905,7 +918,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( __local int* char_slm_weight = (__local int*)wei_local_mem; + #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2; + #else uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + #endif uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2; // DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE @@ -917,6 +934,17 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. So no need to transpose while unpacking dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); + #elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); + SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD))); + DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked; + DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp; + dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); + dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); + dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01; + dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45; + dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23; + dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67; #else SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx); DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed)); @@ -996,11 +1024,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight); } - #if FILTER_LAYOUT_OS_IYX_OSV16 && TILE_OFM == 2 - weights_offset += (TILE_K_OFM_PACKED/2) * SIMD; - #else - weights_offset += TILE_K_OFM_PACKED * SIMD; - #endif + weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD; #if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE) unroll_for (uint bi = 0; bi < TILE_B; ++bi) { diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 5377387c8b497e..c4115d74f54a92 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -534,6 +534,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para size_t tile_k_ofm_packed = tile_k_ofm; size_t quantize_grp_size = get_dynamic_quantize_group_size(params); + bool add_decompress_scale_post_op = false; WeightsType weights_dt = params.weights.GetDType(); if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) { tile_k_ofm_packed /= 2; @@ -542,7 +543,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v; // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance if (scale_group_size % simd == 0 && !dispatchData.use_slm) - jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); + add_decompress_scale_post_op = true; } if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii")); @@ -619,6 +620,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size)); } else { + if (add_decompress_scale_post_op) + jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size)); } @@ -781,8 +784,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa auto output_f = get_output_aligned_bf_size(fc_params, false).second; WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16; - // TODO: Update may also be required to fc_bf_tiled_kernel_dyn_quan kernel to support os_is_yx_osv64_isv2 format as needed - if (!should_dynamic_quantize(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 + if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) && is_weight_horizontal(fc_params, output_f)) { diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 346b4471779593..88d69dcd3e47b3 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -34,9 +34,12 @@ namespace { inline bool can_use_usm_host(const cldnn::engine& engine) { auto can_use_usm = engine.use_unified_shared_memory(); - if (engine.get_device_info().gfx_ver.major == 12 && engine.get_device_info().gfx_ver.minor == 60) { - // WA: Disable USM host memory for infer request`s tensors for PVC as - // it has performance issues in case of host <-> device data transfers inside kernels + const auto& device_info = engine.get_device_info(); + if 
((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) || + (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu)) { + // WA: Disable USM host memory for infer request`s tensors for PVC and subsequent dGPUs, as kernel access + // to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine + // Driver tickets with additional details: 6155, 10054 GPU_DEBUG_TRACE << "Do not use usm_host for performance issue" << std::endl; can_use_usm = false; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 43cb5ec1aef931..563e99fcf2bad9 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -178,6 +178,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } auto process_params = [&](const ov::ParameterVector& _parameters) { for (size_t i = 0; i < _parameters.size(); i++) { + NPUW_ASSERT(_parameters[i]); LOG_VERB(_parameters[i]); for (size_t j = 0; j < orig_parameters.size(); j++) { if (_parameters[i] == orig_parameters[j]) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 22dfc6e103f719..192d975509ce5e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -4,6 +4,8 @@ #include "partitioning.hpp" +#include + #include "../logging.hpp" #include "../util.hpp" #include "intel_npu/al/config/npuw.hpp" @@ -20,6 +22,26 @@ #include "patterns/dcoff.hpp" #include "patterns/opt.hpp" +namespace ov { +namespace npuw { +inline bool operator==(const std::reference_wrapper& lhs, const std::reference_wrapper& rhs) { + ov::npuw::Subgraph& llink = lhs.get(); + ov::npuw::Subgraph& rlink = rhs.get(); + return &llink == &rlink; +} +} // namespace npuw +} // namespace ov + +template +struct std::hash> { + std::size_t operator()(std::pair const& p) const noexcept { + ov::npuw::Subgraph& sg = p.first.get(); + std::size_t h1 = std::hash{}(&sg); + std::size_t h2 = std::hash{}(p.second); + return h1 ^ (h2 << 1); + } +}; + namespace { class FuncallEverywhere { @@ -161,6 +183,8 @@ class Partitioner { using PPtr = std::shared_ptr; using RPtr = std::shared_ptr; + using SubgParam = std::pair; + using SubgResult = std::pair; using LinkPtrTo = std::pair param_call_to_proto; - std::unordered_map result_call_to_proto; + std::unordered_map param_call_to_proto; + std::unordered_map result_call_to_proto; }; std::map all_functions; @@ -203,7 +227,10 @@ class Partitioner { void createFunction(FunctionPipeline& func_ggg); template - void rearrange_to_function_protocol(const std::vector& protocol, std::vector& call, const M& call_to_proto) { + void rearrange_to_function_protocol(ov::npuw::Subgraph::Ref func_ref, + const std::vector& protocol, + std::vector& call, + const M& call_to_proto) { LOG_DEBUG("Rearranging..."); LOG_BLOCK(); LOG_DEBUG("Protocol: " << protocol.size()); @@ -215,7 +242,7 @@ class Partitioner { LOG_DEBUG("Call: " << call.size()); for (auto&& c : call) { LOG_BLOCK(); - auto p_c = call_to_proto.at(c); + auto p_c = call_to_proto.at(typename M::key_type(func_ref, c)); to_proto.push_back(p_c); LOG_DEBUG(c << " (which is " << p_c << ")"); } @@ -536,7 +563,7 @@ void Partitioner::identifySubgraphs() { LOG_VERB("Processing group's output layer " << 
output_layer_name); LOG_BLOCK(); auto output_layer_ptr = node_id_cache.at(output_layer_name); - if (output_layer_ptr->inputs().empty()) { + if (output_layer_ptr->outputs().empty()) { OPENVINO_THROW("The group's output layer ", output_layer_name, " has NO OUTPUTS!! - Graph contracts are broken??"); @@ -1327,9 +1354,12 @@ void Partitioner::matchParameters(const std::string& func_name) { // Now walk other submodels and match parameters with the same key // (yes, including the first one) - for (auto&& call : model_group) { + for (std::size_t call_id = 0; call_id < model_group.size(); ++call_id) { LOG_DEBUG("Handle function call..."); LOG_BLOCK(); + auto call = model_group[call_id]; + auto subg_ref = func.refs[call_id]; + std::unordered_set this_model_nodes; for (auto&& node_ptr : call->get_ordered_ops()) { this_model_nodes.insert(node_ptr.get()); @@ -1348,7 +1378,7 @@ void Partitioner::matchParameters(const std::string& func_name) { LOG_DEBUG("Find orig parameter for " << node); auto& orig_param = proto_parameters.at(pkey); auto this_param = std::dynamic_pointer_cast(node); - func.param_call_to_proto[this_param] = orig_param; + func.param_call_to_proto[SubgParam(subg_ref, this_param)] = orig_param; } } } @@ -1386,14 +1416,16 @@ void Partitioner::matchResults(const std::string& func_name) { // Now walk all submodels and match parameters with the same key // (yes, including the first one) - for (auto&& call : model_group) { + for (std::size_t call_idx = 0; call_idx < model_group.size(); ++call_idx) { + auto call = model_group[call_idx]; + auto subg_ref = func.refs[call_idx]; for (auto&& node : call->get_ordered_ops()) { if (ov::op::util::is_output(node)) { auto&& port = node->input(0).get_source_output(); RKey rkey = {layer_to_prototype.at(port.get_node()->get_friendly_name()), port.get_index()}; auto& orig_result = proto_results.at(rkey); auto this_result = std::dynamic_pointer_cast(node); - func.result_call_to_proto[this_result] = orig_result; + func.result_call_to_proto[SubgResult(subg_ref, this_result)] = orig_result; } } } @@ -1517,8 +1549,8 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) { funcall._gflops = this_sg._gflops; // duplicated code again! funcall._ops = this_sg._ops; // duplicated code again! funcall._avoid_list = this_sg._avoid_list; // duplicated code again! - rearrange_to_function_protocol(body_params, funcall._parameters, func_ggg.param_call_to_proto); - rearrange_to_function_protocol(body_results, funcall._results, func_ggg.result_call_to_proto); + rearrange_to_function_protocol(this_sg, body_params, funcall._parameters, func_ggg.param_call_to_proto); + rearrange_to_function_protocol(this_sg, body_results, funcall._results, func_ggg.result_call_to_proto); auto func_iter = P.functions.find(func_name); NPUW_ASSERT(func_iter != P.functions.end()); @@ -1883,7 +1915,7 @@ void Partitioner::finalizeLinks() { auto& params = P.functions.at(sg_desc._funcall)._model->get_parameters(); auto& proto = func_pipeline_type == FunctionPipelineType::CWAI ? ptr // no protos in the CWAI case.. 
- : all_functions.at(sg_desc._funcall).param_call_to_proto.at(ptr); + : all_functions.at(sg_desc._funcall).param_call_to_proto.at(SubgParam(sg_desc, ptr)); auto param_iter = std::find(params.begin(), params.end(), proto); NPUW_ASSERT(param_iter != params.end()); return std::distance(params.begin(), param_iter); @@ -1904,7 +1936,7 @@ void Partitioner::finalizeLinks() { auto& results = P.functions.at(sg_desc._funcall)._model->get_results(); auto& proto = func_pipeline_type == FunctionPipelineType::CWAI ? ptr // no protos in the CWAI case... - : all_functions.at(sg_desc._funcall).result_call_to_proto.at(ptr); + : all_functions.at(sg_desc._funcall).result_call_to_proto.at(SubgResult(sg_desc, ptr)); auto result_iter = std::find(results.begin(), results.end(), proto); NPUW_ASSERT(result_iter != results.end()); return std::distance(results.begin(), result_iter); diff --git a/src/plugins/intel_npu/tests/unit/CMakeLists.txt b/src/plugins/intel_npu/tests/unit/CMakeLists.txt index 5741a1e43c2a5b..861a0ff6a47076 100644 --- a/src/plugins/intel_npu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_npu/tests/unit/CMakeLists.txt @@ -34,12 +34,9 @@ ov_add_test_target( NPUW ) -if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") - target_compile_options(${TARGET_NAME} PRIVATE -mavx2 -mf16c) -elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - target_compile_options(${TARGET_NAME} PRIVATE /arch:AVX2) -else() - message(AUTHOR_WARNING "Unknown compiler, may miss the AVX2 baseline setting") +if(ENABLE_AVX2) + ov_avx2_optimization_flags(avx2_flags) + target_compile_options(${TARGET_NAME} PRIVATE "${avx2_flags}") endif() install(TARGETS ${TARGET_NAME} diff --git a/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp index 51285c8145ceb6..1049832f6ead7c 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#ifdef HAVE_AVX2 #include "unpack.hpp" namespace { @@ -98,3 +99,5 @@ INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPointTest3, UnpackTestsWithS UnpackTestsWithScaleAndZeroPointTest3::getTestCaseName); } // anonymous namespace + +#endif // __AVX2__ diff --git a/src/plugins/intel_npu/tools/single-image-test/main.cpp b/src/plugins/intel_npu/tools/single-image-test/main.cpp index 3b3009bb5f459c..4018982b022ed3 100644 --- a/src/plugins/intel_npu/tools/single-image-test/main.cpp +++ b/src/plugins/intel_npu/tools/single-image-test/main.cpp @@ -1200,7 +1200,8 @@ bool computeRRMSE(const ov::Tensor& output, const ov::Tensor& reference) { double rrmseLoss = sqrt(error / sum); - std::cout << "RRMSE loss : " << rrmseLoss << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl; + std::cout << "RRMSE loss : " << std::fixed << std::setprecision(4) << rrmseLoss + << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl; return rrmseLoss <= FLAGS_rrmse_loss_threshold; } @@ -1267,7 +1268,8 @@ bool computeNRMSE(const ov::Tensor& output, const ov::Tensor& reference) { double nrmseLoss = sqrt(error / size) / std::max(0.001f, std::max(maxOutput - minOutput, maxReference - minReference)); - std::cout << "NRMSE loss : " << nrmseLoss << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; + std::cout << "NRMSE loss : " << std::fixed << std::setprecision(4) << nrmseLoss + << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; return nrmseLoss <= FLAGS_nrmse_loss_threshold; } @@ -1319,7 +1321,7 
@@ bool testPSNR(const TensorMap& outputs, const TensorMap& references, const int d auto result = utils::runPSNRMetric(actOutput, refOutput, dstHeight, dstWidth, scaleBorder, normalizedImage); - if (std::fabs(result - FLAGS_psnr_reference) > FLAGS_psnr_tolerance) { + if (FLAGS_psnr_reference - result > FLAGS_psnr_tolerance) { std::cout << "Absolute difference between actual value " << result << " and reference value " << FLAGS_psnr_reference << " larger then tolerance " << FLAGS_psnr_tolerance << std::endl; return false; diff --git a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py index 1cf458500bcc71..e55a86f279de21 100644 --- a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py +++ b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py @@ -4,6 +4,8 @@ import numpy as np import pytest import torch +from packaging import version + from pytorch_layer_test_class import PytorchLayerTest, skip_if_export @@ -69,10 +71,12 @@ def forward_not_out(self, tensor_a, out): ) @pytest.mark.parametrize("out", [False, skip_if_export(True)]) def test_bitwise_mixed_dtypes( - self, op_type, out, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version + self, op_type, out, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version ): if ie_device == "GPU" and (lhs_dtype != "bool" or rhs_dtype != "bool"): pytest.xfail(reason="bitwise ops are not supported on GPU") + if out and version.parse(np.__version__) >= version.parse("2.0.0"): + pytest.xfail(reason="CVS-154082: incorrect handling out type") self._test( *self.create_model(op_type, out), ie_device, diff --git a/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py b/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py index f0f9085d32ba2f..e982867c9ac08d 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py @@ -6,6 +6,7 @@ import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = np.random.default_rng(62362) class TestExpandDims(CommonTFLayerTest): def _prepare_input(self, inputs_info): @@ -40,3 +41,54 @@ def test_expand_dims_basic(self, params, ie_device, precision, ir_version, temp_ self._test(*self.create_expand_dims_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) + + +class TestExpandDimsComplex(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + # generate elements so that the input tensor may contain repeating elements + assert 'param_real:0' in inputs_info + assert 'param_imag:0' in inputs_info + + input_shape = inputs_info['param_real:0'] + + inputs_data = {} + inputs_data['param_real:0'] = rng.integers(-10.0, 10.0, input_shape).astype(np.float32) + inputs_data['param_imag:0'] = rng.integers(-10.0, 10.0, input_shape).astype(np.float32) + + return inputs_data + + def create_expand_dims_complex_net(self, axis_dtype, input_shape, axis): + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + param_real = tf.compat.v1.placeholder(np.float32, input_shape, 'param_real') + param_imag = tf.compat.v1.placeholder(np.float32, input_shape, 'param_imag') + + complex = tf.raw_ops.Complex(real=param_real, imag=param_imag) + + axis = tf.constant(axis, dtype=axis_dtype) + + result = tf.raw_ops.ExpandDims(input=complex, axis=axis) + + tf.raw_ops.Real(input=result) + tf.raw_ops.Imag(input=result) + + tf.compat.v1.global_variables_initializer() + tf_net = 
sess.graph_def + + return tf_net, None + + test_basic = [ + dict(input_shape=[], axis=0), + dict(input_shape=[2, 3], axis=1), + dict(input_shape=[2, 3, 4], axis=-1), + dict(input_shape=[2, 6, 5], axis=-2), + ] + + @pytest.mark.parametrize("axis_dtype", [np.int32, np.int64]) + @pytest.mark.parametrize("op_args", test_basic) + @pytest.mark.nightly + @pytest.mark.precommit + def test_expand_dims_basic_complex(self, axis_dtype, op_args, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): + self._test(*self.create_expand_dims_complex_net(axis_dtype, **op_args), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend) diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index b82e0c76409057..0d5ac61903b104 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -1,10 +1,14 @@ +# test ovc with NumPy 2.x on Ubuntu 24 with default Python 3.12 +# test against NumPy 1.x with older Python versions # optimum still requires numpy<2.0.0 -numpy==1.26.4 +numpy==1.26.4; python_version < "3.12" +numpy==2.1.1; python_version >= "3.12" torch==2.4.1; platform_system != "Darwin" or platform_machine != "x86_64" -torch==2.2.0; platform_system == "Darwin" and platform_machine == "x86_64" +torch==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" --extra-index-url https://download.pytorch.org/whl/cpu -torchvision==0.19.1 +torchvision==0.19.1; platform_system != "Darwin" or platform_machine != "x86_64" +torchvision==0.17.2; platform_system == "Darwin" and platform_machine == "x86_64" # transformers 4.45.1 is available # but optimum still requires <4.45.0 transformers==4.44.2 @@ -13,22 +17,22 @@ pytest-html==4.1.1 pytest-xdist[psutil]==3.6.1 defusedxml==0.7.1 -auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64" +auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12" av==13.0.0 -basicsr==1.4.2 +basicsr==1.4.2; python_version < "3.12" datasets==3.0.1 easyocr==1.7.2 -facexlib==0.3.0 -librosa==0.10.2 -optimum==1.22.0 +facexlib==0.3.0; python_version < "3.12" +librosa==0.10.2; python_version < "3.12" +optimum==1.22.0; python_version < "3.12" packaging==24.1 pandas==2.2.3 protobuf==5.28.2 -pyctcdecode==0.5.0 +pyctcdecode==0.5.0; python_version < "3.12" sacremoses==0.1.1 sentencepiece==0.2.0 soundfile==0.12.1 -super-image==0.1.7 +super-image==0.1.7; python_version < "3.12" timm==1.0.8 torchaudio==2.4.1 wheel==0.44.0 @@ -36,7 +40,7 @@ PyYAML==6.0.2 kornia==0.7.3 # use latest released version once it's available -git+https://github.com/huggingface/optimum-intel.git@main +git+https://github.com/huggingface/optimum-intel.git@main; python_version < "3.12" # set 'export HF_HUB_ENABLE_HF_TRANSFER=1' to benefits from hf_transfer hf_transfer==0.1.8 diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow index 9d025397ed1fbd..6042eb8a46a9c3 100644 --- a/tests/requirements_tensorflow +++ b/tests/requirements_tensorflow @@ -4,7 +4,8 @@ pytest==7.0.1 pytest-xdist[psutil]==3.6.1 pytest-html==4.1.1 transformers==4.45.1 -tensorflow==2.17.0 +tensorflow==2.17.0; platform_system != "Darwin" or platform_machine != "x86_64" +tensorflow==2.16.2; platform_system == "Darwin" and platform_machine == "x86_64" # tensorflow-text is not available for both Windows and ARM platforms tensorflow-text==2.17.0; platform_system == "Linux" and platform_machine == "x86_64" tensorflow-hub==0.16.1