diff --git a/.github/workflows/job_pytorch_layer_tests.yml b/.github/workflows/job_pytorch_layer_tests.yml index 88b41f983f7094..50942cf331ab72 100644 --- a/.github/workflows/job_pytorch_layer_tests.yml +++ b/.github/workflows/job_pytorch_layer_tests.yml @@ -7,10 +7,6 @@ on: description: 'Machine on which the tests would run' type: string required: true - shell: - description: "shell to override the default shell settings in the runner's operating system." - type: string - required: true container: description: 'JSON to be converted to the value of the "container" configuration for the job' type: string @@ -20,12 +16,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' jobs: PyTorch_Layer_Tests: @@ -35,7 +34,7 @@ jobs: container: ${{ fromJSON(inputs.container) }} defaults: run: - shell: ${{ inputs.shell }} + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} env: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input OPENVINO_REPO: ${{ github.workspace }}/openvino @@ -55,12 +54,6 @@ jobs: name: openvino_tests path: ${{ env.INSTALL_TEST_DIR }} - - name: Download OpenVINO tokenizers extension - uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8 - with: - name: openvino_tokenizers_wheel - path: ${{ env.INSTALL_DIR }} - # Needed as ${{ github.workspace }} is not working correctly when using Docker - name: Setup Variables if: runner.os != 'Windows' @@ -98,10 +91,10 @@ jobs: sparse-checkout-cone-mode: false path: 'openvino' - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./openvino/.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} @@ -112,9 +105,6 @@ jobs: # Install the core OV wheel python3 -m pip install ${INSTALL_DIR}/tools/openvino-*.whl - # Install the core OV Tokenizers wheel - python3 -m pip install ${INSTALL_DIR}/openvino_tokenizers-*.whl - - name: Install OpenVINO Python wheels (Windows) if: runner.os == 'Windows' run: | @@ -122,10 +112,6 @@ jobs: $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }}\tools -Filter openvino-*.whl | % { $_.FullName } python3 -m pip install "$ovCoreWheelPath" - # Find and install the core OV Tokenizers wheel - $ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }} -Filter openvino_tokenizers-*.whl | % { $_.FullName } - python3 -m pip install "$ovCoreWheelPath" - - name: Install Pytorch Layer tests dependencies run: | # pytorch test requirements @@ -133,22 +119,25 @@ jobs: - name: PyTorch Layer Tests if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196 - run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + # due to CVS-152795, parallel run is not possible on Windows + run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: 
FP32 + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} - name: PyTorch torch.export Layer Tests - if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287 + if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | - python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml + python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: TEST_DEVICE: CPU TEST_PRECISION: FP32 PYTORCH_TRACING_MODE: EXPORT + PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}} - name: PyTorch torch.compile TORCHFX Layer Tests - if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' }} # Ticket: 126287 + if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287 run: | python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -m precommit_fx_backend --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml env: diff --git a/.github/workflows/job_tensorflow_layer_tests.yml b/.github/workflows/job_tensorflow_layer_tests.yml index 0801010b86bde3..e8d7b51e14c02f 100644 --- a/.github/workflows/job_tensorflow_layer_tests.yml +++ b/.github/workflows/job_tensorflow_layer_tests.yml @@ -7,10 +7,6 @@ on: description: 'Machine on which the tests would run' type: string required: true - shell: - description: "shell to override the default shell settings in the runner's operating system." - type: string - required: true container: description: 'JSON to be converted to the value of the "container" configuration for the job' type: string @@ -20,12 +16,15 @@ on: description: 'Components that are affected by changes in the commit defined by the Smart CI Action' type: string required: true + python-version: + description: 'Python version to setup. 
E.g., "3.11"' + type: string + required: true permissions: read-all env: PIP_CACHE_PATH: /mount/caches/pip/linux - PYTHON_VERSION: '3.11' jobs: TensorFlow_Layer_Tests: @@ -35,7 +34,7 @@ jobs: container: ${{ fromJSON(inputs.container) }} defaults: run: - shell: ${{ inputs.shell }} + shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }} env: DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input OPENVINO_REPO: ${{ github.workspace }}/openvino @@ -98,10 +97,10 @@ jobs: sparse-checkout-cone-mode: false path: 'openvino' - - name: Setup Python ${{ env.PYTHON_VERSION }} + - name: Setup Python ${{ inputs.python-version }} uses: ./openvino/.github/actions/setup_python with: - version: ${{ env.PYTHON_VERSION }} + version: ${{ inputs.python-version }} pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }} should-setup-pip-paths: ${{ runner.os == 'Linux' }} self-hosted-runner: ${{ runner.os == 'Linux' }} diff --git a/.github/workflows/linux_arm64.yml b/.github/workflows/linux_arm64.yml index 3506ca49846f45..e4e608f3aca6d4 100644 --- a/.github/workflows/linux_arm64.yml +++ b/.github/workflows/linux_arm64.yml @@ -173,19 +173,19 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-linux-16-cores-arm' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Docker, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-linux-16-cores-arm' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/mac.yml b/.github/workflows/mac.yml index da3224fa483ad1..20db9de1776015 100644 --- a/.github/workflows/mac.yml +++ b/.github/workflows/mac.yml @@ -276,17 +276,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'macos-13' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'macos-13' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/mac_arm64.yml b/.github/workflows/mac_arm64.yml index 331afc7266cd6a..a38179f71fb60c 100644 --- a/.github/workflows/mac_arm64.yml +++ b/.github/workflows/mac_arm64.yml @@ -275,17 +275,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'macos-13-xlarge' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'macos-13-xlarge' - shell: bash affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' 
CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/ubuntu_22.yml b/.github/workflows/ubuntu_22.yml index 8f461391f20a9f..2c20e5136cfc4e 100644 --- a/.github/workflows/ubuntu_22.yml +++ b/.github/workflows/ubuntu_22.yml @@ -305,19 +305,19 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-linux-4-cores-16gb' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ] + needs: [ Docker, Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-linux-4-cores-16gb' - shell: bash container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}' affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CPU_Functional_Tests: name: CPU functional tests diff --git a/.github/workflows/ubuntu_24.yml b/.github/workflows/ubuntu_24.yml index 6409b417a0731b..295a4dd0e2c61a 100644 --- a/.github/workflows/ubuntu_24.yml +++ b/.github/workflows/ubuntu_24.yml @@ -133,6 +133,16 @@ jobs: affected-components: ${{ needs.smart_ci.outputs.affected_components }} python-version: '3.12' + Pytorch_Layer_Tests: + name: Pytorch Layer Tests + needs: [ Docker, Build, Smart_CI ] + uses: ./.github/workflows/job_pytorch_layer_tests.yml + with: + runner: 'aks-linux-4-cores-16gb' + container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}' + affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.12' + Overall_Status: name: ci/gha_overall_status_ubuntu_24 needs: [Smart_CI, Build, Debian_Packages, Samples, Python_Unit_Tests] diff --git a/.github/workflows/windows_vs2019_release.yml b/.github/workflows/windows_vs2019_release.yml index 39cf2161525513..122fcc3c1c5021 100644 --- a/.github/workflows/windows_vs2019_release.yml +++ b/.github/workflows/windows_vs2019_release.yml @@ -404,17 +404,17 @@ jobs: uses: ./.github/workflows/job_tensorflow_layer_tests.yml with: runner: 'aks-win-8-cores-16gb' - shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' Pytorch_Layer_Tests: name: Pytorch Layer Tests - needs: [ Build, Smart_CI, Openvino_tokenizers ] + needs: [ Build, Smart_CI ] uses: ./.github/workflows/job_pytorch_layer_tests.yml with: runner: 'aks-win-8-cores-16gb' - shell: pwsh affected-components: ${{ needs.smart_ci.outputs.affected_components }} + python-version: '3.11' CXX_Unit_Tests: name: C++ unit tests diff --git a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst index d9f5e25c332984..7b135fa7ff0b14 100644 --- a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device.rst @@ -11,6 +11,7 @@ NPU Device :hidden: npu-device/remote-tensor-api-npu-plugin + npu-device/batching-on-npu-plugin The Neural Processing Unit is a low-power hardware solution, introduced with the diff --git 
a/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst new file mode 100644 index 00000000000000..379822e327c8cd --- /dev/null +++ b/docs/articles_en/openvino-workflow/running-inference/inference-devices-and-modes/npu-device/batching-on-npu-plugin.rst @@ -0,0 +1,37 @@ +NPU Plugin Batching +=============================== + + +.. meta:: + :description: OpenVINO™ NPU plugin supports batching + either by executing concurrent inferences or by + relying on native compiler support for batching. + +OpenVINO™ NPU plugin supports batching either by executing concurrent inferences or by relying on native compiler support for batching. + +First, the NPU plugin checks if the following conditions are met: + +* The batch size is on the first axis. +* All inputs and outputs have the same batch size. +* The model does not contain states. + +**If the conditions are met**, the NPU plugin attempts to compile and execute the original model with batch_size forced to 1. This approach works around current compiler limitations, while work to improve performance for batch sizes greater than one is ongoing. +If the compilation is successful, the plugin detects a difference in batch size between the original model layout (with a batch size set to N) +and the transformed/compiled layout (with a batch size set to 1). Then it executes the following steps: + +1. Internally constructs multiple command lists, one for each input. +2. Executes each command list for the proper offsets of input/output buffers. +3. Notifies the user of the completion of the inference request after all command lists have been executed. + +This concurrency-based batching mode is transparent to the application. A single inference request handles all inputs from the batch. +While performance may be lower compared to regular batching (based on native compiler support), this mode provides basic batching functionality for use either with older drivers +or when the model cannot yet be compiled with a batch size larger than one. + +**If the conditions are not met**, the NPU plugin tries to compile and execute the original model with the given +batch size (N), as it would any other regular model. + +.. note:: + + With future performance improvements and support for compiling multiple models with a batch size larger + than one, the default order will change. NPU will try first to compile and execute the original model with the + given batch size and fall back to concurrent batching if compilation fails. diff --git a/src/bindings/js/node/lib/addon.ts b/src/bindings/js/node/lib/addon.ts index 060af2cfec92e8..24c9d780aa9f7e 100644 --- a/src/bindings/js/node/lib/addon.ts +++ b/src/bindings/js/node/lib/addon.ts @@ -21,6 +21,8 @@ type elementTypeString = | 'f32' | 'string'; +type OVAny = string | number | boolean; + /** * Core represents an OpenVINO runtime Core entity. * @@ -48,7 +50,7 @@ interface Core { compileModel( model: Model, deviceName: string, - config?: { [propertyName: string]: string }, + config?: Record<string, OVAny>, ): Promise<CompiledModel>; /** * Asynchronously reads a model and creates a compiled model * @@ -67,7 +69,7 @@ interface Core { compileModel( modelPath: string, deviceName: string, - config?: { [propertyName: string]: string }, + config?: Record<string, OVAny>, ): Promise<CompiledModel>; /** * A synchronous version of {@link Core.compileModel}. 
@@ -76,7 +78,7 @@ interface Core { compileModelSync( model: Model, deviceName: string, - config?: { [propertyName: string]: string }, + config?: Record<string, OVAny>, ): CompiledModel; /** * A synchronous version of {@link Core.compileModel}. @@ -85,7 +87,7 @@ interface Core { compileModelSync( modelPath: string, deviceName: string, - config?: { [propertyName: string]: string }, + config?: Record<string, OVAny>, ): CompiledModel; /** * It returns a list of available inference devices. @@ -101,7 +103,7 @@ interface Core { * It gets the properties dedicated to device behaviour. * @param propertyName A property name. */ - getProperty(propertyName: string): string | number | boolean; + getProperty(propertyName: string): OVAny; /** * It gets the properties dedicated to device behaviour. @@ -111,7 +113,7 @@ interface Core { getProperty( deviceName: string, propertyName: string, - ): string | number | boolean; + ): OVAny; /** * It returns information on the version of device plugins. * @param deviceName A device name to identify a plugin. @@ -135,7 +137,7 @@ interface Core { importModel( modelStream: Buffer, device: string, - config?: { [key: string]: string | number | boolean }, + config?: Record<string, OVAny>, ): Promise<CompiledModel>; /** * A synchronous version of {@link Core.importModel}. @@ -144,7 +146,7 @@ interface Core { importModelSync( modelStream: Buffer, device: string, - config?: { [key: string]: string | number | boolean }, + config?: Record<string, OVAny>, ): CompiledModel; /** * It reads models from the IR / ONNX / PDPD / TF and TFLite formats. @@ -197,16 +199,13 @@ interface Core { * It sets the properties. * @param properties An object with the property name - property value pairs. */ - setProperty(properties: { [key: string]: string | number | boolean }): void; + setProperty(properties: Record<string, OVAny>): void; /** * It sets the properties for a device. * @param deviceName The name of a device. * @param properties An object with the property name - property value pairs. */ - setProperty( - deviceName: string, - properties: { [key: string]: string | number | boolean }, - ): void; + setProperty(deviceName: string, properties: Record<string, OVAny>): void; /** * It queries the device if it supports specified model with the specified * properties. @@ -218,8 +217,8 @@ interface Core { queryModel( model: Model, deviceName: string, - properties?: {[key: string]: string | number | boolean}, - ): {[key: string]: string | number | boolean}; + properties?: Record<string, OVAny>, + ): { [key: string]: string }; } interface CoreConstructor { new (): Core; } @@ -325,7 +324,7 @@ interface CompiledModel { * @param propertyName A string to get the property value. * @returns The property value. */ - getProperty(propertyName: string): string | number | boolean; + getProperty(propertyName: string): OVAny; /** * It creates an inference request object used to infer the compiled model. * @return {InferRequest} @@ -380,9 +379,7 @@ interface CompiledModel { * @param property An object with the key-value pairs. 
* (property name, property value) */ - setProperty(properties: { - [propertyName: string]: string | number | boolean; - }): void; + setProperty(properties: Record<string, OVAny>): void; } /** diff --git a/src/bindings/js/node/tests/unit/core.test.js b/src/bindings/js/node/tests/unit/core.test.js index 6cf431a38b5030..f62adda9f90f9c 100644 --- a/src/bindings/js/node/tests/unit/core.test.js +++ b/src/bindings/js/node/tests/unit/core.test.js @@ -12,11 +12,11 @@ describe('ov.Core tests', () => { before(async () => { await isModelAvailable(testModels.testModelFP32); }); - + beforeEach(() => { core = new ov.Core(); }); - + it('Core.setProperty()', () => { const tmpDir = '/tmp'; @@ -83,29 +83,29 @@ describe('ov.Core tests', () => { it('Core.queryModel() with empty parameters should throw an error', () => { assert.throws( () => core.queryModel().then(), - /'queryModel' method called with incorrect parameters./ - ) + /'queryModel' method called with incorrect parameters./, + ); }); it('Core.queryModel() with less arguments should throw an error', () => { assert.throws( - () => core.queryModel("Unexpected Argument").then(), - /'queryModel' method called with incorrect parameters./ - ) + () => core.queryModel('Unexpected Argument').then(), + /'queryModel' method called with incorrect parameters./, + ); }); it('Core.queryModel() with incorrect arguments should throw an error', () => { const model = core.readModelSync(getModelPath().xml); assert.throws( - () => core.queryModel(model, "arg1", "arg2").then(), - /'queryModel' method called with incorrect parameters./ - ) + () => core.queryModel(model, 'arg1', 'arg2').then(), + /'queryModel' method called with incorrect parameters./, + ); }); it('Core.queryModel() should have device in the result values', () => { const model = core.readModelSync(getModelPath().xml); const device = 'CPU'; - const query_model = core.queryModel(model, device); - assert(Object.values(query_model).includes(device)); + const queryModel = core.queryModel(model, device); + assert(Object.values(queryModel).includes(device)); }); }); diff --git a/src/core/src/bound_evaluate.cpp b/src/core/src/bound_evaluate.cpp index 22b91a15e3dcee..f1c6a0601eea90 100644 --- a/src/core/src/bound_evaluate.cpp +++ b/src/core/src/bound_evaluate.cpp @@ -494,14 +494,12 @@ bool ov::interval_bound_evaluator(const Node* node, vector_of_output_variants.emplace_back(output.get_element_type(), output.get_shape()); } - node->evaluate(vector_of_output_variants, input_variant); + if (!node->evaluate(vector_of_output_variants, input_variant)) { + return false; + }; TensorVector vector_of_unsqueezed_output_variants; for (const auto& output : vector_of_output_variants) { - if (!output) { - return false; - } - auto unsqueezed_shape = output.get_shape(); unsqueezed_shape.insert(unsqueezed_shape.begin(), 1); diff --git a/src/frontends/tensorflow_common/src/op/expand_dims.cpp b/src/frontends/tensorflow_common/src/op/expand_dims.cpp index b3b37ad38cc302..a40e5c9b1bc6df 100644 --- a/src/frontends/tensorflow_common/src/op/expand_dims.cpp +++ b/src/frontends/tensorflow_common/src/op/expand_dims.cpp @@ -3,7 +3,13 @@ // #include "common_op_table.hpp" +#include "helper_ops/complex_type_mark.hpp" +#include "openvino/op/constant.hpp" +#include "openvino/op/less.hpp" +#include "openvino/op/select.hpp" +#include "openvino/op/subtract.hpp" #include "openvino/op/unsqueeze.hpp" +#include "utils.hpp" using namespace std; using namespace ov::op; @@ -14,9 +20,31 @@ namespace tensorflow { namespace op { OutputVector translate_expand_dims_op(const 
NodeContext& node) { - default_op_checks(node, 2, {"ExpandDims", "EXPAND_DIMS"}); + default_op_checks(node, 2, {"ExpandDims", "EXPAND_DIMS"}, true); auto input = node.get_input(0); auto axis = node.get_input(1); + auto complex_type_mark = as_type_ptr(input.get_node_shared_ptr()); + + if (complex_type_mark) { + element::Type complex_part_type = complex_type_mark->get_complex_part_type(); + input = complex_type_mark->input_value(0); + + auto const_zero = create_same_type_const_scalar(axis, 0); + + auto is_axis_neg = make_shared(axis, const_zero); + + auto const_one = create_same_type_const_scalar(axis, 1); + auto axis_min_one = make_shared(axis, const_one); + + auto new_axis = make_shared(is_axis_neg, axis_min_one, axis); + + auto unsqueeze = make_shared(input, new_axis); + + set_node_name(node.get_name(), unsqueeze); + auto complex_result = make_shared(unsqueeze, complex_part_type); + return {complex_result}; + } + auto unsqueeze = make_shared(input, axis); set_node_name(node.get_name(), unsqueeze); return {unsqueeze}; diff --git a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp index 71623f32843eac..63adae28ddabf3 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/graph/network.hpp @@ -4,17 +4,15 @@ #pragma once -#include "openvino/runtime/threading/cpu_streams_executor.hpp" +#include "openvino/runtime/threading/istreams_executor.hpp" #include "intel_gpu/graph/topology.hpp" #include "intel_gpu/graph/program.hpp" #include "intel_gpu/graph/serialization/binary_buffer.hpp" -#include "intel_gpu/runtime/compounds.hpp" #include "intel_gpu/runtime/memory.hpp" #include "intel_gpu/runtime/engine.hpp" #include "intel_gpu/runtime/event.hpp" #include "intel_gpu/runtime/stream.hpp" -#include "intel_gpu/runtime/lru_cache.hpp" #include "intel_gpu/runtime/shape_predictor.hpp" #include "intel_gpu/plugin/variable_state.hpp" @@ -211,7 +209,7 @@ struct network { bool is_dynamic() const { return _is_dynamic; } size_t get_weights_cache_capacity() const { return _weights_cache_capacity; } - memory_pool& get_memory_pool() { + memory_pool& get_memory_pool() const { return *_memory_pool; } @@ -284,7 +282,9 @@ struct network { void dump_memory_pool(std::string dump_path, int64_t curr_iter); #ifdef GPU_DEBUG_CONFIG - int64_t iteration = 0; + mutable int64_t iteration = 0; + friend class NetworkDebugHelper; + friend class NodeDebugHelper; #endif }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.cpp b/src/plugins/intel_gpu/src/graph/debug_helper.cpp new file mode 100644 index 00000000000000..7f7071e704683e --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/debug_helper.cpp @@ -0,0 +1,526 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "debug_helper.hpp" +#include "openvino/util/file_util.hpp" + +#ifdef GPU_DEBUG_CONFIG + +#include "to_string_utils.h" +#include "loop_inst.h" +#include "condition_inst.h" +#include "program_dump_graph.h" + +#include +#include +#include + +namespace cldnn { + +namespace { + +float convert_element(int64_t i) { return static_cast(i); } +float convert_element(int32_t i) { return static_cast(i); } + +float convert_element(float f) { return f; } + +float convert_element(ov::float16 h) { return static_cast(h); } + +size_t get_x_pitch(const layout& layout) { + try { + auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0)); + auto tensor_x1 = tensor(batch(0), 
feature(0), spatial(1, 0, 0, 0)); + auto x0 = layout.get_linear_offset(tensor_x0); + auto x1 = layout.get_linear_offset(tensor_x1); + return (x1 - x0); + } catch (...) { + // When spatial size of x=0, x_pitch is meaningless + return 0; + } +} + +template +void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { + auto&& size = mem->get_layout().get_tensor(); + + GPU_DEBUG_GET_INSTANCE(debug_config); + auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); + tensor tmp_size(size); + tmp_size.batch[0] = batch_size; + if (tmp_size == size) { + file_stream << "shape: " << size.to_string() << " "; + file_stream << "(count: " << size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" + << (dump_raw ? " raw data" : "") << std::endl; + } else { + file_stream << "shape: " << tmp_size.to_string() << " "; + file_stream << "(count: " << tmp_size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) + << ", original shape: " << size.to_string() << ")" + << (dump_raw ? " raw data" : "") << std::endl; + } + + if (size.count() == 0) { + file_stream << "Empty buffer" << std::endl; + return; + } + + mem_lock lock(mem, stream); + auto mem_ptr = lock.data(); + auto x_pitch = get_x_pitch(mem->get_layout()); + std::stringstream buffer; + + if (!dump_raw) { + for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) { + for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) { + for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) { + for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) { + for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) { + for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) { + cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w)); + size_t input_it = mem->get_layout().get_linear_offset(t); + + for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) { + buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl; + } + } + } + } + } + } + } + } else { + for (size_t i = 0; i < lock.size(); ++i) { + buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl; + } + } + file_stream << buffer.str(); +} + +void unpack(cldnn::data_types type, uint8_t input, int8_t &v0, int8_t &v1) { + if (type == cldnn::data_types::i4) { + char s_bit = (input & 0x08); + char mask = s_bit > 0 ? 0xF0 : 0x00; + v0 = (input & 0x0F) | mask; + + input >>= 4; + s_bit = (input & 0x08); + mask = s_bit > 0 ? 0xF0 : 0x00; + v1 = (input & 0x0F) | mask; + } else if (type == cldnn::data_types::u4) { + v0 = input & 0x0F; + v1 = input >> 4; + } else { + OPENVINO_ASSERT(false, "not supported unpacking"); + } +} + +void dump_i4u4(cldnn::data_types type, memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { + auto&& size = mem->get_layout().get_tensor(); + + GPU_DEBUG_GET_INSTANCE(debug_config); + auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); + tensor tmp_size(size); + tmp_size.batch[0] = batch_size; + if (tmp_size == size) { + file_stream << "shape: " << size.to_string() << " "; + file_stream << "(count: " << size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" + << (dump_raw ? 
" raw data" : "") << std::endl; + } else { + file_stream << "shape: " << tmp_size.to_string() << " "; + file_stream << "(count: " << tmp_size.count() + << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) + << ", original shape: " << size.to_string() << ")" + << (dump_raw ? " raw data" : "") << std::endl; + } + + if (size.count() == 0) { + file_stream << "Empty buffer" << std::endl; + return; + } + + mem_lock lock(mem, stream); + auto mem_ptr = lock.data(); + std::stringstream buffer; + + if (dump_raw) { + for (size_t i = 0; i < lock.size(); ++i) { + int8_t v0, v1; + unpack(type, mem_ptr[i], v0, v1); + buffer << std::fixed << std::setprecision(6) << static_cast(v0) << std::endl; + buffer << std::fixed << std::setprecision(6) << static_cast(v1) << std::endl; + } + } else { + std::cout << __func__ << " supports raw dump only" << std::endl; + } + file_stream << buffer.str(); +} + +void log_memory_to_file(memory::ptr mem, layout data_layout, stream& stream, std::string layerName, bool dump_raw) { + std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl; + GPU_DEBUG_GET_INSTANCE(debug_config); + std::string filename = debug_config->get_name_for_dump(layerName); + filename = debug_config->dump_layers_path + filename + ".txt"; + std::ofstream file_stream(filename); + if (!mem) { + file_stream << "Empty" << std::endl; + return; + } + + // Reinterpret buffer to represent actual data layout + auto actual_mem = mem->get_engine()->reinterpret_buffer(*mem, data_layout); + + auto mem_dt = actual_mem->get_layout().data_type; + if (mem_dt == cldnn::data_types::f32) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::f16) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i64) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i32) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::u8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::u8) + dump(actual_mem, stream, file_stream, dump_raw); + else if (mem_dt == cldnn::data_types::i4 || mem_dt == cldnn::data_types::u4) + dump_i4u4(mem_dt, actual_mem, stream, file_stream, dump_raw); + else + std::cout << "Dump for this data type is not supported: " << dt_to_str(mem_dt) << std::endl; +} + +} // namespace + +static std::string get_file_path_for_binary_dump(cldnn::layout layout, std::string name) { + std::string filename; + std::string data_type = ov::element::Type(layout.data_type).get_type_name(); + std::string format = layout.format.to_string(); + std::string tensor; + auto dims = layout.get_dims(); + for (size_t r = 0 ; r < layout.get_rank() ; r++) { + tensor += ("_" + to_string(dims[r])); + } + +#ifdef GPU_DEBUG_CONFIG + GPU_DEBUG_GET_INSTANCE(debug_config); + std::string layer_name = debug_config->get_name_for_dump(name); + filename = debug_config->dump_layers_path + layer_name + + "__" + data_type + "_" + tensor + "__" + format + ".bin"; +#endif + return filename; +} + +NodeDebugHelper::NodeDebugHelper(const primitive_inst& inst) + : m_inst(inst) + , m_stream(inst.get_network().get_stream()) + , m_network(inst.get_network()) + , m_program(inst.get_network().get_program().get()) + , m_iter(m_network.iteration) { + // Load binary dump for input layers + if (!debug_config->load_layers_raw_dump.empty()) { + const std::string 
layer_name = m_inst.id(); + auto files = debug_config->get_filenames_for_matched_layer_loading_binaries(layer_name); + if (!files.empty()) { + if (m_inst.is_input()) { + // Loading binary dumps for output tensors of input-layers : only one output exists or index(dstN) exists + auto dump_file = debug_config->get_matched_from_filelist(files, "_dst0__"); + OPENVINO_ASSERT((files.size() == 1 || dump_file.length() != 0), "Unexpected binary dump for input layer"); + + OPENVINO_ASSERT(files.size() == m_inst.outputs_memory_count(), "Mis-match dump file count"); + + for (size_t i = 0; i < m_inst.outputs_memory_count(); i++) { + auto dump_file = files[0]; + if (files.size() > 1 || m_inst.outputs_memory_count() != 1) { + std::string pattern = "_dst" + std::to_string(i) + "__"; + dump_file = debug_config->get_matched_from_filelist(files, pattern); + } + OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_dst[N]__' for binary dump"); + GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for " << layer_name << std::endl; + + std::vector bin = ov::util::load_binary(dump_file); + OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); + + auto output_mem = m_inst.output_memory_ptr(i); + OPENVINO_ASSERT(output_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name + + "\n Expected size : " + to_string(output_mem->size()) + ", Binary : " + to_string(bin.size())); + + output_mem->copy_from(m_stream, static_cast(&bin[0]), true); + } + } else { + auto check_dst = debug_config->get_matched_from_filelist(files, "_dst0__"); + OPENVINO_ASSERT(check_dst.length() == 0, "Expected to load binaries for inputs of " + layer_name); + + // Loading input tensors for any layer + auto dump_file = debug_config->get_matched_from_filelist(files, "_src0__"); + OPENVINO_ASSERT(dump_file.length() != 0, "Could not find expected pattern '_src[N]__' for binary dump input : " + layer_name); + + for (size_t i = 0; i < m_inst.dependencies().size(); i++) { + auto dump_file = files[0]; + if (files.size() > 1 || m_inst.dependencies().size() != 1) { + std::string pattern = "_src" + std::to_string(i) + "__"; + dump_file = debug_config->get_matched_from_filelist(files, pattern); + } + if (dump_file.length() == 0) { + GPU_DEBUG_COUT << " Skip loading for input(" << i << ") of " << layer_name << std::endl; + continue; + } + OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_src[N]__' for binary dump input"); + GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for input(" << i << ") of " << layer_name << std::endl; + + std::vector bin = ov::util::load_binary(dump_file); + OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); + + auto input_mem = m_inst.dep_memory_ptr(i); + if (input_mem->size() != bin.size()) { + std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name + << " " << input_mem->size() << " / " << bin.size() << std::endl; + bin.resize(input_mem->size()); + } + + input_mem->copy_from(m_stream, static_cast(&bin[0]), true); + } + } + } + } + + // Dump input buffers of 'inst' + if (debug_config->dump_layers_path.length() > 0) { + const std::string layer_name = inst.id(); + + if (debug_config->is_target_iteration(m_iter) && + debug_config->dump_layers_dst_only == 0 && debug_config->is_layer_for_dumping(layer_name)) { + std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" 
+ layer_name + ":"; + for (size_t i = 0; i < m_inst.dependencies().size(); i++) { + std::string name = get_file_prefix() + layer_name + "_src" + std::to_string(i); + auto input_mem = m_inst.dep_memory_ptr(i); + if (input_mem == nullptr) { + GPU_DEBUG_COUT << " input_mem_" << i << " is nullptr. Nothing to dump." << std::endl; + continue; + } + + auto dep = m_inst.dependencies().at(i); + auto input_layout = dep.first->get_output_layout(dep.second); + GPU_DEBUG_IF(debug_config->dump_layers_binary) { + // Binary dump : raw + auto filename = get_file_path_for_binary_dump(input_layout, name); + + mem_lock lock(input_mem, m_stream); + ov::util::save_binary(filename, lock.data(), input_mem->size()); + GPU_DEBUG_COUT << " Dump layer src : " << layer_name << " to " << filename << std::endl; + debug_str_for_bin_load += (filename + ","); + } else { + log_memory_to_file(input_mem, + input_layout, + m_stream, + name, + debug_config->dump_layers_raw); + } + } + + if (debug_config->dump_layers_binary && !inst.is_input()) { + debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; + GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl; + } + } + } +} + + +NodeDebugHelper::~NodeDebugHelper() { + // Dump output buffers of 'inst' + if (debug_config->dump_layers_path.length() > 0) { + m_stream.finish(); + const std::string layer_name = m_inst.id(); + + GPU_DEBUG_IF(debug_config->is_target_iteration(m_iter) && + debug_config->is_layer_for_dumping(layer_name, m_inst.is_output(), m_inst.is_input())) { + std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + + layer_name + ":"; + for (size_t i = 0; i < m_inst.outputs_memory_count(); i++) { + std::string name = get_file_prefix() + "_dst" + std::to_string(i); + auto output_mem = m_inst.output_memory_ptr(i); + if (output_mem == nullptr) { + GPU_DEBUG_COUT << " output_mem is nullptr. Nothing to dump." 
<< std::endl; + continue; + } + + GPU_DEBUG_IF(debug_config->dump_layers_binary) { + // Binary dump : raw + auto output_layout = m_inst.get_output_layout(i); + auto filename = get_file_path_for_binary_dump(output_layout, name); + + mem_lock lock(output_mem, m_stream); + ov::util::save_binary(filename, lock.data(), output_mem->size()); + GPU_DEBUG_COUT << " Dump layer dst : " << layer_name << " to " << filename << std::endl; + debug_str_for_bin_load += (filename + ","); + } else { + // Text dump + log_memory_to_file(output_mem, m_inst.get_output_layout(i), m_stream, name, debug_config->dump_layers_raw); + } + } + + GPU_DEBUG_IF(debug_config->dump_layers_binary && m_inst.is_input()) { + debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; + GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;; + } + } + } +} + +NetworkDebugHelper::NetworkDebugHelper(const network& net) + : m_network(net) + , m_iter(net.iteration) { + auto net_id = m_network.get_id(); + GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) { + auto& iters = debug_config->dump_memory_pool_iters; + if (iters.empty() || iters.find(m_iter) != iters.end()) { + GPU_DEBUG_COUT << "============================================================================" << std::endl; + GPU_DEBUG_COUT << "Start network execution (net_id : " << net_id << ", iter :" << m_iter << ")" << std::endl; + if (m_iter == 0 && net_id > 0) { + dump_memory_pool(debug_config->dump_memory_pool_path, m_iter); + GPU_DEBUG_COUT << "============================================================================" << std::endl; + } + } + } else { + GPU_DEBUG_TRACE << "============================================================================" << std::endl; + GPU_DEBUG_TRACE << "Start network execution (net_id : " << net_id << ", iter :" << m_iter << ")" << std::endl; + } + + if (debug_config->list_layers == 1) { + for (auto& inst : m_network._exec_order) { + GPU_DEBUG_COUT << inst->id() << std::endl; + if (inst->get_node().is_type()) { + auto& loop_node = inst->get_node().as(); + for (auto& prim : loop_node.get_body_program()->get_processing_order()) { + GPU_DEBUG_COUT << "\t" << prim->id() << std::endl; + } + } else if (inst->get_node().is_type()) { + auto& cond_node = inst->get_node().as(); + GPU_DEBUG_COUT << "* Branch_True" << std::endl; + for (auto& prim : cond_node.get_branch_true().inner_program->get_processing_order()) { + GPU_DEBUG_COUT << "\t" << prim->id() << std::endl; + } + GPU_DEBUG_COUT << "* Branch_False" << std::endl; + for (auto& prim : cond_node.get_branch_false().inner_program->get_processing_order()) { + GPU_DEBUG_COUT << "\t" << prim->id() << std::endl; + } + } + } + + if (!m_network.is_internal()) + exit(0); + } +} + +NetworkDebugHelper::~NetworkDebugHelper() { + auto prog = m_network.get_program().get(); + auto net_id = m_network.get_id(); + // print '-data_shape' option for benchmark_app + if (debug_config->print_input_data_shapes == 1) { + std::stringstream data_shape_str; + auto add_string = [&data_shape_str](std::string str) { + data_shape_str << ((data_shape_str.rdbuf()->in_avail() == 0) ? " -data_shape " : ",") << str; + }; + + for (auto& inst : m_network._exec_order) { + auto name = inst->id(); + auto pos = name.find(':'); + auto type = name.substr(0, pos); + name.erase(0, pos + 1); + if (inst->is_input() && type == "parameter") { + add_string(name + inst->get_output_layout().get_partial_shape().to_string()); + } + } + + GPU_DEBUG_COUT << "[program:" << std::setw(2) << ((prog != nullptr) ? 
prog->get_id() : 0) + << "|network:" << std::setw(2) << net_id << "|iter:" << std::setw(4) << m_iter << "] benchmark_app cmd: " + << data_shape_str.str() << std::endl; + } + + if (!debug_config->dump_graphs.empty() && debug_config->is_target_iteration(m_iter)) { + auto get_fixed_str = [](int value, int length = 2) -> std::string { + std::ostringstream ss; + ss << std::setw(length) << std::setfill('0') << std::to_string(value); + return ss.str(); + }; + std::string path = get_dir_path(m_network.get_config()); + if (!path.empty()) { + std::ofstream ofs(path + "cldnn_program_exec_p" + get_fixed_str(prog->get_id()) + "_n" + get_fixed_str(net_id) + + "_" + get_fixed_str(m_iter, 5) + ".graph"); + dump_graph_init(ofs, *prog, [this](const primitive_id& id) -> std::shared_ptr { + return m_network.get_primitive(id); + }); + } + } + + if (debug_config->dump_memory_pool > 0) { + auto& iters = debug_config->dump_memory_pool_iters; + if (iters.empty() || iters.find(m_iter) != iters.end()) { + dump_memory_pool(debug_config->dump_memory_pool_path, m_iter); + GPU_DEBUG_COUT << "============================================================================" << std::endl; + } + } + + m_network.iteration++; +} + +void NetworkDebugHelper::dump_memory_pool(std::string dump_path, int64_t curr_iter) const { + m_network.get_memory_pool().dump(m_network.get_id(), curr_iter, dump_path); + auto get_constants_mem_size = [&](allocation_type type) -> size_t { + size_t mem_size = 0; + for (auto& prim : m_network._primitives) { + if (prim.second->get_node().is_constant()) { + for (size_t i = 0; i < prim.second->outputs_memory_count(); i++) { + if (prim.second->output_memory_ptr(i)->get_allocation_type() == type) + mem_size += prim.second->output_memory_ptr(i)->size(); + } + } + } + return mem_size; + }; + auto get_variables_mem_size = [&](allocation_type type) -> size_t { + size_t mem_size = 0; + for (auto& var : m_network.get_variables()) { + if (var.second->get_memory() && var.second->get_memory()->get_allocation_type() == type) + mem_size += var.second->get_actual_mem_size(); + } + return mem_size; + }; + auto get_mb_size = [&](int64_t size) -> std::string { + if (size == 0) return "0 MB"; + return std::to_string(static_cast(size) / (1024 * 1024)) + " MB"; + }; + int64_t usm_host_const_mem_size = get_constants_mem_size(allocation_type::usm_host); + int64_t usm_device_const_mem_size = get_constants_mem_size(allocation_type::usm_device); + int64_t usm_host_var_mem_size = get_variables_mem_size(allocation_type::usm_host); + int64_t usm_device_var_mem_size = get_variables_mem_size(allocation_type::usm_device); + int64_t host_mem_size = m_network.get_engine().get_used_device_memory(allocation_type::usm_host); + int64_t device_mem_size = m_network.get_engine().get_used_device_memory(allocation_type::usm_device); + int64_t usm_host_mem_pool_size = m_network.get_memory_pool().get_total_mem_pool_size(allocation_type::usm_host); + int64_t usm_host_etc_size = host_mem_size - usm_host_mem_pool_size + - usm_host_const_mem_size - usm_host_var_mem_size; + int64_t usm_device_mem_pool_size = m_network.get_memory_pool().get_total_mem_pool_size(allocation_type::usm_device); + int64_t usm_device_etc_size = device_mem_size - usm_device_mem_pool_size + - usm_device_const_mem_size - usm_device_var_mem_size; + GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; + GPU_DEBUG_COUT << "Memory statistics for (net_id:" << m_network.get_id() << ", iter:" << curr_iter << ")" << std::endl; + 
GPU_DEBUG_COUT << " Total host mem size : " << get_mb_size(host_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_host_mem_pool_size) << std::endl; + GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_host_const_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_host_var_mem_size) << std::endl; + GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_host_etc_size) << std::endl; + GPU_DEBUG_COUT << " Total device mem size : " << get_mb_size(device_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_device_mem_pool_size) << std::endl; + GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_device_const_mem_size) << std::endl; + GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_device_var_mem_size) << std::endl; + GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_device_etc_size) << std::endl; + GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; +} + +} // namespace cldnn + +#endif // GPU_DEBUG_CONFIG diff --git a/src/plugins/intel_gpu/src/graph/debug_helper.hpp b/src/plugins/intel_gpu/src/graph/debug_helper.hpp new file mode 100644 index 00000000000000..c7c6bd006af1db --- /dev/null +++ b/src/plugins/intel_gpu/src/graph/debug_helper.hpp @@ -0,0 +1,69 @@ +// Copyright (C) 2018-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include "intel_gpu/graph/network.hpp" +#include "intel_gpu/graph/program.hpp" +#include "intel_gpu/runtime/stream.hpp" +#include "intel_gpu/runtime/debug_configuration.hpp" +#include "primitive_inst.h" + +namespace cldnn { + +#ifdef GPU_DEBUG_CONFIG + +class NodeDebugHelper { +public: + NodeDebugHelper(const primitive_inst& inst); + ~NodeDebugHelper(); + +private: + std::string get_iteration_prefix() { + if (m_iter < 0) + return std::string(""); + return std::to_string(m_iter) + "_"; + } + + std::string get_file_prefix() { + auto prog_id = ((m_program != nullptr) ? m_program->get_id() : 0); + auto net_id = m_network.get_id(); + + return "program" + std::to_string(prog_id) + "_network" + std::to_string(net_id) + "_" + get_iteration_prefix() + m_inst.id(); + } + + + const primitive_inst& m_inst; + stream& m_stream; + const network& m_network; + const program* m_program; + const size_t m_iter; + + const debug_configuration* debug_config = cldnn ::debug_configuration ::get_instance(); +}; + +class NetworkDebugHelper { +public: + NetworkDebugHelper(const network& net); + ~NetworkDebugHelper(); + +private: + void dump_memory_pool(std::string dump_path, int64_t curr_iter) const; + const network& m_network; + const size_t m_iter; + + const debug_configuration* debug_config = cldnn ::debug_configuration ::get_instance(); +}; + +#define NETWORK_DEBUG(net) NetworkDebugHelper __network_debug_helper(net) +#define NODE_DEBUG(inst) NodeDebugHelper __node_debug_helper(inst) + +#else + +#define NETWORK_DEBUG(...) +#define NODE_DEBUG(...) 
+ +#endif // GPU_DEBUG_CONFIG + +} // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h b/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h index 075422a4196b38..cf5111de6b247e 100644 --- a/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h +++ b/src/plugins/intel_gpu/src/graph/include/program_dump_graph.h @@ -14,6 +14,6 @@ std::string get_dir_path(const ExecutionConfig& config); void dump_graph_optimized(std::ofstream&, const program&); void dump_graph_processing_order(std::ofstream&, const program&); void dump_graph_init(std::ofstream&, const program&, - std::function(const primitive_id&)> get_primitive_inst = nullptr); + std::function(const primitive_id&)> get_primitive_inst = nullptr); void dump_graph_info(std::ofstream&, const program&); } // namespace cldnn diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp index ae64846a0c9b5e..57f2fb41c7cc06 100644 --- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp +++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp @@ -1089,11 +1089,13 @@ format layout_optimizer::get_expected_format(quantize_node const& node) { auto use_onednn_impls = _optimization_attributes.use_onednn_impls; if (use_onednn_impls) { - auto& user = node.get_users().front(); - if (user != nullptr && user->get_preferred_input_fmt(user->get_dependency_index(node)) != format::any) { - expected = user->get_preferred_input_fmt(user->get_dependency_index(node)); - } else { - expected = format::any; + expected = format::any; + auto& users = node.get_users(); + if (users.size() != 0) { + auto& user = users.front(); + if (user != nullptr && user->get_preferred_input_fmt(user->get_dependency_index(node)) != format::any) { + expected = user->get_preferred_input_fmt(user->get_dependency_index(node)); + } } } else if (only_gemm_users(node)) { // TODO: Gemm is not supporting fsv layouts diff --git a/src/plugins/intel_gpu/src/graph/network.cpp b/src/plugins/intel_gpu/src/graph/network.cpp index 92d62782828d78..8f0e97dd51ee12 100644 --- a/src/plugins/intel_gpu/src/graph/network.cpp +++ b/src/plugins/intel_gpu/src/graph/network.cpp @@ -4,7 +4,6 @@ #include "intel_gpu/plugin/variable_state.hpp" #include "intel_gpu/primitives/read_value.hpp" -#include "openvino/util/file_util.hpp" #include "intel_gpu/primitives/data.hpp" #include "intel_gpu/primitives/mutable_data.hpp" @@ -31,13 +30,10 @@ #include "deconvolution_inst.h" #include "mutable_data_inst.h" #include "condition_inst.h" -#include "loop_inst.h" -#include "assign_inst.h" #include "read_value_inst.h" #include "reshape_inst.h" #include "kv_cache_inst.h" #include "program_helpers.h" -#include "to_string_utils.h" #include "program_dump_graph.h" #include @@ -51,8 +47,8 @@ #include #include +#include "debug_helper.hpp" #ifdef GPU_DEBUG_CONFIG -#include #include #include #include @@ -60,7 +56,6 @@ #endif namespace cldnn { - namespace { #ifdef GPU_DEBUG_CONFIG @@ -143,179 +138,6 @@ void dump_perf_data_raw(std::string dump_path, const std::list(i); } -float convert_element(int32_t i) { return static_cast(i); } - -float convert_element(float f) { return f; } - -float convert_element(ov::float16 h) { return static_cast(h); } - -size_t get_x_pitch(const layout& layout) { - try { - auto tensor_x0 = tensor(batch(0), feature(0), spatial(0, 0, 0, 0)); - auto tensor_x1 = tensor(batch(0), feature(0), spatial(1, 0, 0, 0)); - auto x0 = layout.get_linear_offset(tensor_x0); - auto x1 = layout.get_linear_offset(tensor_x1); 
- return (x1 - x0); - } catch (...) { - // When spatial size of x=0, x_pitch is meaningless - return 0; - } -} - -template -void dump(memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { - auto&& size = mem->get_layout().get_tensor(); - - GPU_DEBUG_GET_INSTANCE(debug_config); - auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); - tensor tmp_size(size); - tmp_size.batch[0] = batch_size; - if (tmp_size == size) { - file_stream << "shape: " << size.to_string() << " "; - file_stream << "(count: " << size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" - << (dump_raw ? " raw data" : "") << std::endl; - } else { - file_stream << "shape: " << tmp_size.to_string() << " "; - file_stream << "(count: " << tmp_size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) - << ", original shape: " << size.to_string() << ")" - << (dump_raw ? " raw data" : "") << std::endl; - } - - if (size.count() == 0) { - file_stream << "Empty buffer" << std::endl; - return; - } - - mem_lock lock(mem, stream); - auto mem_ptr = lock.data(); - auto x_pitch = get_x_pitch(mem->get_layout()); - std::stringstream buffer; - - if (!dump_raw) { - for (cldnn::tensor::value_type g = 0; g < size.group[0]; ++g) { - for (cldnn::tensor::value_type b = 0; b < batch_size; ++b) { - for (cldnn::tensor::value_type f = 0; f < size.feature[0]; ++f) { - for (cldnn::tensor::value_type w = 0; w < size.spatial[3]; ++w) { - for (cldnn::tensor::value_type z = 0; z < size.spatial[2]; ++z) { - for (cldnn::tensor::value_type y = 0; y < size.spatial[1]; ++y) { - cldnn::tensor t(cldnn::group(g), cldnn::batch(b), cldnn::feature(f), cldnn::spatial(0, y, z, w)); - size_t input_it = mem->get_layout().get_linear_offset(t); - - for (cldnn::tensor::value_type x = 0; x < size.spatial[0]; ++x, input_it += x_pitch) { - buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[input_it]) << std::endl; - } - } - } - } - } - } - } - } else { - for (size_t i = 0; i < lock.size(); ++i) { - buffer << std::fixed << std::setprecision(6) << convert_element(mem_ptr[i]) << std::endl; - } - } - file_stream << buffer.str(); -} - -void unpack(cldnn::data_types type, uint8_t input, int8_t &v0, int8_t &v1) { - if (type == cldnn::data_types::i4) { - char s_bit = (input & 0x08); - char mask = s_bit > 0 ? 0xF0 : 0x00; - v0 = (input & 0x0F) | mask; - - input >>= 4; - s_bit = (input & 0x08); - mask = s_bit > 0 ? 0xF0 : 0x00; - v1 = (input & 0x0F) | mask; - } else if (type == cldnn::data_types::u4) { - v0 = input & 0x0F; - v1 = input >> 4; - } else { - OPENVINO_ASSERT(false, "not supported unpacking"); - } -} - -void dump_i4u4(cldnn::data_types type, memory::ptr mem, stream& stream, std::ofstream& file_stream, bool dump_raw) { - auto&& size = mem->get_layout().get_tensor(); - - GPU_DEBUG_GET_INSTANCE(debug_config); - auto batch_size = std::max(std::min(debug_config->dump_layers_limit_batch, size.batch[0]), 1); - tensor tmp_size(size); - tmp_size.batch[0] = batch_size; - if (tmp_size == size) { - file_stream << "shape: " << size.to_string() << " "; - file_stream << "(count: " << size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) << ")" - << (dump_raw ? 
" raw data" : "") << std::endl; - } else { - file_stream << "shape: " << tmp_size.to_string() << " "; - file_stream << "(count: " << tmp_size.count() - << ", original format: " << cldnn::fmt_to_str(mem->get_layout().format) - << ", original shape: " << size.to_string() << ")" - << (dump_raw ? " raw data" : "") << std::endl; - } - - if (size.count() == 0) { - file_stream << "Empty buffer" << std::endl; - return; - } - - mem_lock lock(mem, stream); - auto mem_ptr = lock.data(); - std::stringstream buffer; - - if (dump_raw) { - for (size_t i = 0; i < lock.size(); ++i) { - int8_t v0, v1; - unpack(type, mem_ptr[i], v0, v1); - buffer << std::fixed << std::setprecision(6) << static_cast(v0) << std::endl; - buffer << std::fixed << std::setprecision(6) << static_cast(v1) << std::endl; - } - } else { - std::cout << __func__ << " supports raw dump only" << std::endl; - } - file_stream << buffer.str(); -} - -void log_memory_to_file(memory::ptr mem, layout data_layout, stream& stream, std::string layerName, bool dump_raw) { - std::cout << "Dump " << (dump_raw ? "raw " : "") << layerName << std::endl; - GPU_DEBUG_GET_INSTANCE(debug_config); - std::string filename = debug_config->get_name_for_dump(layerName); - filename = debug_config->dump_layers_path + filename + ".txt"; - std::ofstream file_stream(filename); - if (!mem) { - file_stream << "Empty" << std::endl; - return; - } - - // Reinterpret buffer to represent actual data layout - auto actual_mem = mem->get_engine()->reinterpret_buffer(*mem, data_layout); - - auto mem_dt = actual_mem->get_layout().data_type; - if (mem_dt == cldnn::data_types::f32) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::f16) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i64) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i32) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::u8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::u8) - dump(actual_mem, stream, file_stream, dump_raw); - else if (mem_dt == cldnn::data_types::i4 || mem_dt == cldnn::data_types::u4) - dump_i4u4(mem_dt, actual_mem, stream, file_stream, dump_raw); - else - std::cout << "Dump for this data type is not supported: " << dt_to_str(mem_dt) << std::endl; -} - void wait_for_the_turn() { GPU_DEBUG_GET_INSTANCE(debug_config); bool need_to_wait; @@ -336,7 +158,6 @@ void wait_for_the_turn() { #else void dump_perf_data_raw(std::string, const std::list>&) {} -void log_memory_to_file(memory::ptr, layout, stream&, std::string, bool dump_raw) {} void wait_for_the_turn() {} #endif } // namespace @@ -346,25 +167,6 @@ static uint32_t get_unique_net_id() { return ++id_gen; } -static std::string get_file_path_for_binary_dump(cldnn::layout layout, std::string name) { - std::string filename; - std::string data_type = ov::element::Type(layout.data_type).get_type_name(); - std::string format = layout.format.to_string(); - std::string tensor; - auto dims = layout.get_dims(); - for (size_t r = 0 ; r < layout.get_rank() ; r++) { - tensor += ("_" + to_string(dims[r])); - } - -#ifdef GPU_DEBUG_CONFIG - GPU_DEBUG_GET_INSTANCE(debug_config); - std::string layer_name = debug_config->get_name_for_dump(name); - filename = debug_config->dump_layers_path + layer_name - + "__" + data_type + "_" + tensor + "__" + format + ".bin"; 
-#endif
-    return filename;
-}
-
 /*
 Network will always have net_id = 0 when it will be cldnn internal micronetwork (created i.e by propagate_constants
 opt pass).
@@ -939,28 +741,10 @@ std::map<primitive_id, network_output> network::execute(const std::vector<event::ptr>& events) {
     OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "NetworkImpl::Execute");
-    int64_t curr_iter = -1;
-    GPU_DEBUG_GET_INSTANCE(debug_config);
-#ifdef GPU_DEBUG_CONFIG
-    curr_iter = iteration;
-#endif
+    NETWORK_DEBUG(*this);
     // Wait for previous execution completion
     reset_execution(false);
-    GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) {
-        auto& iters = debug_config->dump_memory_pool_iters;
-        if (iters.empty() || iters.find(curr_iter) != iters.end()) {
-            GPU_DEBUG_COUT << "============================================================================" << std::endl;
-            GPU_DEBUG_COUT << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
-            if (curr_iter == 0 && get_id() > 0) {
-                dump_memory_pool(debug_config->dump_memory_pool_path, curr_iter);
-                GPU_DEBUG_COUT << "============================================================================" << std::endl;
-            }
-        }
-    } else {
-        GPU_DEBUG_TRACE << "============================================================================" << std::endl;
-        GPU_DEBUG_TRACE << "Start network execution (net_id : " << get_id() << ", iter :" << curr_iter << ")" << std::endl;
-    }
     std::vector<memory::ptr> in_out_mem;
     auto is_surface_lock_check_needed = [&](const shared_mem_type& shared_mem_type) {
@@ -996,33 +780,6 @@ void network::execute_impl(const std::vector<event::ptr>& events) {
     auto surf_lock = surfaces_lock::create(get_engine().type(), in_out_mem, get_stream());
     set_arguments();
-    GPU_DEBUG_IF(debug_config->list_layers == 1) {
-        for (auto& inst : _exec_order) {
-            GPU_DEBUG_COUT << inst->id() << std::endl;
-            if (inst->get_node().is_type<loop>()) {
-                auto& loop_node = inst->get_node().as<loop>();
-                for (auto& prim : loop_node.get_body_program()->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-            } else if (inst->get_node().is_type<condition>()) {
-                auto& cond_node = inst->get_node().as<condition>();
-                GPU_DEBUG_COUT << "* Branch_True" << std::endl;
-                for (auto& prim : cond_node.get_branch_true().inner_program->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-                GPU_DEBUG_COUT << "* Branch_False" << std::endl;
-                for (auto& prim : cond_node.get_branch_false().inner_program->get_processing_order()) {
-                    GPU_DEBUG_COUT << "\t" << prim->id() << std::endl;
-                }
-            }
-        }
-        if (!is_internal()) exit(0);
-    }
-    auto get_iteration_prefix = [](int64_t iter) {
-        if (iter < 0)
-            return std::string("");
-        return std::to_string(iter) + "_";
-    };
     // This extra flush command is needed for dynamic models in both cases of out_of_order / in_order operating mode
     // since it reduces `bubbles` number in pipeline and GPU's idle time by timely flushing new kernels to device.
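
Note: the scattered GPU_DEBUG_IF blocks removed in this file are folded into the new NETWORK_DEBUG(*this) / NODE_DEBUG(*inst) helpers. The snippet below is only an illustrative sketch of how such a macro can be backed by a RAII object; the class name, members, and output format are assumptions for illustration, not the actual helper added by this patch.

// Hypothetical sketch only: class name, members and messages are illustrative,
// not the actual debug helper introduced by this patch.
#include <cstdint>
#include <iostream>

// Minimal stand-in for cldnn::network with just the state the helper touches.
struct network_stub {
    uint32_t net_id = 0;
    int64_t iteration = 0;
    void dump_memory_pool() const { std::cout << "  (memory pool dump would go here)\n"; }
};

// RAII debug scope: the constructor prints the "start execution" banner that the removed
// GPU_DEBUG_IF block used to print, and the destructor performs the end-of-iteration
// bookkeeping (memory-pool dump, iteration counter) formerly done inline in execute_impl().
class NetworkDebugHelper {
public:
    explicit NetworkDebugHelper(network_stub& net) : m_net(net) {
        std::cout << "============================================================\n"
                  << "Start network execution (net_id : " << m_net.net_id
                  << ", iter : " << m_net.iteration << ")\n";
    }
    ~NetworkDebugHelper() {
        m_net.dump_memory_pool();
        ++m_net.iteration;
    }
private:
    network_stub& m_net;
};

// The macro used in execute() can then expand to a scoped helper instance.
#define NETWORK_DEBUG(net) NetworkDebugHelper network_debug_helper_(net)

int main() {
    network_stub net;
    {
        NETWORK_DEBUG(net);  // banner printed here; dump + iteration bump when the scope ends
    }
    return net.iteration == 1 ? 0 : 1;
}

With this shape, execute() only needs the single NETWORK_DEBUG(*this) statement shown in the hunk above, and the banner printing and per-iteration bookkeeping live in one place instead of being spread through execute() and execute_impl().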
@@ -1033,233 +790,43 @@ void network::execute_impl(const std::vector& events) { size_t executed_prims = 0; for (auto& inst : _exec_order) { - // Load binary dump for input layers - GPU_DEBUG_IF(!debug_config->load_layers_raw_dump.empty()) { - const std::string layer_name = inst->id(); - auto files = debug_config->get_filenames_for_matched_layer_loading_binaries(layer_name); - if (!files.empty()) { - if (inst->is_input()) { - // Loading binary dumps for output tensors of input-layers : only one output exists or index(dstN) exists - auto dump_file = debug_config->get_matched_from_filelist(files, "_dst0__"); - OPENVINO_ASSERT((files.size() == 1 || dump_file.length() != 0), "Unexpected binary dump for input layer"); - - OPENVINO_ASSERT(files.size() == get_primitive(inst->id())->outputs_memory_count(), "Mis-match dump file count"); - - for (size_t i = 0; i < get_primitive(inst->id())->outputs_memory_count(); i++) { - auto dump_file = files[0]; - if (files.size() > 1 || get_primitive(inst->id())->outputs_memory_count() != 1) { - std::string pattern = "_dst" + std::to_string(i) + "__"; - dump_file = debug_config->get_matched_from_filelist(files, pattern); - } - OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_dst[N]__' for binary dump"); - GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for " << layer_name << std::endl; - - std::vector bin = ov::util::load_binary(dump_file); - OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); - - auto output_mem = get_primitive(layer_name)->output_memory_ptr(i); - OPENVINO_ASSERT(output_mem->size() == bin.size(), "memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name - + "\n Expected size : " + to_string(output_mem->size()) + ", Binary : " + to_string(bin.size())); - - output_mem->copy_from(get_stream(), static_cast(&bin[0]), true); - } - } else { - auto check_dst = debug_config->get_matched_from_filelist(files, "_dst0__"); - OPENVINO_ASSERT(check_dst.length() == 0, "Expected to load binaries for inputs of " + layer_name); - - // Loading input tensors for any layer - auto dump_file = debug_config->get_matched_from_filelist(files, "_src0__"); - OPENVINO_ASSERT(dump_file.length() != 0, "Could not find expected pattern '_src[N]__' for binary dump input : " + layer_name); - - for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) { - auto dump_file = files[0]; - if (files.size() > 1 || get_primitive(inst->id())->dependencies().size() != 1) { - std::string pattern = "_src" + std::to_string(i) + "__"; - dump_file = debug_config->get_matched_from_filelist(files, pattern); - } - if (dump_file.length() == 0) { - GPU_DEBUG_COUT << " Skip loading for input(" << i << ") of " << layer_name << std::endl; - continue; - } - OPENVINO_ASSERT((dump_file.length() > 0), "Could not find expected pattern '_src[N]__' for binary dump input"); - GPU_DEBUG_COUT << " Load binary dump : " << dump_file << " for input(" << i << ") of " << layer_name << std::endl; - - std::vector bin = ov::util::load_binary(dump_file); - OPENVINO_ASSERT(!bin.empty(), "Failure loading binary from OV_GPU_LoadDumpRawBinary : " + dump_file); - - auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i); - if (input_mem->size() != bin.size()) { - std::cout << "WARNING: memory size mis-match for OV_GPU_LoadDumpRawBinary : " + layer_name - << " " << input_mem->size() << " / " << bin.size() << std::endl; - bin.resize(input_mem->size()); - } - - input_mem->copy_from(get_stream(), 
static_cast(&bin[0]), true); - } - } - } - } - - // Dump input buffers of 'inst' - GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) { - const std::string layer_name = inst->id(); - - GPU_DEBUG_IF(debug_config->is_target_iteration(curr_iter) && - debug_config->dump_layers_dst_only == 0 && debug_config->is_layer_for_dumping(layer_name)) { - std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" + layer_name + ":"; - for (size_t i = 0; i < get_primitive(inst->id())->dependencies().size(); i++) { - std::string name = "program" + std::to_string((get_program() != nullptr) ? get_program()->get_id() : 0) + - "_network" + std::to_string(get_id()) + - "_" + get_iteration_prefix(curr_iter) + - layer_name + "_src" + std::to_string(i); - auto input_mem = get_primitive(inst->id())->dep_memory_ptr(i); - if (input_mem == nullptr) { - GPU_DEBUG_COUT << " input_mem_" << i << " is nullptr. Nothing to dump." << std::endl; - continue; - } - - auto dep = inst->dependencies().at(i); - auto input_layout = dep.first->get_output_layout(dep.second); - GPU_DEBUG_IF(debug_config->dump_layers_binary) { - // Binary dump : raw - auto filename = get_file_path_for_binary_dump(input_layout, name); - - mem_lock lock(input_mem, get_stream()); - ov::util::save_binary(filename, lock.data(), input_mem->size()); - GPU_DEBUG_COUT << " Dump layer src : " << layer_name << " to " << filename << std::endl; - debug_str_for_bin_load += (filename + ","); - } else { - log_memory_to_file(input_mem, - input_layout, - get_stream(), - name, - debug_config->dump_layers_raw); - } - } - - GPU_DEBUG_IF(debug_config->dump_layers_binary && !inst->is_input()) { - debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; - GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;; - } - } - } + NODE_DEBUG(*inst); execute_primitive(inst, events); executed_prims++; if (needs_flushing && executed_prims % flush_frequency == 0) get_stream().flush(); - - // Dump output buffers of 'inst' - GPU_DEBUG_IF(debug_config->dump_layers_path.length() > 0) { - get_stream().finish(); - const std::string layer_name = inst->id(); - auto prog_id = ((get_program() != nullptr) ? get_program()->get_id() : 0); - auto net_id = get_id(); - GPU_DEBUG_IF(debug_config->is_target_iteration(curr_iter) && - debug_config->is_layer_for_dumping(layer_name, inst->is_output(), inst->is_input())) { - std::string debug_str_for_bin_load = " Command for loading : OV_GPU_LoadDumpRawBinary=\"" - + layer_name + ":"; - for (size_t i = 0; i < get_primitive(layer_name)->outputs_memory_count(); i++) { - std::string name = "program" + std::to_string(prog_id) + - "_network" + std::to_string(net_id) + - "_" + get_iteration_prefix(curr_iter) + - layer_name + "_dst" + std::to_string(i); - auto output_mem = get_primitive(layer_name)->output_memory_ptr(i); - if (output_mem == nullptr) { - GPU_DEBUG_COUT << " output_mem is nullptr. Nothing to dump." 
<< std::endl; - continue; - } - - GPU_DEBUG_IF(debug_config->dump_layers_binary) { - // Binary dump : raw - auto output_layout = inst->get_output_layout(i); - auto filename = get_file_path_for_binary_dump(output_layout, name); - - mem_lock lock(output_mem, get_stream()); - ov::util::save_binary(filename, lock.data(), output_mem->size()); - GPU_DEBUG_COUT << " Dump layer dst : " << layer_name << " to " << filename << std::endl; - debug_str_for_bin_load += (filename + ","); - } else { - // Text dump - log_memory_to_file(output_mem, inst->get_output_layout(i), get_stream(), name, debug_config->dump_layers_raw); - } - } - - GPU_DEBUG_IF(debug_config->dump_layers_binary && inst->is_input()) { - debug_str_for_bin_load[debug_str_for_bin_load.size()-1] = '\"'; - GPU_DEBUG_COUT << debug_str_for_bin_load << std::endl;; - } - } - } - } - - // print '-data_shape' option for benchmark_app - GPU_DEBUG_IF(debug_config->print_input_data_shapes == 1) { - std::stringstream data_shape_str; - auto add_string = [&data_shape_str](std::string str) { - data_shape_str << ((data_shape_str.rdbuf()->in_avail() == 0) ? " -data_shape " : ",") << str; - }; - - for (auto& inst : _exec_order) { - auto name = inst->id(); - auto pos = name.find(':'); - auto type = name.substr(0, pos); - name.erase(0, pos + 1); - if (inst->is_input() && type == "parameter") { - add_string(name + inst->get_output_layout().get_partial_shape().to_string()); - } - } - - GPU_DEBUG_COUT << "[program:" << std::setw(2) << ((get_program() != nullptr) ? get_program()->get_id() : 0) - << "|network:" << std::setw(2) << get_id() << "|iter:" << std::setw(4) << curr_iter << "] benchmark_app cmd: " - << data_shape_str.str() << std::endl; - } - - GPU_DEBUG_IF(!debug_config->dump_graphs.empty() && debug_config->is_target_iteration(curr_iter)) { - auto get_fixed_str = [](int value, int length = 2) -> std::string { - std::ostringstream ss; - ss << std::setw(length) << std::setfill('0') << std::to_string(value); - return ss.str(); - }; - std::string path = get_dir_path(get_config()); - if (!path.empty()) { - std::ofstream ofs(path + "cldnn_program_exec_p" + get_fixed_str(get_program()->get_id()) + "_n" + get_fixed_str(get_id()) - + "_" + get_fixed_str(curr_iter, 5) + ".graph"); - dump_graph_init(ofs, *get_program(), [&](const primitive_id& id) -> std::shared_ptr { - return get_primitive(id); - }); - } } // Store events only in case of OOO queue or enabled Profiling auto store_events = is_out_of_order_queue || _enable_profiling; if (store_events) { if (_program != nullptr) { - for (auto& inst : _program->get_processing_order()) { - // Special handling for mutable data. The event should be the same as the user or dependency with highest - // processing_num as the mutable_data can be updated when is both user or dependency. - if (inst->is_type()) { - decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0; - for (auto& user : inst->get_users()) { - auto user_proc_num = _program->get_processing_order().get_processing_number(user); - if (user_proc_num > proc_num) { - _events[inst->id()] = _events[user->id()]; - proc_num = user_proc_num; + for (auto& inst : _program->get_processing_order()) { + // Special handling for mutable data. The event should be the same as the user or dependency with highest + // processing_num as the mutable_data can be updated when is both user or dependency. 
+ if (inst->is_type()) { + decltype(_program->get_processing_order().get_processing_number(inst)) proc_num = 0; + for (auto& user : inst->get_users()) { + auto user_proc_num = _program->get_processing_order().get_processing_number(user); + if (user_proc_num > proc_num) { + _events[inst->id()] = _events[user->id()]; + proc_num = user_proc_num; + } } - } - if (!inst->get_dependencies().empty()) { - for (auto& dep : inst->get_dependencies()) { - auto dep_proc_num = _program->get_processing_order().get_processing_number(dep.first); - if (dep_proc_num > proc_num) { - _events[inst->id()] = _events[dep.first->id()]; - proc_num = dep_proc_num; + if (!inst->get_dependencies().empty()) { + for (auto& dep : inst->get_dependencies()) { + auto dep_proc_num = _program->get_processing_order().get_processing_number(dep.first); + if (dep_proc_num > proc_num) { + _events[inst->id()] = _events[dep.first->id()]; + proc_num = dep_proc_num; + } } } } } } - } for (auto& dout : _data_outputs) { // data primitives are not executed so if they are marked as output we need to add // them valid events manually @@ -1278,73 +845,6 @@ void network::execute_impl(const std::vector& events) { // Deallocate events from the previos iteration _old_events.clear(); - - GPU_DEBUG_IF(debug_config->dump_memory_pool > 0) { - auto& iters = debug_config->dump_memory_pool_iters; - if (iters.empty() || iters.find(curr_iter) != iters.end()) { - dump_memory_pool(debug_config->dump_memory_pool_path, curr_iter); - GPU_DEBUG_COUT << "============================================================================" << std::endl; - } - } - -#ifdef GPU_DEBUG_CONFIG - iteration++; -#endif -} - -void network::dump_memory_pool(std::string dump_path, int64_t curr_iter) { -#ifdef GPU_DEBUG_CONFIG - get_memory_pool().dump(get_id(), curr_iter, dump_path); - auto get_constants_mem_size = [&](allocation_type type) -> size_t { - size_t mem_size = 0; - for (auto& prim : _primitives) { - if (prim.second->get_node().is_constant()) { - for (size_t i = 0; i < prim.second->outputs_memory_count(); i++) { - if (prim.second->output_memory_ptr(i)->get_allocation_type() == type) - mem_size += prim.second->output_memory_ptr(i)->size(); - } - } - } - return mem_size; - }; - auto get_variables_mem_size = [&](allocation_type type) -> size_t { - size_t mem_size = 0; - for (auto& var : get_variables()) { - if (var.second->get_memory() && var.second->get_memory()->get_allocation_type() == type) - mem_size += var.second->get_actual_mem_size(); - } - return mem_size; - }; - auto get_mb_size = [&](int64_t size) -> std::string { - if (size == 0) return "0 MB"; - return std::to_string(static_cast(size) / (1024 * 1024)) + " MB"; - }; - int64_t usm_host_const_mem_size = get_constants_mem_size(allocation_type::usm_host); - int64_t usm_device_const_mem_size = get_constants_mem_size(allocation_type::usm_device); - int64_t usm_host_var_mem_size = get_variables_mem_size(allocation_type::usm_host); - int64_t usm_device_var_mem_size = get_variables_mem_size(allocation_type::usm_device); - int64_t host_mem_size = get_engine().get_used_device_memory(allocation_type::usm_host); - int64_t device_mem_size = get_engine().get_used_device_memory(allocation_type::usm_device); - int64_t usm_host_mem_pool_size = get_memory_pool().get_total_mem_pool_size(allocation_type::usm_host); - int64_t usm_host_etc_size = host_mem_size - usm_host_mem_pool_size - - usm_host_const_mem_size - usm_host_var_mem_size; - int64_t usm_device_mem_pool_size = 
get_memory_pool().get_total_mem_pool_size(allocation_type::usm_device); - int64_t usm_device_etc_size = device_mem_size - usm_device_mem_pool_size - - usm_device_const_mem_size - usm_device_var_mem_size; - GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; - GPU_DEBUG_COUT << "Memory statistics for (net_id:" << get_id() << ", iter:" << curr_iter << ")" << std::endl; - GPU_DEBUG_COUT << " Total host mem size : " << get_mb_size(host_mem_size) << std::endl; - GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_host_mem_pool_size) << std::endl; - GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_host_const_mem_size) << std::endl; - GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_host_var_mem_size) << std::endl; - GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_host_etc_size) << std::endl; - GPU_DEBUG_COUT << " Total device mem size : " << get_mb_size(device_mem_size) << std::endl; - GPU_DEBUG_COUT << " * Memory pool : " << get_mb_size(usm_device_mem_pool_size) << std::endl; - GPU_DEBUG_COUT << " * Constant : " << get_mb_size(usm_device_const_mem_size) << std::endl; - GPU_DEBUG_COUT << " * Variable : " << get_mb_size(usm_device_var_mem_size) << std::endl; - GPU_DEBUG_COUT << " * ETC : " << get_mb_size(usm_device_etc_size) << std::endl; - GPU_DEBUG_COUT << "------------------------------------------------------------------------" << std::endl; -#endif } std::vector network::get_input_ids() const { diff --git a/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp b/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp index bff45cd81f9900..4a2f43b28d9360 100644 --- a/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp +++ b/src/plugins/intel_gpu/src/graph/program_dump_graph.cpp @@ -170,7 +170,7 @@ std::string get_dir_path(const ExecutionConfig& config) { void dump_graph_init(std::ofstream& graph, const program& program, - std::function(const primitive_id&)> get_primitive_inst) { + std::function(const primitive_id&)> get_primitive_inst) { const std::string invalid_layout_msg = "(invalid layout)"; const auto dump_mem_info = [&invalid_layout_msg, &get_primitive_inst](const program_node* ptr) { diff --git a/src/plugins/intel_gpu/src/graph/program_node.cpp b/src/plugins/intel_gpu/src/graph/program_node.cpp index 3c9ad0f7317a27..21ba4e656fae0d 100644 --- a/src/plugins/intel_gpu/src/graph/program_node.cpp +++ b/src/plugins/intel_gpu/src/graph/program_node.cpp @@ -611,9 +611,9 @@ bool program_node::is_padded_spatial(size_t idx) const { auto& layout = get_output_layout(idx); const auto& lower_size = layout.data_padding._lower_size; const auto& upper_size = layout.data_padding._upper_size; - return std::any_of(std::begin(lower_size) + 2, std::begin(lower_size) + layout.get_spatial_rank() - 1, + return std::any_of(std::begin(lower_size) + 2, std::begin(lower_size) + 2 + layout.get_spatial_rank(), [](const tensor::value_type& el) { return el != 0; }) || - std::any_of(std::begin(upper_size) + 2, std::begin(upper_size) + layout.get_spatial_rank() - 1, + std::any_of(std::begin(upper_size) + 2, std::begin(upper_size) + 2 + layout.get_spatial_rank(), [](const tensor::value_type& el) { return el != 0; }); } diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 29d322d432dd35..57545b0df37cff 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ 
b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -809,7 +809,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET; #endif +#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + const int power_of_two_for_simd = 5; + const int power_of_two_for_osv = 6; + const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv); + const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1); + const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd; + // out_f(32) : 0 * osv_weight_stride + 32; + // out_f(64) : 64 * osv_weight_stride + 0; + // out_f(128) : 64 * osv_weight_stride + 32; + // ... + uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset; +#else uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2); +#endif ACCUMULATOR_VEC_TYPE acc[TILE_B] = { }; @@ -905,7 +918,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( __local int* char_slm_weight = (__local int*)wei_local_mem; + #if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2; + #else uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE; + #endif uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2; // DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE @@ -917,6 +934,17 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. So no need to transpose while unpacking dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); + #elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2 + SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx); + SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD))); + DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked; + DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp; + dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0)); + dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1)); + dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01; + dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45; + dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23; + dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67; #else SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx); DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed)); @@ -996,11 +1024,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)( acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight); } - #if FILTER_LAYOUT_OS_IYX_OSV16 && TILE_OFM == 2 - weights_offset += (TILE_K_OFM_PACKED/2) * SIMD; - #else - weights_offset += TILE_K_OFM_PACKED * SIMD; - #endif + weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD; #if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE) unroll_for (uint bi = 0; bi < TILE_B; ++bi) { diff --git 
a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 5377387c8b497e..c4115d74f54a92 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -534,6 +534,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para size_t tile_k_ofm_packed = tile_k_ofm; size_t quantize_grp_size = get_dynamic_quantize_group_size(params); + bool add_decompress_scale_post_op = false; WeightsType weights_dt = params.weights.GetDType(); if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) { tile_k_ofm_packed /= 2; @@ -542,7 +543,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v; // Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance if (scale_group_size % simd == 0 && !dispatchData.use_slm) - jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); + add_decompress_scale_post_op = true; } if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) { jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii")); @@ -619,6 +620,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para jit.AddConstant(MakeJitConstant("DQ_TYPE", "char")); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size)); } else { + if (add_decompress_scale_post_op) + jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1)); jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0)); jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size)); } @@ -781,8 +784,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa auto output_f = get_output_aligned_bf_size(fc_params, false).second; WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16; - // TODO: Update may also be required to fc_bf_tiled_kernel_dyn_quan kernel to support os_is_yx_osv64_isv2 format as needed - if (!should_dynamic_quantize(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 + if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16 && (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2) && (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4) && is_weight_horizontal(fc_params, output_f)) { diff --git a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp index 346b4471779593..88d69dcd3e47b3 100644 --- a/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp +++ b/src/plugins/intel_gpu/src/plugin/sync_infer_request.cpp @@ -34,9 +34,12 @@ namespace { inline bool can_use_usm_host(const cldnn::engine& engine) { auto can_use_usm = engine.use_unified_shared_memory(); - if (engine.get_device_info().gfx_ver.major == 12 && engine.get_device_info().gfx_ver.minor == 60) { - // WA: Disable USM host memory for infer request`s tensors for PVC as - // it has performance issues in case of host <-> device data transfers inside kernels + const auto& device_info = engine.get_device_info(); + if 
((device_info.gfx_ver.major == 12 && device_info.gfx_ver.minor == 60) || + (device_info.gfx_ver.major >= 20 && device_info.dev_type == cldnn::device_type::discrete_gpu)) { + // WA: Disable USM host memory for infer request`s tensors for PVC and subsequent dGPUs, as kernel access + // to system memory is slower than using an explicit memcpy (Host <-> Device) call with the copy engine + // Driver tickets with additional details: 6155, 10054 GPU_DEBUG_TRACE << "Do not use usm_host for performance issue" << std::endl; can_use_usm = false; } diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index 43cb5ec1aef931..563e99fcf2bad9 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -178,6 +178,7 @@ ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr& model, } auto process_params = [&](const ov::ParameterVector& _parameters) { for (size_t i = 0; i < _parameters.size(); i++) { + NPUW_ASSERT(_parameters[i]); LOG_VERB(_parameters[i]); for (size_t j = 0; j < orig_parameters.size(); j++) { if (_parameters[i] == orig_parameters[j]) { diff --git a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp index 22dfc6e103f719..192d975509ce5e 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/partitioning/partitioning.cpp @@ -4,6 +4,8 @@ #include "partitioning.hpp" +#include + #include "../logging.hpp" #include "../util.hpp" #include "intel_npu/al/config/npuw.hpp" @@ -20,6 +22,26 @@ #include "patterns/dcoff.hpp" #include "patterns/opt.hpp" +namespace ov { +namespace npuw { +inline bool operator==(const std::reference_wrapper& lhs, const std::reference_wrapper& rhs) { + ov::npuw::Subgraph& llink = lhs.get(); + ov::npuw::Subgraph& rlink = rhs.get(); + return &llink == &rlink; +} +} // namespace npuw +} // namespace ov + +template +struct std::hash> { + std::size_t operator()(std::pair const& p) const noexcept { + ov::npuw::Subgraph& sg = p.first.get(); + std::size_t h1 = std::hash{}(&sg); + std::size_t h2 = std::hash{}(p.second); + return h1 ^ (h2 << 1); + } +}; + namespace { class FuncallEverywhere { @@ -161,6 +183,8 @@ class Partitioner { using PPtr = std::shared_ptr; using RPtr = std::shared_ptr; + using SubgParam = std::pair; + using SubgResult = std::pair; using LinkPtrTo = std::pair param_call_to_proto; - std::unordered_map result_call_to_proto; + std::unordered_map param_call_to_proto; + std::unordered_map result_call_to_proto; }; std::map all_functions; @@ -203,7 +227,10 @@ class Partitioner { void createFunction(FunctionPipeline& func_ggg); template - void rearrange_to_function_protocol(const std::vector& protocol, std::vector& call, const M& call_to_proto) { + void rearrange_to_function_protocol(ov::npuw::Subgraph::Ref func_ref, + const std::vector& protocol, + std::vector& call, + const M& call_to_proto) { LOG_DEBUG("Rearranging..."); LOG_BLOCK(); LOG_DEBUG("Protocol: " << protocol.size()); @@ -215,7 +242,7 @@ class Partitioner { LOG_DEBUG("Call: " << call.size()); for (auto&& c : call) { LOG_BLOCK(); - auto p_c = call_to_proto.at(c); + auto p_c = call_to_proto.at(typename M::key_type(func_ref, c)); to_proto.push_back(p_c); LOG_DEBUG(c << " (which is " << p_c << ")"); } @@ -536,7 +563,7 @@ void Partitioner::identifySubgraphs() { LOG_VERB("Processing group's output layer " << 
output_layer_name); LOG_BLOCK(); auto output_layer_ptr = node_id_cache.at(output_layer_name); - if (output_layer_ptr->inputs().empty()) { + if (output_layer_ptr->outputs().empty()) { OPENVINO_THROW("The group's output layer ", output_layer_name, " has NO OUTPUTS!! - Graph contracts are broken??"); @@ -1327,9 +1354,12 @@ void Partitioner::matchParameters(const std::string& func_name) { // Now walk other submodels and match parameters with the same key // (yes, including the first one) - for (auto&& call : model_group) { + for (std::size_t call_id = 0; call_id < model_group.size(); ++call_id) { LOG_DEBUG("Handle function call..."); LOG_BLOCK(); + auto call = model_group[call_id]; + auto subg_ref = func.refs[call_id]; + std::unordered_set this_model_nodes; for (auto&& node_ptr : call->get_ordered_ops()) { this_model_nodes.insert(node_ptr.get()); @@ -1348,7 +1378,7 @@ void Partitioner::matchParameters(const std::string& func_name) { LOG_DEBUG("Find orig parameter for " << node); auto& orig_param = proto_parameters.at(pkey); auto this_param = std::dynamic_pointer_cast(node); - func.param_call_to_proto[this_param] = orig_param; + func.param_call_to_proto[SubgParam(subg_ref, this_param)] = orig_param; } } } @@ -1386,14 +1416,16 @@ void Partitioner::matchResults(const std::string& func_name) { // Now walk all submodels and match parameters with the same key // (yes, including the first one) - for (auto&& call : model_group) { + for (std::size_t call_idx = 0; call_idx < model_group.size(); ++call_idx) { + auto call = model_group[call_idx]; + auto subg_ref = func.refs[call_idx]; for (auto&& node : call->get_ordered_ops()) { if (ov::op::util::is_output(node)) { auto&& port = node->input(0).get_source_output(); RKey rkey = {layer_to_prototype.at(port.get_node()->get_friendly_name()), port.get_index()}; auto& orig_result = proto_results.at(rkey); auto this_result = std::dynamic_pointer_cast(node); - func.result_call_to_proto[this_result] = orig_result; + func.result_call_to_proto[SubgResult(subg_ref, this_result)] = orig_result; } } } @@ -1517,8 +1549,8 @@ void Partitioner::matchRepeatedSubgraphs(const std::string& func_name) { funcall._gflops = this_sg._gflops; // duplicated code again! funcall._ops = this_sg._ops; // duplicated code again! funcall._avoid_list = this_sg._avoid_list; // duplicated code again! - rearrange_to_function_protocol(body_params, funcall._parameters, func_ggg.param_call_to_proto); - rearrange_to_function_protocol(body_results, funcall._results, func_ggg.result_call_to_proto); + rearrange_to_function_protocol(this_sg, body_params, funcall._parameters, func_ggg.param_call_to_proto); + rearrange_to_function_protocol(this_sg, body_results, funcall._results, func_ggg.result_call_to_proto); auto func_iter = P.functions.find(func_name); NPUW_ASSERT(func_iter != P.functions.end()); @@ -1883,7 +1915,7 @@ void Partitioner::finalizeLinks() { auto& params = P.functions.at(sg_desc._funcall)._model->get_parameters(); auto& proto = func_pipeline_type == FunctionPipelineType::CWAI ? ptr // no protos in the CWAI case.. 
- : all_functions.at(sg_desc._funcall).param_call_to_proto.at(ptr); + : all_functions.at(sg_desc._funcall).param_call_to_proto.at(SubgParam(sg_desc, ptr)); auto param_iter = std::find(params.begin(), params.end(), proto); NPUW_ASSERT(param_iter != params.end()); return std::distance(params.begin(), param_iter); @@ -1904,7 +1936,7 @@ void Partitioner::finalizeLinks() { auto& results = P.functions.at(sg_desc._funcall)._model->get_results(); auto& proto = func_pipeline_type == FunctionPipelineType::CWAI ? ptr // no protos in the CWAI case... - : all_functions.at(sg_desc._funcall).result_call_to_proto.at(ptr); + : all_functions.at(sg_desc._funcall).result_call_to_proto.at(SubgResult(sg_desc, ptr)); auto result_iter = std::find(results.begin(), results.end(), proto); NPUW_ASSERT(result_iter != results.end()); return std::distance(results.begin(), result_iter); diff --git a/src/plugins/intel_npu/tests/unit/CMakeLists.txt b/src/plugins/intel_npu/tests/unit/CMakeLists.txt index 5741a1e43c2a5b..861a0ff6a47076 100644 --- a/src/plugins/intel_npu/tests/unit/CMakeLists.txt +++ b/src/plugins/intel_npu/tests/unit/CMakeLists.txt @@ -34,12 +34,9 @@ ov_add_test_target( NPUW ) -if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU") - target_compile_options(${TARGET_NAME} PRIVATE -mavx2 -mf16c) -elseif(CMAKE_CXX_COMPILER_ID MATCHES "MSVC") - target_compile_options(${TARGET_NAME} PRIVATE /arch:AVX2) -else() - message(AUTHOR_WARNING "Unknown compiler, may miss the AVX2 baseline setting") +if(ENABLE_AVX2) + ov_avx2_optimization_flags(avx2_flags) + target_compile_options(${TARGET_NAME} PRIVATE "${avx2_flags}") endif() install(TARGETS ${TARGET_NAME} diff --git a/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp index 51285c8145ceb6..1049832f6ead7c 100644 --- a/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp +++ b/src/plugins/intel_npu/tests/unit/npuw/unpack.cpp @@ -2,6 +2,7 @@ // SPDX-License-Identifier: Apache-2.0 // +#ifdef HAVE_AVX2 #include "unpack.hpp" namespace { @@ -98,3 +99,5 @@ INSTANTIATE_TEST_SUITE_P(UnpackTestsWithScaleAndZeroPointTest3, UnpackTestsWithS UnpackTestsWithScaleAndZeroPointTest3::getTestCaseName); } // anonymous namespace + +#endif // __AVX2__ diff --git a/src/plugins/intel_npu/tools/single-image-test/main.cpp b/src/plugins/intel_npu/tools/single-image-test/main.cpp index 3b3009bb5f459c..4018982b022ed3 100644 --- a/src/plugins/intel_npu/tools/single-image-test/main.cpp +++ b/src/plugins/intel_npu/tools/single-image-test/main.cpp @@ -1200,7 +1200,8 @@ bool computeRRMSE(const ov::Tensor& output, const ov::Tensor& reference) { double rrmseLoss = sqrt(error / sum); - std::cout << "RRMSE loss : " << rrmseLoss << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl; + std::cout << "RRMSE loss : " << std::fixed << std::setprecision(4) << rrmseLoss + << " RRMSE threshold : " << FLAGS_rrmse_loss_threshold << std::endl; return rrmseLoss <= FLAGS_rrmse_loss_threshold; } @@ -1267,7 +1268,8 @@ bool computeNRMSE(const ov::Tensor& output, const ov::Tensor& reference) { double nrmseLoss = sqrt(error / size) / std::max(0.001f, std::max(maxOutput - minOutput, maxReference - minReference)); - std::cout << "NRMSE loss : " << nrmseLoss << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; + std::cout << "NRMSE loss : " << std::fixed << std::setprecision(4) << nrmseLoss + << " NRMSE threshold : " << FLAGS_nrmse_loss_threshold << std::endl; return nrmseLoss <= FLAGS_nrmse_loss_threshold; } @@ -1319,7 +1321,7 
@@ bool testPSNR(const TensorMap& outputs, const TensorMap& references, const int d auto result = utils::runPSNRMetric(actOutput, refOutput, dstHeight, dstWidth, scaleBorder, normalizedImage); - if (std::fabs(result - FLAGS_psnr_reference) > FLAGS_psnr_tolerance) { + if (FLAGS_psnr_reference - result > FLAGS_psnr_tolerance) { std::cout << "Absolute difference between actual value " << result << " and reference value " << FLAGS_psnr_reference << " larger then tolerance " << FLAGS_psnr_tolerance << std::endl; return false; diff --git a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py index 1cf458500bcc71..e55a86f279de21 100644 --- a/tests/layer_tests/pytorch_tests/test_bitwise_ops.py +++ b/tests/layer_tests/pytorch_tests/test_bitwise_ops.py @@ -4,6 +4,8 @@ import numpy as np import pytest import torch +from packaging import version + from pytorch_layer_test_class import PytorchLayerTest, skip_if_export @@ -69,10 +71,12 @@ def forward_not_out(self, tensor_a, out): ) @pytest.mark.parametrize("out", [False, skip_if_export(True)]) def test_bitwise_mixed_dtypes( - self, op_type, out, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version + self, op_type, out, lhs_dtype, rhs_dtype, lhs_shape, rhs_shape, ie_device, precision, ir_version ): if ie_device == "GPU" and (lhs_dtype != "bool" or rhs_dtype != "bool"): pytest.xfail(reason="bitwise ops are not supported on GPU") + if out and version.parse(np.__version__) >= version.parse("2.0.0"): + pytest.xfail(reason="CVS-154082: incorrect handling out type") self._test( *self.create_model(op_type, out), ie_device, diff --git a/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py b/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py index f0f9085d32ba2f..e982867c9ac08d 100644 --- a/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py +++ b/tests/layer_tests/tensorflow_tests/test_tf_ExpandDims.py @@ -6,6 +6,7 @@ import tensorflow as tf from common.tf_layer_test_class import CommonTFLayerTest +rng = np.random.default_rng(62362) class TestExpandDims(CommonTFLayerTest): def _prepare_input(self, inputs_info): @@ -40,3 +41,54 @@ def test_expand_dims_basic(self, params, ie_device, precision, ir_version, temp_ self._test(*self.create_expand_dims_net(**params), ie_device, precision, ir_version, temp_dir=temp_dir, use_legacy_frontend=use_legacy_frontend) + + +class TestExpandDimsComplex(CommonTFLayerTest): + def _prepare_input(self, inputs_info): + # generate elements so that the input tensor may contain repeating elements + assert 'param_real:0' in inputs_info + assert 'param_imag:0' in inputs_info + + input_shape = inputs_info['param_real:0'] + + inputs_data = {} + inputs_data['param_real:0'] = rng.integers(-10.0, 10.0, input_shape).astype(np.float32) + inputs_data['param_imag:0'] = rng.integers(-10.0, 10.0, input_shape).astype(np.float32) + + return inputs_data + + def create_expand_dims_complex_net(self, axis_dtype, input_shape, axis): + tf.compat.v1.reset_default_graph() + with tf.compat.v1.Session() as sess: + param_real = tf.compat.v1.placeholder(np.float32, input_shape, 'param_real') + param_imag = tf.compat.v1.placeholder(np.float32, input_shape, 'param_imag') + + complex = tf.raw_ops.Complex(real=param_real, imag=param_imag) + + axis = tf.constant(axis, dtype=axis_dtype) + + result = tf.raw_ops.ExpandDims(input=complex, axis=axis) + + tf.raw_ops.Real(input=result) + tf.raw_ops.Imag(input=result) + + tf.compat.v1.global_variables_initializer() + tf_net = 
sess.graph_def + + return tf_net, None + + test_basic = [ + dict(input_shape=[], axis=0), + dict(input_shape=[2, 3], axis=1), + dict(input_shape=[2, 3, 4], axis=-1), + dict(input_shape=[2, 6, 5], axis=-2), + ] + + @pytest.mark.parametrize("axis_dtype", [np.int32, np.int64]) + @pytest.mark.parametrize("op_args", test_basic) + @pytest.mark.nightly + @pytest.mark.precommit + def test_expand_dims_basic_complex(self, axis_dtype, op_args, ie_device, precision, ir_version, temp_dir, use_legacy_frontend): + self._test(*self.create_expand_dims_complex_net(axis_dtype, **op_args), + ie_device, precision, ir_version, temp_dir=temp_dir, + use_legacy_frontend=use_legacy_frontend) diff --git a/tests/requirements_pytorch b/tests/requirements_pytorch index b82e0c76409057..0d5ac61903b104 100644 --- a/tests/requirements_pytorch +++ b/tests/requirements_pytorch @@ -1,10 +1,14 @@ +# test ovc with NumPy 2.x on Ubuntu 24 with default Python 3.12 +# test against NumPy 1.x with older Python versions # optimum still requires numpy<2.0.0 -numpy==1.26.4 +numpy==1.26.4; python_version < "3.12" +numpy==2.1.1; python_version >= "3.12" torch==2.4.1; platform_system != "Darwin" or platform_machine != "x86_64" -torch==2.2.0; platform_system == "Darwin" and platform_machine == "x86_64" +torch==2.2.2; platform_system == "Darwin" and platform_machine == "x86_64" --extra-index-url https://download.pytorch.org/whl/cpu -torchvision==0.19.1 +torchvision==0.19.1; platform_system != "Darwin" or platform_machine != "x86_64" +torchvision==0.17.2; platform_system == "Darwin" and platform_machine == "x86_64" # transformers 4.45.1 is available # but optimum still requires <4.45.0 transformers==4.44.2 @@ -13,22 +17,22 @@ pytest-html==4.1.1 pytest-xdist[psutil]==3.6.1 defusedxml==0.7.1 -auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64" +auto-gptq==0.7.1; platform_system == "Linux" and platform_machine == "x86_64" and python_version < "3.12" av==13.0.0 -basicsr==1.4.2 +basicsr==1.4.2; python_version < "3.12" datasets==3.0.1 easyocr==1.7.2 -facexlib==0.3.0 -librosa==0.10.2 -optimum==1.22.0 +facexlib==0.3.0; python_version < "3.12" +librosa==0.10.2; python_version < "3.12" +optimum==1.22.0; python_version < "3.12" packaging==24.1 pandas==2.2.3 protobuf==5.28.2 -pyctcdecode==0.5.0 +pyctcdecode==0.5.0; python_version < "3.12" sacremoses==0.1.1 sentencepiece==0.2.0 soundfile==0.12.1 -super-image==0.1.7 +super-image==0.1.7; python_version < "3.12" timm==1.0.8 torchaudio==2.4.1 wheel==0.44.0 @@ -36,7 +40,7 @@ PyYAML==6.0.2 kornia==0.7.3 # use latest released version once it's available -git+https://github.com/huggingface/optimum-intel.git@main +git+https://github.com/huggingface/optimum-intel.git@main; python_version < "3.12" # set 'export HF_HUB_ENABLE_HF_TRANSFER=1' to benefits from hf_transfer hf_transfer==0.1.8 diff --git a/tests/requirements_tensorflow b/tests/requirements_tensorflow index 9d025397ed1fbd..6042eb8a46a9c3 100644 --- a/tests/requirements_tensorflow +++ b/tests/requirements_tensorflow @@ -4,7 +4,8 @@ pytest==7.0.1 pytest-xdist[psutil]==3.6.1 pytest-html==4.1.1 transformers==4.45.1 -tensorflow==2.17.0 +tensorflow==2.17.0; platform_system != "Darwin" or platform_machine != "x86_64" +tensorflow==2.16.2; platform_system == "Darwin" and platform_machine == "x86_64" # tensorflow-text is not available for both Windows and ARM platforms tensorflow-text==2.17.0; platform_system == "Linux" and platform_machine == "x86_64" tensorflow-hub==0.16.1