Skip to content

Commit

Permalink
Merge branch 'master' into use-intrinsics-in-core-convert
Browse files Browse the repository at this point in the history
  • Loading branch information
praasz authored Oct 3, 2024
2 parents 58e7f17 + 1b892bf commit 5f52d43
Show file tree
Hide file tree
Showing 13 changed files with 102 additions and 69 deletions.
39 changes: 14 additions & 25 deletions .github/workflows/job_pytorch_layer_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@ on:
description: 'Machine on which the tests would run'
type: string
required: true
shell:
description: "shell to override the default shell settings in the runner's operating system."
type: string
required: true
container:
description: 'JSON to be converted to the value of the "container" configuration for the job'
type: string
Expand All @@ -20,12 +16,15 @@ on:
description: 'Components that are affected by changes in the commit defined by the Smart CI Action'
type: string
required: true
python-version:
description: 'Python version to setup. E.g., "3.11"'
type: string
required: true

permissions: read-all

env:
PIP_CACHE_PATH: /mount/caches/pip/linux
PYTHON_VERSION: '3.11'

jobs:
PyTorch_Layer_Tests:
Expand All @@ -35,7 +34,7 @@ jobs:
container: ${{ fromJSON(inputs.container) }}
defaults:
run:
shell: ${{ inputs.shell }}
shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }}
env:
DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input
OPENVINO_REPO: ${{ github.workspace }}/openvino
Expand All @@ -55,12 +54,6 @@ jobs:
name: openvino_tests
path: ${{ env.INSTALL_TEST_DIR }}

- name: Download OpenVINO tokenizers extension
uses: actions/download-artifact@fa0a91b85d4f404e444e00e005971372dc801d16 # v4.1.8
with:
name: openvino_tokenizers_wheel
path: ${{ env.INSTALL_DIR }}

# Needed as ${{ github.workspace }} is not working correctly when using Docker
- name: Setup Variables
if: runner.os != 'Windows'
Expand Down Expand Up @@ -98,10 +91,10 @@ jobs:
sparse-checkout-cone-mode: false
path: 'openvino'

- name: Setup Python ${{ env.PYTHON_VERSION }}
- name: Setup Python ${{ inputs.python-version }}
uses: ./openvino/.github/actions/setup_python
with:
version: ${{ env.PYTHON_VERSION }}
version: ${{ inputs.python-version }}
pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }}
should-setup-pip-paths: ${{ runner.os == 'Linux' }}
self-hosted-runner: ${{ runner.os == 'Linux' }}
Expand All @@ -112,43 +105,39 @@ jobs:
# Install the core OV wheel
python3 -m pip install ${INSTALL_DIR}/tools/openvino-*.whl
# Install the core OV Tokenizers wheel
python3 -m pip install ${INSTALL_DIR}/openvino_tokenizers-*.whl
- name: Install OpenVINO Python wheels (Windows)
if: runner.os == 'Windows'
run: |
# Find and install the core OV wheel
$ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }}\tools -Filter openvino-*.whl | % { $_.FullName }
python3 -m pip install "$ovCoreWheelPath"
# Find and install the core OV Tokenizers wheel
$ovCoreWheelPath=Get-ChildItem -Path ${{ env.INSTALL_DIR }} -Filter openvino_tokenizers-*.whl | % { $_.FullName }
python3 -m pip install "$ovCoreWheelPath"
- name: Install Pytorch Layer tests dependencies
run: |
# pytorch test requirements
python3 -m pip install -r ${{ env.INSTALL_TEST_DIR }}/requirements_pytorch
- name: PyTorch Layer Tests
if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287, 142196
run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
# due to CVS-152795, parallel run is not possible on Windows
run: python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
env:
TEST_DEVICE: CPU
TEST_PRECISION: FP32
PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}}

- name: PyTorch torch.export Layer Tests
if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' }} # Ticket: 126287
if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287
run: |
python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -n logical -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests ${PARALLEL} -m precommit_torch_export --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
env:
TEST_DEVICE: CPU
TEST_PRECISION: FP32
PYTORCH_TRACING_MODE: EXPORT
PARALLEL: ${{ runner.os == 'Windows' && ' ' || '-n logical'}}

- name: PyTorch torch.compile TORCHFX Layer Tests
if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' }} # Ticket: 126287
if: ${{ fromJSON(inputs.affected-components).PyTorch_FE.test && runner.os != 'macOS' && runner.arch != 'ARM64' && runner.os != 'Windows' }} # Ticket: 126287
run: |
python3 -m pytest ${{ env.LAYER_TESTS_INSTALL_DIR }}/pytorch_tests -m precommit_fx_backend --junitxml=${{ env.INSTALL_TEST_DIR }}/TEST-pytorch.xml
env:
Expand Down
15 changes: 7 additions & 8 deletions .github/workflows/job_tensorflow_layer_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,6 @@ on:
description: 'Machine on which the tests would run'
type: string
required: true
shell:
description: "shell to override the default shell settings in the runner's operating system."
type: string
required: true
container:
description: 'JSON to be converted to the value of the "container" configuration for the job'
type: string
Expand All @@ -20,12 +16,15 @@ on:
description: 'Components that are affected by changes in the commit defined by the Smart CI Action'
type: string
required: true
python-version:
description: 'Python version to setup. E.g., "3.11"'
type: string
required: true

permissions: read-all

env:
PIP_CACHE_PATH: /mount/caches/pip/linux
PYTHON_VERSION: '3.11'

jobs:
TensorFlow_Layer_Tests:
Expand All @@ -35,7 +34,7 @@ jobs:
container: ${{ fromJSON(inputs.container) }}
defaults:
run:
shell: ${{ inputs.shell }}
shell: ${{ contains(inputs.runner, 'win') && 'pwsh' || 'bash' }}
env:
DEBIAN_FRONTEND: noninteractive # to prevent apt-get from waiting user input
OPENVINO_REPO: ${{ github.workspace }}/openvino
Expand Down Expand Up @@ -98,10 +97,10 @@ jobs:
sparse-checkout-cone-mode: false
path: 'openvino'

- name: Setup Python ${{ env.PYTHON_VERSION }}
- name: Setup Python ${{ inputs.python-version }}
uses: ./openvino/.github/actions/setup_python
with:
version: ${{ env.PYTHON_VERSION }}
version: ${{ inputs.python-version }}
pip-cache-path: ${{ runner.os == 'Linux' && env.PIP_CACHE_PATH || '' }}
should-setup-pip-paths: ${{ runner.os == 'Linux' }}
self-hosted-runner: ${{ runner.os == 'Linux' }}
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/linux_arm64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -173,19 +173,19 @@ jobs:
uses: ./.github/workflows/job_tensorflow_layer_tests.yml
with:
runner: 'aks-linux-16-cores-arm'
shell: bash
container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}'
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

Pytorch_Layer_Tests:
name: Pytorch Layer Tests
needs: [ Build, Docker, Smart_CI, Openvino_tokenizers ]
needs: [ Build, Docker, Smart_CI ]
uses: ./.github/workflows/job_pytorch_layer_tests.yml
with:
runner: 'aks-linux-16-cores-arm'
shell: bash
container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_20_04_arm64 }}", "volumes": ["/mount:/mount"]}'
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

CPU_Functional_Tests:
name: CPU functional tests
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/mac.yml
Original file line number Diff line number Diff line change
Expand Up @@ -276,17 +276,17 @@ jobs:
uses: ./.github/workflows/job_tensorflow_layer_tests.yml
with:
runner: 'macos-13'
shell: bash
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

Pytorch_Layer_Tests:
name: Pytorch Layer Tests
needs: [ Build, Smart_CI, Openvino_tokenizers ]
needs: [ Build, Smart_CI ]
uses: ./.github/workflows/job_pytorch_layer_tests.yml
with:
runner: 'macos-13'
shell: bash
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

CPU_Functional_Tests:
name: CPU functional tests
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/mac_arm64.yml
Original file line number Diff line number Diff line change
Expand Up @@ -275,17 +275,17 @@ jobs:
uses: ./.github/workflows/job_tensorflow_layer_tests.yml
with:
runner: 'macos-13-xlarge'
shell: bash
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

Pytorch_Layer_Tests:
name: Pytorch Layer Tests
needs: [ Build, Smart_CI, Openvino_tokenizers ]
needs: [ Build, Smart_CI ]
uses: ./.github/workflows/job_pytorch_layer_tests.yml
with:
runner: 'macos-13-xlarge'
shell: bash
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

CPU_Functional_Tests:
name: CPU functional tests
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/ubuntu_22.yml
Original file line number Diff line number Diff line change
Expand Up @@ -305,19 +305,19 @@ jobs:
uses: ./.github/workflows/job_tensorflow_layer_tests.yml
with:
runner: 'aks-linux-4-cores-16gb'
shell: bash
container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}'
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

Pytorch_Layer_Tests:
name: Pytorch Layer Tests
needs: [ Docker, Build, Smart_CI, Openvino_tokenizers ]
needs: [ Docker, Build, Smart_CI ]
uses: ./.github/workflows/job_pytorch_layer_tests.yml
with:
runner: 'aks-linux-4-cores-16gb'
shell: bash
container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_22_04_x64 }}", "volumes": ["/mount:/mount"]}'
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

CPU_Functional_Tests:
name: CPU functional tests
Expand Down
10 changes: 10 additions & 0 deletions .github/workflows/ubuntu_24.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,16 @@ jobs:
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.12'

Pytorch_Layer_Tests:
name: Pytorch Layer Tests
needs: [ Docker, Build, Smart_CI ]
uses: ./.github/workflows/job_pytorch_layer_tests.yml
with:
runner: 'aks-linux-4-cores-16gb'
container: '{"image": "${{ fromJSON(needs.docker.outputs.images).ov_test.ubuntu_24_04_x64 }}", "volumes": ["/mount:/mount"]}'
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.12'

Overall_Status:
name: ci/gha_overall_status_ubuntu_24
needs: [Smart_CI, Build, Debian_Packages, Samples, Python_Unit_Tests]
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/windows_vs2019_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -404,17 +404,17 @@ jobs:
uses: ./.github/workflows/job_tensorflow_layer_tests.yml
with:
runner: 'aks-win-8-cores-16gb'
shell: pwsh
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

Pytorch_Layer_Tests:
name: Pytorch Layer Tests
needs: [ Build, Smart_CI, Openvino_tokenizers ]
needs: [ Build, Smart_CI ]
uses: ./.github/workflows/job_pytorch_layer_tests.yml
with:
runner: 'aks-win-8-cores-16gb'
shell: pwsh
affected-components: ${{ needs.smart_ci.outputs.affected_components }}
python-version: '3.11'

CXX_Unit_Tests:
name: C++ unit tests
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
#endif

#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
const int power_of_two_for_simd = 5;
const int power_of_two_for_osv = 6;
const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv);
const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1);
const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd;
// out_f(32) : 0 * osv_weight_stride + 32;
// out_f(64) : 64 * osv_weight_stride + 0;
// out_f(96) : 64 * osv_weight_stride + 32;
// out_f(128) : 128 * osv_weight_stride + 0;
// ...
uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset;
#else
uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2);
#endif

ACCUMULATOR_VEC_TYPE acc[TILE_B] = { };

Expand Down Expand Up @@ -905,7 +918,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

__local int* char_slm_weight = (__local int*)wei_local_mem;

#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2;
#else
uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE;
#endif
uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2;

// DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE
Expand All @@ -917,6 +934,17 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
// Loaded weights 'wei_packed' in os_iyx_osv16 format hold contiguous values along TILE_K, so there is no need to transpose while unpacking
dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
#elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx);
SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD)));
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked;
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp;
dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01;
dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45;
dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23;
dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67;
#else
SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx);
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed));
Expand Down Expand Up @@ -996,11 +1024,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight);
}

#if FILTER_LAYOUT_OS_IYX_OSV16 && TILE_OFM == 2
weights_offset += (TILE_K_OFM_PACKED/2) * SIMD;
#else
weights_offset += TILE_K_OFM_PACKED * SIMD;
#endif
weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;

#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -534,6 +534,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
size_t tile_k_ofm_packed = tile_k_ofm;
size_t quantize_grp_size = get_dynamic_quantize_group_size(params);

bool add_decompress_scale_post_op = false;
WeightsType weights_dt = params.weights.GetDType();
if (weights_dt == WeightsType::UINT4 || weights_dt == WeightsType::INT4) {
tile_k_ofm_packed /= 2;
Expand All @@ -542,7 +543,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
const size_t scale_group_size = params.weights.IFM().v / params.decompression_scale.Feature().v;
// Do not use SCALE_POST_OP for SLM kernel, since it demonstrates worse performance
if (scale_group_size % simd == 0 && !dispatchData.use_slm)
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
add_decompress_scale_post_op = true;
}
if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii"));
Expand Down Expand Up @@ -619,6 +620,8 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
} else {
if (add_decompress_scale_post_op)
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size));
}
Expand Down Expand Up @@ -781,8 +784,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
auto output_f = get_output_aligned_bf_size(fc_params, false).second;

WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16;
// TODO: Update may also be required to fc_bf_tiled_kernel_dyn_quan kernel to support os_is_yx_osv64_isv2 format as needed
if (!should_dynamic_quantize(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
&& (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2)
&& (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4)
&& is_weight_horizontal(fc_params, output_f)) {
Expand Down
Loading

0 comments on commit 5f52d43

Please sign in to comment.