Skip to content

Commit

Permalink
[GPU] Support large N FC optimization for dynamic quantization case (#…
Browse files Browse the repository at this point in the history
…26848)

### Details:
 - Update `fc_bf_tiled_kernel_dyn_quan` for osv_is_yx_osv64_isv2 support

### Tickets:
 - 153232
  • Loading branch information
andrew-k-park authored Oct 3, 2024
1 parent 560db90 commit 18939dc
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 7 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,20 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
uint input_offset = out_b * TILE_IN_B_PITCH + INPUT0_OFFSET;
#endif

#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
const int power_of_two_for_simd = 5;
const int power_of_two_for_osv = 6;
const uint osv64_weight_base = (( (int) (out_f >> power_of_two_for_osv) ) << power_of_two_for_osv);
const uint osv_weight_stride = (INPUT_ELEMENTS_COUNT >> 1);
const uint out_f_offset = (int)((out_f >> power_of_two_for_simd) & 0x1) << power_of_two_for_simd;
// out_f(32) : 0 * osv_weight_stride + 32;
// out_f(64) : 64 * osv_weight_stride + 0;
// out_f(128) : 64 * osv_weight_stride + 32;
// ...
uint weights_offset = osv64_weight_base * osv_weight_stride + out_f_offset;
#else
uint weights_offset = out_f * (INPUT_ELEMENTS_COUNT / 2);
#endif

ACCUMULATOR_VEC_TYPE acc[TILE_B] = { };

Expand Down Expand Up @@ -905,7 +918,11 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(

__local int* char_slm_weight = (__local int*)wei_local_mem;

#if FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE * 2;
#else
uint weights_idx = weights_offset + local_id * SIMD * FILTER_LOAD_ITERS * FILTER_ACTUAL_LOAD_BLOCK_SIZE;
#endif
uint wei_local_idx = local_id * SIMD * FILTER_LOAD_ITERS * (FILTER_LOAD_BLOCK_SIZE/2) + sglid * 2;

// DECOMPRESSION_SCALE_POST_OP SHOULD be enabled for dynamic quantize FC : scale is ACCUMULATOR_VAL_ONE
Expand All @@ -917,6 +934,17 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
// loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. So no need to transpose while unpacking
dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
#elif FILTER_LAYOUT_OS_IS_YX_OSV64_ISV2
SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx);
SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + (FILTER_LOAD_BLOCK_SIZE * SIMD)));
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked;
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked_tmp;
dq_wei_unpacked_tmp.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
dq_wei_unpacked_tmp.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
dq_wei_unpacked.s01 = dq_wei_unpacked_tmp.s01;
dq_wei_unpacked.s23 = dq_wei_unpacked_tmp.s45;
dq_wei_unpacked.s45 = dq_wei_unpacked_tmp.s23;
dq_wei_unpacked.s67 = dq_wei_unpacked_tmp.s67;
#else
SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx);
DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed));
Expand Down Expand Up @@ -996,11 +1024,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
acc_tmp[1][bi] = imad_SW(acc_tmp[1][bi], input_val, second_weight);
}

#if FILTER_LAYOUT_OS_IYX_OSV16 && TILE_OFM == 2
weights_offset += (TILE_K_OFM_PACKED/2) * SIMD;
#else
weights_offset += TILE_K_OFM_PACKED * SIMD;
#endif
weights_offset += TILE_K_OFM_PACKED * TILE_OFM_PER_OSV_SIZE * SIMD;

#if DECOMPRESSION_SCALE_POST_OP && (TILE_IFM_ELEMENTS_SIZE > DECOMPRESSION_SCALE_GROUP_SIZE)
unroll_for (uint bi = 0; bi < TILE_B; ++bi) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -781,8 +781,7 @@ KernelsData FullyConnected_bf_tiled::GetTunedKernelsDataByIndex(const Params &pa
auto output_f = get_output_aligned_bf_size(fc_params, false).second;

WeightsLayout weights_layout = WeightsLayout::os_iyx_osv16;
// TODO: Update may also be required to fc_bf_tiled_kernel_dyn_quan kernel to support os_is_yx_osv64_isv2 format as needed
if (!should_dynamic_quantize(fc_params) && fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
if (fc_params.compressed && fc_params.inputs[0].GetDType() == Datatype::F16
&& (fc_params.weights.GetLayout() == WeightsLayout::oiyx || fc_params.weights.GetLayout() == WeightsLayout::os_is_yx_osv64_isv2)
&& (fc_params.weights.GetDType() == WeightsType::INT4 || fc_params.weights.GetDType() == WeightsType::UINT4)
&& is_weight_horizontal(fc_params, output_f)) {
Expand Down

0 comments on commit 18939dc

Please sign in to comment.