Skip to content

Commit

Permalink
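Eliminated the unneeded block-size offset from the SIMD next_start_code loop bounds (and relaxed the matching size guards from > to >=)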
Browse files: browse the repository at this point in the history
  • Loading branch information
t-boiko committed Dec 2, 2024
1 parent 8102c9b commit 4ae6a52
Show file tree
Hide file tree
Showing 4 changed files with 8 additions and 8 deletions.
4 changes: 2 additions & 2 deletions vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX2.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,15 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::AVX2>(const uint8_t *pdatai
{
size_t i = 0;
size_t datasize64 = (datasize >> 6) << 6;
if (datasize64 > 64)
if (datasize64 >= 64)
{
const __m256i v1 = _mm256_set1_epi8(1);
__m256i vdata = _mm256_loadu_si256((const __m256i*)pdatain);
__m256i vBfr = _mm256_set1_epi16(((m_BitBfr << 8) & 0xFF00) | ((m_BitBfr >> 8) & 0xFF));
__m256i vdata_alignr16b_init = _mm256_permute2f128_si256(vBfr, vdata, 1 | (2<<4));
__m256i vdata_prev1 = _mm256_alignr_epi8(vdata, vdata_alignr16b_init, 15);
__m256i vdata_prev2 = _mm256_alignr_epi8(vdata, vdata_alignr16b_init, 14);
for ( ; i < datasize64 - 64; i += 64)
for ( ; i < datasize64; i += 64)
{
for (int c = 0; c < 64; c += 32) // this might force compiler to unroll the loop so we might have 2 loads in parallel
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::AVX512>(const uint8_t *pdat
{
size_t i = 0;
size_t datasize128 = (datasize >> 7) << 7;
if (datasize128 > 128)
if (datasize128 >= 128)
{
const __m512i v1 = _mm512_set1_epi8(1);
const __m512i v254 = _mm512_set1_epi8(-2);
Expand All @@ -24,7 +24,7 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::AVX512>(const uint8_t *pdat
__m512i vdata_alignr48b_init = _mm512_alignr_epi32(vdata, vBfr, 12);
__m512i vdata_prev1 = _mm512_alignr_epi8(vdata, vdata_alignr48b_init, 15);
__m512i vdata_prev2 = _mm512_alignr_epi8(vdata, vdata_alignr48b_init, 14);
for ( ; i < datasize128 - 128; i += 128)
for ( ; i < datasize128; i += 128)
{
for (int c = 0; c < 128; c += 64) // this might force compiler to unroll the loop so we might have 2 loads in parallel
{
Expand Down
4 changes: 2 additions & 2 deletions vk_video_decoder/libs/NvVideoParser/src/NextStartCodeNEON.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::NEON>(const uint8_t *pdatai
{
size_t i = 0;
size_t datasize32 = (datasize >> 5) << 5;
if (datasize32 > 32)
if (datasize32 >= 32)
{
const uint8x16_t v0 = vdupq_n_u8(0);
const uint8x16_t v1 = vdupq_n_u8(1);
Expand All @@ -25,7 +25,7 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::NEON>(const uint8_t *pdatai
uint8x16_t vdata_prev2 = vextq_u8(vBfr, vdata, 14);
uint8_t idx0n[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15};
uint8x16_t v015 = vld1q_u8(idx0n);
for ( ; i < datasize32 - 32; i += 32)
for ( ; i < datasize32; i += 32)
{
for (int c = 0; c < 32; c += 16)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,14 @@ size_t VulkanVideoDecoder::next_start_code<SIMD_ISA::SSSE3>(const uint8_t *pdata
{
size_t i = 0;
size_t datasize32 = (datasize >> 5) << 5;
if (datasize32 > 32)
if (datasize32 >= 32)
{
const __m128i v1 = _mm_set1_epi8(1);
__m128i vdata = _mm_loadu_si128((const __m128i*)pdatain);
__m128i vBfr = _mm_set1_epi16(((m_BitBfr << 8) & 0xFF00) | ((m_BitBfr >> 8) & 0xFF));
__m128i vdata_prev1 = _mm_alignr_epi8(vdata, vBfr, 15);
__m128i vdata_prev2 = _mm_alignr_epi8(vdata, vBfr, 14);
for ( ; i < datasize32 - 32; i += 32)
for ( ; i < datasize32; i += 32)
{
for (int c = 0; c < 32; c += 16)
{
Expand Down

0 comments on commit 4ae6a52

Please sign in to comment.