From 4ae6a526dc1bf8ef51d19990001cbed0a60c95d9 Mon Sep 17 00:00:00 2001 From: Tymur Boiko Date: Tue, 3 Dec 2024 00:20:52 +0200 Subject: [PATCH] eliminated unneeded offset --- vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX2.cpp | 4 ++-- .../libs/NvVideoParser/src/NextStartCodeAVX512.cpp | 4 ++-- vk_video_decoder/libs/NvVideoParser/src/NextStartCodeNEON.cpp | 4 ++-- .../libs/NvVideoParser/src/NextStartCodeSSSE3.cpp | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX2.cpp b/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX2.cpp index d1e1745..3cbbc18 100644 --- a/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX2.cpp +++ b/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX2.cpp @@ -15,7 +15,7 @@ size_t VulkanVideoDecoder::next_start_code(const uint8_t *pdatai { size_t i = 0; size_t datasize64 = (datasize >> 6) << 6; - if (datasize64 > 64) + if (datasize64 >= 64) { const __m256i v1 = _mm256_set1_epi8(1); __m256i vdata = _mm256_loadu_si256((const __m256i*)pdatain); @@ -23,7 +23,7 @@ size_t VulkanVideoDecoder::next_start_code(const uint8_t *pdatai __m256i vdata_alignr16b_init = _mm256_permute2f128_si256(vBfr, vdata, 1 | (2<<4)); __m256i vdata_prev1 = _mm256_alignr_epi8(vdata, vdata_alignr16b_init, 15); __m256i vdata_prev2 = _mm256_alignr_epi8(vdata, vdata_alignr16b_init, 14); - for ( ; i < datasize64 - 64; i += 64) + for ( ; i < datasize64; i += 64) { for (int c = 0; c < 64; c += 32) // this might force compiler to unroll the loop so we might have 2 loads in parallel { diff --git a/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX512.cpp b/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX512.cpp index 6052d51..0e15600 100644 --- a/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX512.cpp +++ b/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeAVX512.cpp @@ -15,7 +15,7 @@ size_t VulkanVideoDecoder::next_start_code(const uint8_t *pdat { size_t i = 0; size_t datasize128 = (datasize >> 7) << 7; - if (datasize128 > 128) + if (datasize128 >= 128) { const __m512i v1 = _mm512_set1_epi8(1); const __m512i v254 = _mm512_set1_epi8(-2); @@ -24,7 +24,7 @@ size_t VulkanVideoDecoder::next_start_code(const uint8_t *pdat __m512i vdata_alignr48b_init = _mm512_alignr_epi32(vdata, vBfr, 12); __m512i vdata_prev1 = _mm512_alignr_epi8(vdata, vdata_alignr48b_init, 15); __m512i vdata_prev2 = _mm512_alignr_epi8(vdata, vdata_alignr48b_init, 14); - for ( ; i < datasize128 - 128; i += 128) + for ( ; i < datasize128; i += 128) { for (int c = 0; c < 128; c += 64) // this might force compiler to unroll the loop so we might have 2 loads in parallel { diff --git a/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeNEON.cpp b/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeNEON.cpp index 8c6bc4f..df5fd7f 100644 --- a/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeNEON.cpp +++ b/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeNEON.cpp @@ -15,7 +15,7 @@ size_t VulkanVideoDecoder::next_start_code(const uint8_t *pdatai { size_t i = 0; size_t datasize32 = (datasize >> 5) << 5; - if (datasize32 > 32) + if (datasize32 >= 32) { const uint8x16_t v0 = vdupq_n_u8(0); const uint8x16_t v1 = vdupq_n_u8(1); @@ -25,7 +25,7 @@ size_t VulkanVideoDecoder::next_start_code(const uint8_t *pdatai uint8x16_t vdata_prev2 = vextq_u8(vBfr, vdata, 14); uint8_t idx0n[16] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; uint8x16_t v015 = vld1q_u8(idx0n); - for ( ; i < datasize32 - 32; i += 32) + for ( ; i < datasize32; i += 32) { for (int c = 0; c < 32; c += 16) { diff --git a/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeSSSE3.cpp b/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeSSSE3.cpp index f210444..d39f704 100644 --- a/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeSSSE3.cpp +++ b/vk_video_decoder/libs/NvVideoParser/src/NextStartCodeSSSE3.cpp @@ -15,14 +15,14 @@ size_t VulkanVideoDecoder::next_start_code(const uint8_t *pdata { size_t i = 0; size_t datasize32 = (datasize >> 5) << 5; - if (datasize32 > 32) + if (datasize32 >= 32) { const __m128i v1 = _mm_set1_epi8(1); __m128i vdata = _mm_loadu_si128((const __m128i*)pdatain); __m128i vBfr = _mm_set1_epi16(((m_BitBfr << 8) & 0xFF00) | ((m_BitBfr >> 8) & 0xFF)); __m128i vdata_prev1 = _mm_alignr_epi8(vdata, vBfr, 15); __m128i vdata_prev2 = _mm_alignr_epi8(vdata, vBfr, 14); - for ( ; i < datasize32 - 32; i += 32) + for ( ; i < datasize32; i += 32) { for (int c = 0; c < 32; c += 16) {