diff --git a/fuel-vm/src/interpreter/memory.rs b/fuel-vm/src/interpreter/memory.rs
index f4d830c23..5e96fe59d 100644
--- a/fuel-vm/src/interpreter/memory.rs
+++ b/fuel-vm/src/interpreter/memory.rs
@@ -1112,16 +1112,7 @@ fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
         return slices_equal_fallback(a, b);
     }
 
-    let ptr_a = a.as_ptr() as usize;
-    let offset = ptr_a % CHUNK;
-    let aligned_len = a.len() - offset;
-
-    if offset > 0 {
-        // check unaligned bytes
-        if &a[aligned_len..] != &b[aligned_len..] {
-            return false;
-        }
-    }
+    let len = a.len();
 
     let mut i = 0;
 
@@ -1133,39 +1124,39 @@ fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
     let mut aggregate_mask_a_b = -1i32;
     let mut aggregate_mask_c_d = -1i32;
 
-    while i + CHUNK <= aligned_len {
-        let simd_a1 = _mm256_load_si256(a.as_ptr().add(i) as *const _);
-        let simd_b1 = _mm256_load_si256(b.as_ptr().add(i) as *const _);
+    while i + CHUNK <= len {
+        let simd_a1 = _mm256_loadu_si256(a.as_ptr().add(i) as *const _);
+        let simd_b1 = _mm256_loadu_si256(b.as_ptr().add(i) as *const _);
 
-        let simd_a2 = _mm256_load_si256(a.as_ptr().add(i + 32) as *const _);
-        let simd_b2 = _mm256_load_si256(b.as_ptr().add(i + 32) as *const _);
+        let simd_a2 = _mm256_loadu_si256(a.as_ptr().add(i + 32) as *const _);
+        let simd_b2 = _mm256_loadu_si256(b.as_ptr().add(i + 32) as *const _);
 
-        let simd_a3 = _mm256_load_si256(a.as_ptr().add(i + 64) as *const _);
-        let simd_b3 = _mm256_load_si256(b.as_ptr().add(i + 64) as *const _);
+        let simd_a3 = _mm256_loadu_si256(a.as_ptr().add(i + 64) as *const _);
+        let simd_b3 = _mm256_loadu_si256(b.as_ptr().add(i + 64) as *const _);
 
-        let simd_a4 = _mm256_load_si256(a.as_ptr().add(i + 96) as *const _);
-        let simd_b4 = _mm256_load_si256(b.as_ptr().add(i + 96) as *const _);
+        let simd_a4 = _mm256_loadu_si256(a.as_ptr().add(i + 96) as *const _);
+        let simd_b4 = _mm256_loadu_si256(b.as_ptr().add(i + 96) as *const _);
 
-        let simd_a5 = _mm256_load_si256(a.as_ptr().add(i + 128) as *const _);
-        let simd_b5 = _mm256_load_si256(b.as_ptr().add(i + 128) as *const _);
+        let simd_a5 = _mm256_loadu_si256(a.as_ptr().add(i + 128) as *const _);
+        let simd_b5 = _mm256_loadu_si256(b.as_ptr().add(i + 128) as *const _);
 
-        let simd_a6 = _mm256_load_si256(a.as_ptr().add(i + 160) as *const _);
-        let simd_b6 = _mm256_load_si256(b.as_ptr().add(i + 160) as *const _);
+        let simd_a6 = _mm256_loadu_si256(a.as_ptr().add(i + 160) as *const _);
+        let simd_b6 = _mm256_loadu_si256(b.as_ptr().add(i + 160) as *const _);
 
-        let simd_a7 = _mm256_load_si256(a.as_ptr().add(i + 192) as *const _);
-        let simd_b7 = _mm256_load_si256(b.as_ptr().add(i + 192) as *const _);
+        let simd_a7 = _mm256_loadu_si256(a.as_ptr().add(i + 192) as *const _);
+        let simd_b7 = _mm256_loadu_si256(b.as_ptr().add(i + 192) as *const _);
 
-        let simd_a8 = _mm256_load_si256(a.as_ptr().add(i + 224) as *const _);
-        let simd_b8 = _mm256_load_si256(b.as_ptr().add(i + 224) as *const _);
+        let simd_a8 = _mm256_loadu_si256(a.as_ptr().add(i + 224) as *const _);
+        let simd_b8 = _mm256_loadu_si256(b.as_ptr().add(i + 224) as *const _);
 
-        let cmp1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a1, simd_b1));
-        let cmp2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a2, simd_b2));
-        let cmp3 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a3, simd_b3));
-        let cmp4 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a4, simd_b4));
-        let cmp5 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a5, simd_b5));
-        let cmp6 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a6, simd_b6));
-        let cmp7 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a7, simd_b7));
-        let cmp8 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a8, simd_b8));
+        let cmp1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a1, simd_b1));
+        let cmp2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a2, simd_b2));
+        let cmp3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a3, simd_b3));
+        let cmp4 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a4, simd_b4));
+        let cmp5 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a5, simd_b5));
+        let cmp6 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a6, simd_b6));
+        let cmp7 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a7, simd_b7));
+        let cmp8 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a8, simd_b8));
 
         aggregate_mask_a &= cmp1 & cmp2;
         aggregate_mask_b &= cmp3 & cmp4;