Skip to content

Commit

Permalink
test: use epi32 instead of epi8?
Browse files (browse the repository at this point in the history)
  • Loading branch information
rymnc committed Jan 1, 2025
1 parent 042aca3 commit 123aab9
Showing 1 changed file with 26 additions and 35 deletions.
61 changes: 26 additions & 35 deletions fuel-vm/src/interpreter/memory.rs
Original file line number | Diff line number | Diff line change
Expand Up @@ -1112,16 +1112,7 @@ fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
return slices_equal_fallback(a, b);
}

let ptr_a = a.as_ptr() as usize;
let offset = ptr_a % CHUNK;
let aligned_len = a.len() - offset;

if offset > 0 {
// check unaligned bytes
if &a[aligned_len..] != &b[aligned_len..] {
return false;
}
}
let len = a.len();

let mut i = 0;

Expand All @@ -1133,39 +1124,39 @@ fn slices_equal_avx2(a: &[u8], b: &[u8]) -> bool {
let mut aggregate_mask_a_b = -1i32;
let mut aggregate_mask_c_d = -1i32;

while i + CHUNK <= aligned_len {
let simd_a1 = _mm256_load_si256(a.as_ptr().add(i) as *const _);
let simd_b1 = _mm256_load_si256(b.as_ptr().add(i) as *const _);
while i + CHUNK <= len {
let simd_a1 = _mm256_loadu_si256(a.as_ptr().add(i) as *const _);
let simd_b1 = _mm256_loadu_si256(b.as_ptr().add(i) as *const _);

let simd_a2 = _mm256_load_si256(a.as_ptr().add(i + 32) as *const _);
let simd_b2 = _mm256_load_si256(b.as_ptr().add(i + 32) as *const _);
let simd_a2 = _mm256_loadu_si256(a.as_ptr().add(i + 32) as *const _);
let simd_b2 = _mm256_loadu_si256(b.as_ptr().add(i + 32) as *const _);

let simd_a3 = _mm256_load_si256(a.as_ptr().add(i + 64) as *const _);
let simd_b3 = _mm256_load_si256(b.as_ptr().add(i + 64) as *const _);
let simd_a3 = _mm256_loadu_si256(a.as_ptr().add(i + 64) as *const _);
let simd_b3 = _mm256_loadu_si256(b.as_ptr().add(i + 64) as *const _);

let simd_a4 = _mm256_load_si256(a.as_ptr().add(i + 96) as *const _);
let simd_b4 = _mm256_load_si256(b.as_ptr().add(i + 96) as *const _);
let simd_a4 = _mm256_loadu_si256(a.as_ptr().add(i + 96) as *const _);
let simd_b4 = _mm256_loadu_si256(b.as_ptr().add(i + 96) as *const _);

let simd_a5 = _mm256_load_si256(a.as_ptr().add(i + 128) as *const _);
let simd_b5 = _mm256_load_si256(b.as_ptr().add(i + 128) as *const _);
let simd_a5 = _mm256_loadu_si256(a.as_ptr().add(i + 128) as *const _);
let simd_b5 = _mm256_loadu_si256(b.as_ptr().add(i + 128) as *const _);

let simd_a6 = _mm256_load_si256(a.as_ptr().add(i + 160) as *const _);
let simd_b6 = _mm256_load_si256(b.as_ptr().add(i + 160) as *const _);
let simd_a6 = _mm256_loadu_si256(a.as_ptr().add(i + 160) as *const _);
let simd_b6 = _mm256_loadu_si256(b.as_ptr().add(i + 160) as *const _);

let simd_a7 = _mm256_load_si256(a.as_ptr().add(i + 192) as *const _);
let simd_b7 = _mm256_load_si256(b.as_ptr().add(i + 192) as *const _);
let simd_a7 = _mm256_loadu_si256(a.as_ptr().add(i + 192) as *const _);
let simd_b7 = _mm256_loadu_si256(b.as_ptr().add(i + 192) as *const _);

let simd_a8 = _mm256_load_si256(a.as_ptr().add(i + 224) as *const _);
let simd_b8 = _mm256_load_si256(b.as_ptr().add(i + 224) as *const _);
let simd_a8 = _mm256_loadu_si256(a.as_ptr().add(i + 224) as *const _);
let simd_b8 = _mm256_loadu_si256(b.as_ptr().add(i + 224) as *const _);

let cmp1 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a1, simd_b1));
let cmp2 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a2, simd_b2));
let cmp3 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a3, simd_b3));
let cmp4 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a4, simd_b4));
let cmp5 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a5, simd_b5));
let cmp6 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a6, simd_b6));
let cmp7 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a7, simd_b7));
let cmp8 = _mm256_movemask_epi8(_mm256_cmpeq_epi8(simd_a8, simd_b8));
let cmp1 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a1, simd_b1));
let cmp2 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a2, simd_b2));
let cmp3 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a3, simd_b3));
let cmp4 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a4, simd_b4));
let cmp5 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a5, simd_b5));
let cmp6 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a6, simd_b6));
let cmp7 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a7, simd_b7));
let cmp8 = _mm256_movemask_epi8(_mm256_cmpeq_epi32(simd_a8, simd_b8));

aggregate_mask_a &= cmp1 & cmp2;
aggregate_mask_b &= cmp3 & cmp4;
Expand Down

0 comments on commit 123aab9

Please sign in to comment.