Skip to content

Commit

Permalink
Optimize crc32 & crc32c on NVIDIA Grace
Browse files Browse the repository at this point in the history
Summary:
This pull request adds hardware accelerated routines for CRC32 and CRC32C for Arm AARCH64 CPUs. The changes here have been tested on NVIDIA Grace.
In detail, it contains routines for:

- Computing CRC32 and CRC32C hashes on dataset using the CRC intrinsics. On Grace/Neoverse V2, this can process 8 bytes/cycle.
- A vectorized implementation of the `gf_multiply_crc32c_hw` and `gf_multiply_crc32_hw` functions used in routines to merge partial CRC checksums. These functions are more or less a 1:1 translation of the x86 vectorized routines.
- I've introduced feature flags for AES, and SHA extensions for Arm CPUs. The feature checks for the vectorized functions are a bit more messy than on x86 because CPUs can implement a subset of these extensions.

This should resolve issue facebook/folly#2027.

X-link: facebook/folly#2204

Reviewed By: yfeldblum

Differential Revision: D57456858

Pulled By: r1mikey

fbshipit-source-id: 8ff7be6c7b03bff8cf6df46a76a9a2b5ad8555ef
  • Loading branch information
krenzland authored and facebook-github-bot committed Sep 26, 2024
1 parent 0994214 commit 927c73c
Show file tree
Hide file tree
Showing 9 changed files with 260 additions and 9 deletions.
16 changes: 16 additions & 0 deletions third-party/folly/src/folly/Portability.h
Original file line number Diff line number Diff line change
Expand Up @@ -377,6 +377,22 @@ constexpr auto kHasWeakSymbols = false;
#endif
#endif

#ifndef FOLLY_ARM_FEATURE_AES
#ifdef __ARM_FEATURE_AES
#define FOLLY_ARM_FEATURE_AES 1
#else
#define FOLLY_ARM_FEATURE_AES 0
#endif
#endif

#ifndef FOLLY_ARM_FEATURE_SHA2
#ifdef __ARM_FEATURE_SHA2
#define FOLLY_ARM_FEATURE_SHA2 1
#else
#define FOLLY_ARM_FEATURE_SHA2 0
#endif
#endif

// RTTI may not be enabled for this compilation unit.
#if defined(__GXX_RTTI) || defined(__cpp_rtti) || \
(defined(_MSC_VER) && defined(_CPPRTTI))
Expand Down
66 changes: 66 additions & 0 deletions third-party/folly/src/folly/external/nvidia/hash/Checksum.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#if defined(__aarch64__)

#include <cstring>
#include <cstddef>

#include <folly/Portability.h>

#if FOLLY_ARM_FEATURE_CRC32

#include <arm_acle.h>

namespace folly::detail {

uint32_t crc32_hw(const uint8_t* buf, size_t len, uint32_t crc) {
while (len >= 8) {
uint64_t val = 0;
std::memcpy(&val, buf, 8);
crc = __crc32d(crc, val);
len -= 8;
buf += 8;
}

if (len % 8 >= 4) {
uint32_t val = 0;
std::memcpy(&val, buf, 4);
crc = __crc32w(crc, val);
len -= 4;
buf += 4;
}

if (len % 4 >= 2) {
uint16_t val = 0;
std::memcpy(&val, buf, 2);
crc = __crc32h(crc, val);
len -= 2;
buf += 2;
}

if (len % 2 >= 1) {
crc = __crc32b(crc, *buf);
}

return crc;
}

} // namespace folly::detail

#endif // FOLLY_ARM_FEATURE_CRC32

#endif // __aarch64__
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
/*
* Copyright (c) Meta Platforms, Inc. and affiliates.
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <folly/Portability.h>

#if FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && \
FOLLY_ARM_FEATURE_SHA2

#include <arm_acle.h>
#include <arm_neon.h>

namespace folly::detail {

inline uint32_t gf_multiply_crc32c_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
const uint64x2_t count = vsetq_lane_u64(0, vdupq_n_u64(1), 1);

const poly128_t res0 = vmull_p64(crc2, crc1);
const uint64x2_t res1 =
vshlq_u64(vreinterpretq_u64_p128(res0), vreinterpretq_s64_u64(count));

// Use hardware crc32c to do reduction from 64 -> 32 bytes
const uint64_t res2 = vgetq_lane_u64(res1, 0);
const uint32_t res3 = __crc32cw(0, res2);
const uint32_t res4 = vgetq_lane_u32(vreinterpretq_u32_u64(res1), 1);

return res3 ^ res4;
}

inline uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
const uint64x2_t count = vsetq_lane_u64(0, vdupq_n_u64(1), 1);

const poly128_t res0 = vmull_p64(crc2, crc1);
const uint64x2_t res1 =
vshlq_u64(vreinterpretq_u64_p128(res0), vreinterpretq_s64_u64(count));

// Use hardware crc32 to do reduction from 64 -> 32 bytes
const uint64_t res2 = vgetq_lane_u64(res1, 0);
const uint32_t res3 = __crc32w(0, res2);
const uint32_t res4 = vgetq_lane_u32(vreinterpretq_u32_u64(res1), 1);

return res3 ^ res4;
}

} // namespace folly

#endif // FOLLY_ARM_FEATURE_CRC32
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
/*
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#if defined(__aarch64__)

#include <cstring>

#include <folly/Portability.h>

#if FOLLY_ARM_FEATURE_CRC32

#include <arm_acle.h>

namespace folly::detail {

uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
while (len >= 8) {
uint64_t val = 0;
std::memcpy(&val, buf, 8);
crc = __crc32cd(crc, val);
len -= 8;
buf += 8;
}

if (len >= 4) {
uint32_t val = 0;
std::memcpy(&val, buf, 4);
crc = __crc32cw(crc, val);
len -= 4;
buf += 4;
}

if (len >= 2) {
uint16_t val = 0;
std::memcpy(&val, buf, 2);
crc = __crc32ch(crc, val);
len -= 2;
buf += 2;
}

if (len >= 1) {
crc = __crc32cb(crc, *buf);
}

return crc;
}

} // namespace folly::detail

#endif // FOLLY_ARM_FEATURE_CRC32

#endif // __aarch64__
30 changes: 29 additions & 1 deletion third-party/folly/src/folly/hash/Checksum.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,10 @@ uint32_t crc32_hw(
}

bool crc32c_hw_supported() {
return crc32c_hw_supported_sse42();
}

bool crc32c_hw_supported_sse42() {
static folly::CpuId id;
return id.sse42();
}
Expand All @@ -86,7 +90,27 @@ bool crc32_hw_supported() {
return id.sse42();
}

#else
#elif FOLLY_ARM_FEATURE_CRC32

// crc32_hw is defined in folly/external/nvidia/hash/Checksum.cpp

bool crc32c_hw_supported() {
return true;
}

bool crc32c_hw_supported_sse42() {
return false;
}

bool crc32c_hw_supported_avx512() {
return false;
}

bool crc32_hw_supported() {
return true;
}

#else // FOLLY_ARM_FEATURE_CRC32

uint32_t crc32_hw(
const uint8_t* /* data */,
Expand All @@ -99,6 +123,10 @@ bool crc32c_hw_supported() {
return false;
}

bool crc32c_hw_supported_sse42() {
return false;
}

bool crc32c_hw_supported_avx512() {
return false;
}
Expand Down
6 changes: 6 additions & 0 deletions third-party/folly/src/folly/hash/detail/ChecksumDetail.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,12 @@ uint32_t crc32c_hw(
* Check whether a SSE4.2 hardware-accelerated CRC-32C implementation is
* supported on the current CPU.
*/
bool crc32c_hw_supported_sse42();

/**
* Check whether a hardware-accelerated CRC-32C implementation is
* supported on the current CPU.
*/
bool crc32c_hw_supported();

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include <utility>

#include <folly/Bits.h>
#include <folly/external/nvidia/hash/detail/Crc32cCombineDetail.h>
#include <folly/hash/detail/ChecksumDetail.h>

namespace folly {
Expand Down Expand Up @@ -105,6 +106,11 @@ static uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(res3, res1), 4));
}

#elif FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && \
FOLLY_ARM_FEATURE_SHA2

// gf_multiply_crc32c_hw and fg_multiply_crc32_hw are defined in
// external/nvidia/hash/detail/Crc32cCombineDetail-inl.h
#else

static uint32_t gf_multiply_crc32c_hw(uint64_t, uint64_t, uint32_t) {
Expand All @@ -114,7 +120,7 @@ static uint32_t gf_multiply_crc32_hw(uint64_t, uint64_t, uint32_t) {
return 0;
}

#endif
#endif // FOLLY_SSE_PREREQ(4, 2)

static constexpr uint32_t crc32c_m = 0x82f63b78;
static constexpr uint32_t crc32_m = 0xedb88320;
Expand Down
6 changes: 5 additions & 1 deletion third-party/folly/src/folly/hash/detail/Crc32cDetail.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -286,14 +286,18 @@ uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
return (uint32_t)crc0;
}

#elif FOLLY_ARM_FEATURE_CRC32

// crc32c_hw is defined in external/nvidia/hash/detail/Crc32cDetail.cpp

#else

uint32_t crc32c_hw(
const uint8_t* /* buf */, size_t /* len */, uint32_t /* crc */) {
throw std::runtime_error("crc32_hw is not implemented on this platform");
}

#endif
#endif // !defined(FOLLY_ARM_FEATURE_CRC32)

} // namespace detail
} // namespace folly
12 changes: 6 additions & 6 deletions third-party/folly/src/folly/hash/test/ChecksumTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ TEST(Checksum, crc32cContinuationHardware) {
}

TEST(Checksum, crc32cHardwareSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
testCRC32C(folly::detail::sse_crc32c_v8s3x3);
} else {
LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
Expand All @@ -156,7 +156,7 @@ TEST(Checksum, crc32cHardwareSse42) {
}

TEST(Checksum, crc32cHardwareEqSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
for (size_t i = 0; i < 1000; i++) {
auto sw = folly::detail::crc32c_sw(buffer, i, 0);
auto hw = folly::detail::sse_crc32c_v8s3x3(buffer, i, 0);
Expand All @@ -169,7 +169,7 @@ TEST(Checksum, crc32cHardwareEqSse42) {
}

TEST(Checksum, crc32cContinuationHardwareSse42) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
testCRC32CContinuation(folly::detail::sse_crc32c_v8s3x3);
} else {
LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests"
Expand Down Expand Up @@ -220,7 +220,7 @@ TEST(Checksum, crc32clargeBuffers) {

constexpr uint32_t kCrc = 2860399007;

if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32c_hw_supported_sse42()) {
auto crcSse42 = folly::detail::sse_crc32c_v8s3x3(bufp, kLargeBufSz, ~0);
ASSERT_EQ(kCrc, crcSse42);
auto crcHw = folly::detail::crc32c_hw(bufp, kLargeBufSz, ~0);
Expand All @@ -242,7 +242,7 @@ TEST(Checksum, crc32cContinuationAutodetect) {
}

TEST(Checksum, crc32) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32_hw_supported()) {
// Just check that sw and hw match
for (auto expected : expectedResults) {
uint32_t sw_res =
Expand All @@ -258,7 +258,7 @@ TEST(Checksum, crc32) {
}

TEST(Checksum, crc32Continuation) {
if (folly::detail::crc32c_hw_supported()) {
if (folly::detail::crc32_hw_supported()) {
// Just check that sw and hw match
for (auto expected : expectedResults) {
auto halflen = expected.length / 2;
Expand Down

0 comments on commit 927c73c

Please sign in to comment.