-
Notifications
You must be signed in to change notification settings - Fork 3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Optimize crc32 & crc32c on NVIDIA Grace
Summary: This pull request adds hardware accelerated routines for CRC32 and CRC32C for Arm AARCH64 CPUs. The changes here have been tested on NVIDIA Grace. In detail, it contains routines for: - Computing CRC32 and CRC32C hashes on a dataset using the CRC intrinsics. On Grace/Neoverse V2, this can process 8 bytes/cycle. - A vectorized implementation of the `gf_multiply_crc32c_hw` and `gf_multiply_crc32_hw` functions used in routines to merge partial CRC checksums. These functions are more or less a 1:1 translation of the x86 vectorized routines. - I've introduced feature flags for the AES and SHA extensions for Arm CPUs. The feature checks for the vectorized functions are a bit messier than on x86 because CPUs can implement a subset of these extensions. This should resolve issue facebook/folly#2027. X-link: facebook/folly#2204 Reviewed By: yfeldblum Differential Revision: D57456858 Pulled By: r1mikey fbshipit-source-id: 8ff7be6c7b03bff8cf6df46a76a9a2b5ad8555ef
- Loading branch information
1 parent
0994214
commit 927c73c
Showing
9 changed files
with
260 additions
and
9 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
66 changes: 66 additions & 0 deletions
66
third-party/folly/src/folly/external/nvidia/hash/Checksum.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
/* | ||
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#if defined(__aarch64__) | ||
|
||
#include <cstring> | ||
#include <cstddef> | ||
|
||
#include <folly/Portability.h> | ||
|
||
#if FOLLY_ARM_FEATURE_CRC32 | ||
|
||
#include <arm_acle.h> | ||
|
||
namespace folly::detail { | ||
|
||
// Folds `len` bytes starting at `buf` into the running CRC value `crc` using
// the ACLE CRC32 intrinsics (__crc32d / __crc32w / __crc32h / __crc32b).
//
// The state is handed to the intrinsics and returned exactly as computed; this
// function applies no inversion of its own to `crc` or to the result.
//
//   buf  first input byte; multi-byte chunks are fetched with memcpy
//   len  number of bytes to consume
//   crc  CRC state to continue from
//
// Returns the CRC state after all `len` bytes have been consumed.
uint32_t crc32_hw(const uint8_t* buf, size_t len, uint32_t crc) {
  // Bulk phase: eight bytes per instruction.
  for (; len >= 8; len -= 8, buf += 8) {
    uint64_t chunk = 0;
    std::memcpy(&chunk, buf, sizeof(chunk));
    crc = __crc32d(crc, chunk);
  }

  // Tail phase: fewer than eight bytes are left, so the three low bits of
  // `len` say exactly which of the 4-, 2- and 1-byte steps are required.
  if (len & 4) {
    uint32_t chunk = 0;
    std::memcpy(&chunk, buf, sizeof(chunk));
    crc = __crc32w(crc, chunk);
    buf += sizeof(chunk);
  }

  if (len & 2) {
    uint16_t chunk = 0;
    std::memcpy(&chunk, buf, sizeof(chunk));
    crc = __crc32h(crc, chunk);
    buf += sizeof(chunk);
  }

  if (len & 1) {
    crc = __crc32b(crc, *buf);
  }

  return crc;
}
|
||
} // namespace folly::detail | ||
|
||
#endif // FOLLY_ARM_FEATURE_CRC32 | ||
|
||
#endif // __aarch64__ |
60 changes: 60 additions & 0 deletions
60
third-party/folly/src/folly/external/nvidia/hash/detail/Crc32cCombineDetail.h
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
/* | ||
* Copyright (c) Meta Platforms, Inc. and affiliates. | ||
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <folly/Portability.h> | ||
|
||
#if FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && \ | ||
FOLLY_ARM_FEATURE_SHA2 | ||
|
||
#include <arm_acle.h> | ||
#include <arm_neon.h> | ||
|
||
namespace folly::detail { | ||
|
||
// Polynomial-multiplies `crc1` by `crc2` and folds the product back down to a
// 32-bit value with the hardware CRC-32C instruction.
//
// The third parameter is unnamed and ignored by this implementation.
inline uint32_t gf_multiply_crc32c_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
  // 64x64 -> 128-bit polynomial product, viewed as two 64-bit lanes.
  const uint64x2_t product = vreinterpretq_u64_p128(vmull_p64(crc2, crc1));

  // Per-lane shift amounts {1, 0}: lane 0 moves left by one bit, lane 1 is
  // left as it is.
  const int64x2_t shift_by = vsetq_lane_s64(0, vdupq_n_s64(1), 1);
  const uint64x2_t shifted = vshlq_u64(product, shift_by);

  // Use hardware crc32c to do the reduction from 64 -> 32 bits: run the CRC
  // instruction over the low 32 bits of lane 0 and XOR in 32-bit lane 1.
  const uint32_t low_word = static_cast<uint32_t>(vgetq_lane_u64(shifted, 0));
  const uint32_t high_word = vgetq_lane_u32(vreinterpretq_u32_u64(shifted), 1);

  return __crc32cw(0, low_word) ^ high_word;
}
|
||
// Polynomial-multiplies `crc1` by `crc2` and folds the product back down to a
// 32-bit value with the hardware CRC-32 instruction.
//
// The third parameter is unnamed and ignored by this implementation.
inline uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) {
  // 64x64 -> 128-bit polynomial product, viewed as two 64-bit lanes.
  const uint64x2_t product = vreinterpretq_u64_p128(vmull_p64(crc2, crc1));

  // Per-lane shift amounts {1, 0}: lane 0 moves left by one bit, lane 1 is
  // left as it is.
  const int64x2_t shift_by = vsetq_lane_s64(0, vdupq_n_s64(1), 1);
  const uint64x2_t shifted = vshlq_u64(product, shift_by);

  // Use hardware crc32 to do the reduction from 64 -> 32 bits: run the CRC
  // instruction over the low 32 bits of lane 0 and XOR in 32-bit lane 1.
  const uint32_t low_word = static_cast<uint32_t>(vgetq_lane_u64(shifted, 0));
  const uint32_t high_word = vgetq_lane_u32(vreinterpretq_u32_u64(shifted), 1);

  return __crc32w(0, low_word) ^ high_word;
}
|
||
} // namespace folly::detail | ||
|
||
#endif // FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && FOLLY_ARM_FEATURE_SHA2 |
65 changes: 65 additions & 0 deletions
65
third-party/folly/src/folly/external/nvidia/hash/detail/Crc32cDetail.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
/* | ||
* Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#if defined(__aarch64__) | ||
|
||
#include <cstring> | ||
|
||
#include <folly/Portability.h> | ||
|
||
#if FOLLY_ARM_FEATURE_CRC32 | ||
|
||
#include <arm_acle.h> | ||
|
||
namespace folly::detail { | ||
|
||
// Folds `len` bytes starting at `buf` into the running CRC value `crc` using
// the ACLE CRC32C intrinsics (__crc32cd / __crc32cw / __crc32ch / __crc32cb).
//
// The state is handed to the intrinsics and returned exactly as computed; this
// function applies no inversion of its own to `crc` or to the result.
//
//   buf  first input byte; multi-byte chunks are fetched with memcpy
//   len  number of bytes to consume
//   crc  CRC state to continue from
//
// Returns the CRC state after all `len` bytes have been consumed.
uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) {
  // Bulk phase: eight bytes per instruction.
  for (; len >= 8; len -= 8, buf += 8) {
    uint64_t chunk = 0;
    std::memcpy(&chunk, buf, sizeof(chunk));
    crc = __crc32cd(crc, chunk);
  }

  // Tail phase: fewer than eight bytes are left, so the three low bits of
  // `len` say exactly which of the 4-, 2- and 1-byte steps are required.
  if (len & 4) {
    uint32_t chunk = 0;
    std::memcpy(&chunk, buf, sizeof(chunk));
    crc = __crc32cw(crc, chunk);
    buf += sizeof(chunk);
  }

  if (len & 2) {
    uint16_t chunk = 0;
    std::memcpy(&chunk, buf, sizeof(chunk));
    crc = __crc32ch(crc, chunk);
    buf += sizeof(chunk);
  }

  if (len & 1) {
    crc = __crc32cb(crc, *buf);
  }

  return crc;
}
|
||
} // namespace folly::detail | ||
|
||
#endif // FOLLY_ARM_FEATURE_CRC32 | ||
|
||
#endif // __aarch64__ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters