diff --git a/third-party/folly/src/folly/Portability.h b/third-party/folly/src/folly/Portability.h index e823f27f4d5ff..0bc1dca0ed6c2 100644 --- a/third-party/folly/src/folly/Portability.h +++ b/third-party/folly/src/folly/Portability.h @@ -377,6 +377,22 @@ constexpr auto kHasWeakSymbols = false; #endif #endif +#ifndef FOLLY_ARM_FEATURE_AES +#ifdef __ARM_FEATURE_AES +#define FOLLY_ARM_FEATURE_AES 1 +#else +#define FOLLY_ARM_FEATURE_AES 0 +#endif +#endif + +#ifndef FOLLY_ARM_FEATURE_SHA2 +#ifdef __ARM_FEATURE_SHA2 +#define FOLLY_ARM_FEATURE_SHA2 1 +#else +#define FOLLY_ARM_FEATURE_SHA2 0 +#endif +#endif + // RTTI may not be enabled for this compilation unit. #if defined(__GXX_RTTI) || defined(__cpp_rtti) || \ (defined(_MSC_VER) && defined(_CPPRTTI)) diff --git a/third-party/folly/src/folly/external/nvidia/hash/Checksum.cpp b/third-party/folly/src/folly/external/nvidia/hash/Checksum.cpp new file mode 100644 index 0000000000000..8bd326e5d546d --- /dev/null +++ b/third-party/folly/src/folly/external/nvidia/hash/Checksum.cpp @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined(__aarch64__) + +#include +#include + +#include + +#if FOLLY_ARM_FEATURE_CRC32 + +#include + +namespace folly::detail { + +uint32_t crc32_hw(const uint8_t* buf, size_t len, uint32_t crc) { + while (len >= 8) { + uint64_t val = 0; + std::memcpy(&val, buf, 8); + crc = __crc32d(crc, val); + len -= 8; + buf += 8; + } + + if (len % 8 >= 4) { + uint32_t val = 0; + std::memcpy(&val, buf, 4); + crc = __crc32w(crc, val); + len -= 4; + buf += 4; + } + + if (len % 4 >= 2) { + uint16_t val = 0; + std::memcpy(&val, buf, 2); + crc = __crc32h(crc, val); + len -= 2; + buf += 2; + } + + if (len % 2 >= 1) { + crc = __crc32b(crc, *buf); + } + + return crc; +} + +} // namespace folly::detail + +#endif // FOLLY_ARM_FEATURE_CRC32 + +#endif // __aarch64__ diff --git a/third-party/folly/src/folly/external/nvidia/hash/detail/Crc32cCombineDetail.h b/third-party/folly/src/folly/external/nvidia/hash/detail/Crc32cCombineDetail.h new file mode 100644 index 0000000000000..e7cb9835df924 --- /dev/null +++ b/third-party/folly/src/folly/external/nvidia/hash/detail/Crc32cCombineDetail.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#if FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && \ + FOLLY_ARM_FEATURE_SHA2 + +#include +#include + +namespace folly::detail { + +inline uint32_t gf_multiply_crc32c_hw(uint64_t crc1, uint64_t crc2, uint32_t) { + const uint64x2_t count = vsetq_lane_u64(0, vdupq_n_u64(1), 1); + + const poly128_t res0 = vmull_p64(crc2, crc1); + const uint64x2_t res1 = + vshlq_u64(vreinterpretq_u64_p128(res0), vreinterpretq_s64_u64(count)); + + // Use hardware crc32c to do reduction from 64 -> 32 bytes + const uint64_t res2 = vgetq_lane_u64(res1, 0); + const uint32_t res3 = __crc32cw(0, res2); + const uint32_t res4 = vgetq_lane_u32(vreinterpretq_u32_u64(res1), 1); + + return res3 ^ res4; +} + +inline uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) { + const uint64x2_t count = vsetq_lane_u64(0, vdupq_n_u64(1), 1); + + const poly128_t res0 = vmull_p64(crc2, crc1); + const uint64x2_t res1 = + vshlq_u64(vreinterpretq_u64_p128(res0), vreinterpretq_s64_u64(count)); + + // Use hardware crc32 to do reduction from 64 -> 32 bytes + const uint64_t res2 = vgetq_lane_u64(res1, 0); + const uint32_t res3 = __crc32w(0, res2); + const uint32_t res4 = vgetq_lane_u32(vreinterpretq_u32_u64(res1), 1); + + return res3 ^ res4; +} + +} // namespace folly + +#endif // FOLLY_ARM_FEATURE_CRC32 diff --git a/third-party/folly/src/folly/external/nvidia/hash/detail/Crc32cDetail.cpp b/third-party/folly/src/folly/external/nvidia/hash/detail/Crc32cDetail.cpp new file mode 100644 index 0000000000000..44835c573bc30 --- /dev/null +++ b/third-party/folly/src/folly/external/nvidia/hash/detail/Crc32cDetail.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#if defined(__aarch64__) + +#include + +#include + +#if FOLLY_ARM_FEATURE_CRC32 + +#include + +namespace folly::detail { + +uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) { + while (len >= 8) { + uint64_t val = 0; + std::memcpy(&val, buf, 8); + crc = __crc32cd(crc, val); + len -= 8; + buf += 8; + } + + if (len >= 4) { + uint32_t val = 0; + std::memcpy(&val, buf, 4); + crc = __crc32cw(crc, val); + len -= 4; + buf += 4; + } + + if (len >= 2) { + uint16_t val = 0; + std::memcpy(&val, buf, 2); + crc = __crc32ch(crc, val); + len -= 2; + buf += 2; + } + + if (len >= 1) { + crc = __crc32cb(crc, *buf); + } + + return crc; +} + +} // namespace folly::detail + +#endif // FOLLY_ARM_FEATURE_CRC32 + +#endif // __aarch64__ diff --git a/third-party/folly/src/folly/hash/Checksum.cpp b/third-party/folly/src/folly/hash/Checksum.cpp index dcb8ae8e58ba9..d2aaa1e6cafaa 100644 --- a/third-party/folly/src/folly/hash/Checksum.cpp +++ b/third-party/folly/src/folly/hash/Checksum.cpp @@ -71,6 +71,10 @@ uint32_t crc32_hw( } bool crc32c_hw_supported() { + return crc32c_hw_supported_sse42(); +} + +bool crc32c_hw_supported_sse42() { static folly::CpuId id; return id.sse42(); } @@ -86,7 +90,27 @@ bool crc32_hw_supported() { return id.sse42(); } -#else +#elif FOLLY_ARM_FEATURE_CRC32 + +// crc32_hw is defined in folly/external/nvidia/hash/Checksum.cpp + +bool crc32c_hw_supported() { + return true; +} + +bool crc32c_hw_supported_sse42() { + return false; +} + +bool crc32c_hw_supported_avx512() { + return false; +} + +bool crc32_hw_supported() { + return true; +} + +#else // FOLLY_ARM_FEATURE_CRC32 uint32_t crc32_hw( const uint8_t* /* data */, @@ -99,6 +123,10 @@ bool crc32c_hw_supported() { return false; } +bool crc32c_hw_supported_sse42() { + return false; +} + bool crc32c_hw_supported_avx512() { return false; } diff --git a/third-party/folly/src/folly/hash/detail/ChecksumDetail.h b/third-party/folly/src/folly/hash/detail/ChecksumDetail.h index 8bd1fb142d108..b0d525ad8d99a 100644 --- a/third-party/folly/src/folly/hash/detail/ChecksumDetail.h +++ b/third-party/folly/src/folly/hash/detail/ChecksumDetail.h @@ -46,6 +46,12 @@ uint32_t crc32c_hw( * Check whether a SSE4.2 hardware-accelerated CRC-32C implementation is * supported on the current CPU. */ +bool crc32c_hw_supported_sse42(); + +/** + * Check whether a hardware-accelerated CRC-32C implementation is + * supported on the current CPU. + */ bool crc32c_hw_supported(); /** diff --git a/third-party/folly/src/folly/hash/detail/Crc32CombineDetail.cpp b/third-party/folly/src/folly/hash/detail/Crc32CombineDetail.cpp index 806910a383edb..7d8959a2c6206 100644 --- a/third-party/folly/src/folly/hash/detail/Crc32CombineDetail.cpp +++ b/third-party/folly/src/folly/hash/detail/Crc32CombineDetail.cpp @@ -18,6 +18,7 @@ #include #include +#include #include namespace folly { @@ -105,6 +106,11 @@ static uint32_t gf_multiply_crc32_hw(uint64_t crc1, uint64_t crc2, uint32_t) { return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(res3, res1), 4)); } +#elif FOLLY_NEON && FOLLY_ARM_FEATURE_CRC32 && FOLLY_ARM_FEATURE_AES && \ + FOLLY_ARM_FEATURE_SHA2 + +// gf_multiply_crc32c_hw and fg_multiply_crc32_hw are defined in +// external/nvidia/hash/detail/Crc32cCombineDetail-inl.h #else static uint32_t gf_multiply_crc32c_hw(uint64_t, uint64_t, uint32_t) { @@ -114,7 +120,7 @@ static uint32_t gf_multiply_crc32_hw(uint64_t, uint64_t, uint32_t) { return 0; } -#endif +#endif // FOLLY_SSE_PREREQ(4, 2) static constexpr uint32_t crc32c_m = 0x82f63b78; static constexpr uint32_t crc32_m = 0xedb88320; diff --git a/third-party/folly/src/folly/hash/detail/Crc32cDetail.cpp b/third-party/folly/src/folly/hash/detail/Crc32cDetail.cpp index 63a6eb562dac0..36b474796de8c 100644 --- a/third-party/folly/src/folly/hash/detail/Crc32cDetail.cpp +++ b/third-party/folly/src/folly/hash/detail/Crc32cDetail.cpp @@ -286,6 +286,10 @@ uint32_t crc32c_hw(const uint8_t* buf, size_t len, uint32_t crc) { return (uint32_t)crc0; } +#elif FOLLY_ARM_FEATURE_CRC32 + +// crc32c_hw is defined in external/nvidia/hash/detail/Crc32cDetail.cpp + #else uint32_t crc32c_hw( @@ -293,7 +297,7 @@ uint32_t crc32c_hw( throw std::runtime_error("crc32_hw is not implemented on this platform"); } -#endif +#endif // !defined(FOLLY_ARM_FEATURE_CRC32) } // namespace detail } // namespace folly diff --git a/third-party/folly/src/folly/hash/test/ChecksumTest.cpp b/third-party/folly/src/folly/hash/test/ChecksumTest.cpp index 9929dbaea179a..4e7253dff6de6 100644 --- a/third-party/folly/src/folly/hash/test/ChecksumTest.cpp +++ b/third-party/folly/src/folly/hash/test/ChecksumTest.cpp @@ -147,7 +147,7 @@ TEST(Checksum, crc32cContinuationHardware) { } TEST(Checksum, crc32cHardwareSse42) { - if (folly::detail::crc32c_hw_supported()) { + if (folly::detail::crc32c_hw_supported_sse42()) { testCRC32C(folly::detail::sse_crc32c_v8s3x3); } else { LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests" @@ -156,7 +156,7 @@ TEST(Checksum, crc32cHardwareSse42) { } TEST(Checksum, crc32cHardwareEqSse42) { - if (folly::detail::crc32c_hw_supported()) { + if (folly::detail::crc32c_hw_supported_sse42()) { for (size_t i = 0; i < 1000; i++) { auto sw = folly::detail::crc32c_sw(buffer, i, 0); auto hw = folly::detail::sse_crc32c_v8s3x3(buffer, i, 0); @@ -169,7 +169,7 @@ TEST(Checksum, crc32cHardwareEqSse42) { } TEST(Checksum, crc32cContinuationHardwareSse42) { - if (folly::detail::crc32c_hw_supported()) { + if (folly::detail::crc32c_hw_supported_sse42()) { testCRC32CContinuation(folly::detail::sse_crc32c_v8s3x3); } else { LOG(WARNING) << "skipping SSE4.2 hardware-accelerated CRC-32C tests" @@ -220,7 +220,7 @@ TEST(Checksum, crc32clargeBuffers) { constexpr uint32_t kCrc = 2860399007; - if (folly::detail::crc32c_hw_supported()) { + if (folly::detail::crc32c_hw_supported_sse42()) { auto crcSse42 = folly::detail::sse_crc32c_v8s3x3(bufp, kLargeBufSz, ~0); ASSERT_EQ(kCrc, crcSse42); auto crcHw = folly::detail::crc32c_hw(bufp, kLargeBufSz, ~0); @@ -242,7 +242,7 @@ TEST(Checksum, crc32cContinuationAutodetect) { } TEST(Checksum, crc32) { - if (folly::detail::crc32c_hw_supported()) { + if (folly::detail::crc32_hw_supported()) { // Just check that sw and hw match for (auto expected : expectedResults) { uint32_t sw_res = @@ -258,7 +258,7 @@ TEST(Checksum, crc32) { } TEST(Checksum, crc32Continuation) { - if (folly::detail::crc32c_hw_supported()) { + if (folly::detail::crc32_hw_supported()) { // Just check that sw and hw match for (auto expected : expectedResults) { auto halflen = expected.length / 2;