diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml new file mode 100644 index 00000000..6b83dc1f --- /dev/null +++ b/.github/FUNDING.yml @@ -0,0 +1,3 @@ +# These are supported funding model platforms + +github: kazcw diff --git a/.github/workflows/check-clippy.yaml b/.github/workflows/check-clippy.yaml new file mode 100644 index 00000000..bdc77a89 --- /dev/null +++ b/.github/workflows/check-clippy.yaml @@ -0,0 +1,18 @@ +name: Clippy + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +env: + CARGO_TERM_COLOR: always + +jobs: + clippy: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: cargo-clippy + run: cargo clippy diff --git a/.github/workflows/check-rustfmt.yaml b/.github/workflows/check-rustfmt.yaml new file mode 100644 index 00000000..fb582971 --- /dev/null +++ b/.github/workflows/check-rustfmt.yaml @@ -0,0 +1,15 @@ +name: Rustfmt + +on: + push: + branches: [ "master" ] + pull_request: + branches: [ "master" ] + +jobs: + format: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: cargo-fmt + run: cargo fmt -- --check diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml new file mode 100644 index 00000000..3cb563a0 --- /dev/null +++ b/.github/workflows/tests.yaml @@ -0,0 +1,168 @@ +name: Tests + +on: + push: + branches: [ master ] + pull_request: + branches: [ master ] + +permissions: + contents: read + +jobs: + #check-doc: + # name: Check doc + # runs-on: ubuntu-latest + # env: + # RUSTDOCFLAGS: "-Dwarnings --cfg docsrs -Zunstable-options --generate-link-to-definition" + # steps: + # - uses: actions/checkout@v4 + # - name: Install toolchain + # uses: dtolnay/rust-toolchain@master + # with: + # toolchain: nightly + # - name: Workspace docs + # run: cargo doc --all-features --no-deps + + test: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + target: x86_64-unknown-linux-gnu + toolchain: stable + - os: macos-latest + target: x86_64-apple-darwin + toolchain: stable + # TODO: also aarch64 / M1 + - os: windows-latest + target: x86_64-pc-windows-gnu + toolchain: stable + - os: windows-latest + target: x86_64-pc-windows-msvc + toolchain: beta + # Test both windows-gnu and windows-msvc; use beta rust on one + - os: ubuntu-latest + target: x86_64-unknown-linux-gnu + variant: MSRV + toolchain: 1.61.0 + # FIXME: some failures down the dependency tree + #- os: ubuntu-latest + # target: x86_64-unknown-linux-gnu + # toolchain: nightly + # variant: minimal_versions + + steps: + - uses: actions/checkout@v4 + - name: MSRV + if: ${{ matrix.variant == 'MSRV' }} + run: cp Cargo.lock.msrv Cargo.lock + - name: Install toolchain + uses: dtolnay/rust-toolchain@master + with: + target: ${{ matrix.target }} + toolchain: ${{ matrix.toolchain }} + - run: ${{ matrix.deps }} + - name: Maybe minimal versions + if: ${{ matrix.variant == 'minimal_versions' }} + run: | + cargo generate-lockfile -Z minimal-versions + - name: Test + run: | + cargo test --target ${{ matrix.target }} + + test-cross: + runs-on: ${{ matrix.os }} + strategy: + fail-fast: false + matrix: + include: + - os: ubuntu-latest + target: powerpc-unknown-linux-gnu + toolchain: stable + + steps: + - uses: actions/checkout@v4 + - name: Install toolchain + uses: dtolnay/rust-toolchain@master + with: + target: ${{ matrix.target }} + toolchain: ${{ matrix.toolchain }} + - name: Cache cargo plugins + uses: actions/cache@v4 + with: + path: ~/.cargo/bin/ + key: ${{ runner.os }}-cargo-plugins + - name: Install cross + run: cargo install cross || true + - name: Test + run: | + cross test --no-fail-fast --target ${{ matrix.target }} -p c2-chacha + cross test --no-fail-fast --target ${{ matrix.target }} -p ppv-lite86 + cross test --no-fail-fast --target ${{ matrix.target }} -p ppv-null + cross test --no-fail-fast --target ${{ matrix.target }} -p crypto-simd + cross test --no-fail-fast --target ${{ matrix.target }} -p threefish-cipher + cross test --no-fail-fast --target ${{ matrix.target }} -p blake-hash + cross test --no-fail-fast --target ${{ matrix.target }} -p skein-hash + # Failing on PPC + # cross test --no-fail-fast --target ${{ matrix.target }} -p jh-x86_64 + # groestl-aesni: not cross-tested as it only supports specific hardware. + + test-miri: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install toolchain + run: | + rustup toolchain install nightly --component miri + rustup override set nightly + cargo miri setup + - name: Test + run: | + cargo miri test -p c2-chacha + cargo miri test -p ppv-lite86 + cargo miri test -p ppv-null + cargo miri test -p crypto-simd + cargo miri test -p threefish-cipher + cargo miri test -p blake-hash + cargo miri test -p skein-hash + # groestl-aesni: not tested as it only supports specific hardware. + # jh-x86_64: should work under miri but runs too slowly. + + test-no-std: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Install toolchain + uses: dtolnay/rust-toolchain@nightly + with: + target: thumbv6m-none-eabi + - name: Chacha, build only + run: cargo build -p c2-chacha --target=thumbv6m-none-eabi --no-default-features + + test-ios: + runs-on: macos-latest + steps: + - uses: actions/checkout@v4 + - name: Install toolchain + uses: dtolnay/rust-toolchain@nightly + with: + target: aarch64-apple-ios + - name: Chacha, build only + run: cargo build -p c2-chacha --target=aarch64-apple-ios + + test-686: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - name: Deps + run: sudo apt-get update ; sudo apt install gcc-multilib + - name: Install toolchain + uses: dtolnay/rust-toolchain@nightly + with: + target: i686-unknown-linux-gnu + toolchain: nightly + - name: Chacha + run: cargo test -p c2-chacha --target=i686-unknown-linux-gnu diff --git a/.travis.yml b/.travis.yml index 9752ea52..4b54219e 100644 --- a/.travis.yml +++ b/.travis.yml @@ -75,8 +75,13 @@ matrix: rust: nightly - env: TARGET=x86_64-unknown-linux-gnu rust: stable + - env: TARGET=x86_64-unknown-linux-gnu OLDER_MSRV_CRATES=1 DISABLE_TESTS=1 + rust: 1.32.0 - env: TARGET=x86_64-unknown-linux-gnu - rust: 1.31.1 + rust: 1.41.0 + + # machine-specific tests are skipped based on static feature detection + - env: TARGET=x86_64-unknown-linux-gnu RUSTFLAGS="-C target-cpu=native" before_install: - set -e diff --git a/Cargo.lock.msrv b/Cargo.lock.msrv new file mode 100644 index 00000000..b0aaa00a --- /dev/null +++ b/Cargo.lock.msrv @@ -0,0 +1,230 @@ +# This file is automatically @generated by Cargo. +# It is not intended for manual editing. +version = 3 + +[[package]] +name = "autocfg" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" + +[[package]] +name = "blake-hash" +version = "0.4.1" +dependencies = [ + "block-buffer", + "digest", + "ppv-lite86", +] + +[[package]] +name = "blobby" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6fe5f8c2940b65859ece4b3b2ba02d2b12c87cab455fd42dee2556a187bb2cf6" +dependencies = [ + "byteorder", +] + +[[package]] +name = "block-buffer" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4" +dependencies = [ + "block-padding", + "generic-array", +] + +[[package]] +name = "block-padding" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8d696c370c750c948ada61c69a0ee2cbbb9c50b1019ddb86d9317157a99c2cae" + +[[package]] +name = "byteorder" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" + +[[package]] +name = "c2-chacha" +version = "0.3.3" +dependencies = [ + "cipher", + "hex-literal", + "ppv-lite86", +] + +[[package]] +name = "cc" +version = "1.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc" + +[[package]] +name = "cfg-if" +version = "1.0.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" + +[[package]] +name = "cipher" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7" +dependencies = [ + "generic-array", +] + +[[package]] +name = "crypto-simd" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "28a0eee94b5af99ac4441823c99f59b1ef92a6a4b9723b4c6ad95e8cd4c994b2" + +[[package]] +name = "crypto-simd" +version = "0.2.0" +dependencies = [ + "packed_simd", +] + +[[package]] +name = "digest" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066" +dependencies = [ + "blobby", + "generic-array", +] + +[[package]] +name = "generic-array" +version = "0.14.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a" +dependencies = [ + "typenum", + "version_check", +] + +[[package]] +name = "groestl-aesni" +version = "0.3.0" +dependencies = [ + "block-buffer", + "digest", + "lazy_static", +] + +[[package]] +name = "hex-literal" +version = "0.2.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d70693199b3cf4552f3fa720b54163927a3ebed2aef240efaf556033ab336a11" +dependencies = [ + "hex-literal-impl", + "proc-macro-hack", +] + +[[package]] +name = "hex-literal-impl" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "59448fc2f82a5fb6907f78c3d69d843e82ff5b051923313cc4438cb0c7b745a8" +dependencies = [ + "proc-macro-hack", +] + +[[package]] +name = "jh-x86_64" +version = "0.3.0" +dependencies = [ + "block-buffer", + "cc", + "digest", + "hex-literal", + "ppv-lite86", +] + +[[package]] +name = "lazy_static" +version = "1.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe" + +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + +[[package]] +name = "num-traits" +version = "0.2.19" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841" +dependencies = [ + "autocfg", + "libm", +] + +[[package]] +name = "packed_simd" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1f9f08af0c877571712e2e3e686ad79efad9657dbf0f7c3c8ba943ff6c38932d" +dependencies = [ + "cfg-if", + "num-traits", +] + +[[package]] +name = "ppv-lite86" +version = "0.2.17" + +[[package]] +name = "ppv-null" +version = "0.2.0" +dependencies = [ + "crypto-simd 0.1.1", +] + +[[package]] +name = "proc-macro-hack" +version = "0.5.20+deprecated" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068" + +[[package]] +name = "skein-hash" +version = "0.3.1" +dependencies = [ + "block-buffer", + "cipher", + "digest", + "threefish-cipher", +] + +[[package]] +name = "threefish-cipher" +version = "0.4.0" +dependencies = [ + "cipher", + "hex-literal", +] + +[[package]] +name = "typenum" +version = "1.17.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" diff --git a/Cargo.toml b/Cargo.toml index fb9780d0..174e5c36 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,6 +10,7 @@ members = [ "utils-simd/ppv-lite86", "utils-simd/ppv-null", ] +resolver = "2" [patch.crates-io] c2-chacha = { path = "stream-ciphers/chacha" } diff --git a/LICENSE-APACHE b/LICENSE-APACHE new file mode 100644 index 00000000..1eb32153 --- /dev/null +++ b/LICENSE-APACHE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + +TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + +1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + +2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + +3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + +4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + +5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + +6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + +7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + +8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + +9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + +END OF TERMS AND CONDITIONS + +APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + +Copyright 2019 The CryptoCorrosion Contributors + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/LICENSE-MIT b/LICENSE-MIT new file mode 100644 index 00000000..d78c961b --- /dev/null +++ b/LICENSE-MIT @@ -0,0 +1,25 @@ +Copyright (c) 2019 The CryptoCorrosion Contributors + +Permission is hereby granted, free of charge, to any +person obtaining a copy of this software and associated +documentation files (the "Software"), to deal in the +Software without restriction, including without +limitation the rights to use, copy, modify, merge, +publish, distribute, sublicense, and/or sell copies of +the Software, and to permit persons to whom the Software +is furnished to do so, subject to the following +conditions: + +The above copyright notice and this permission notice +shall be included in all copies or substantial portions +of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF +ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED +TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A +PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT +SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR +IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER +DEALINGS IN THE SOFTWARE. diff --git a/README.md b/README.md index 65ab2511..9f2d6e89 100644 --- a/README.md +++ b/README.md @@ -6,9 +6,7 @@ The main interface to these crates is the RustCrypto traits. All crates are no-std compatible. -Minimum Rust version: 1.31. - -[![Build Status](https://travis-ci.org/cryptocorrosion/cryptocorrosion.svg?branch=master)](https://travis-ci.org/cryptocorrosion/cryptocorrosion) +Minimum Rust version: 1.61.0 ## Supported algorithms @@ -36,26 +34,6 @@ runtime CPU detection is not yet supported. | ---------- | ---------- | ------------------ | | ChaCha | c2-chacha | :heavy_check_mark: | -## SIMD - -Many of the crates in this project include optimized SIMD implementations, -enabled by default on x86-64 by the "simd" feature. The fastest implementation -available for your hardware will be automatically selected at runtime, except -in no-std builds. - -For other hardware platforms, e.g. ARM: an alternative, portable SIMD backend -based on the packed\_simd crate is available for recent nightly Rust; you can -enable it as "packed\_simd". - -If you'd prefer to minimize usage of `unsafe` code: disable the "simd" feature -to switch to a generic implementation. - -| feature | crate | no `unsafe` | rust version | build time? | performance | -| -------------- | ------------ | ------------------ | -------------- | ----------- | ------------- | -| simd (default) | ppv\_lite86 | :x: | 1.27 | fast | fast | -| (no simd) | ppv\_null | :heavy_check_mark: | | fast | slow | -| packed\_simd | packed\_simd | | recent nightly | slow | fast | - ## License All crates licensed under either of diff --git a/block-ciphers/threefish/CHANGELOG.md b/block-ciphers/threefish/CHANGELOG.md new file mode 100644 index 00000000..44b58459 --- /dev/null +++ b/block-ciphers/threefish/CHANGELOG.md @@ -0,0 +1,9 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] +### Changed +- Update `cipher` dependency: 0.2 -> 0.3. diff --git a/block-ciphers/threefish/Cargo.toml b/block-ciphers/threefish/Cargo.toml index 122e0b04..192cd77c 100644 --- a/block-ciphers/threefish/Cargo.toml +++ b/block-ciphers/threefish/Cargo.toml @@ -1,20 +1,20 @@ [package] name = "threefish-cipher" -version = "0.3.1" +version = "0.4.0" authors = ["The Rust-Crypto Project Developers", "The Cryptocorrosion Contributors"] license = "MIT/Apache-2.0" description = "Threefish block cipher" documentation = "https://docs.rs/threefish" repository = "https://github.com/cryptocorrosion/cryptocorrosion" keywords = ["crypto", "threefish", "gost", "block-cipher"] +edition = "2021" +rust-version = "1.61" [dependencies] -generic-array = "0.12" -byteorder = { version = "1", default-features = false } -block-cipher-trait = "0.6" +cipher = "0.3" [dev-dependencies] -hex-literal = "0.2" +hex-literal = "0.3" [features] no_unroll = [] diff --git a/block-ciphers/threefish/src/lib.rs b/block-ciphers/threefish/src/lib.rs index e5c46d59..19d911ed 100644 --- a/block-ciphers/threefish/src/lib.rs +++ b/block-ciphers/threefish/src/lib.rs @@ -1,20 +1,15 @@ #![no_std] #![allow(non_upper_case_globals)] -extern crate block_cipher_trait; -extern crate byteorder; -extern crate generic_array; -#[cfg(test)] -#[macro_use] -extern crate hex_literal; + +use core::convert::TryInto; use core::ops::BitXor; mod consts; use consts::{C240, P_1024, P_256, P_512, R_1024, R_256, R_512}; -use block_cipher_trait::generic_array::typenum::{U1, U128, U32, U64}; -use block_cipher_trait::generic_array::GenericArray; -pub use block_cipher_trait::BlockCipher; -use byteorder::{ByteOrder, LE}; +use cipher::generic_array::typenum::{U1, U128, U32, U64}; +use cipher::generic_array::GenericArray; +use cipher::{BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher}; fn mix(r: u32, x: (u64, u64)) -> (u64, u64) { let y0 = x.0.wrapping_add(x.1); @@ -30,13 +25,13 @@ fn inv_mix(r: u32, y: (u64, u64)) -> (u64, u64) { fn read_u64v_le(ns: &mut [u64], buf: &[u8]) { for (c, n) in buf.chunks_exact(8).zip(ns) { - *n = LE::read_u64(c); + *n = u64::from_le_bytes(c.try_into().unwrap()); } } fn write_u64v_le(buf: &mut [u8], ns: &[u64]) { for (c, n) in buf.chunks_exact_mut(8).zip(ns) { - LE::write_u64(c, *n); + c.copy_from_slice(&n.to_le_bytes()); } } @@ -116,19 +111,24 @@ macro_rules! impl_threefish( } } - $name { sk: sk } + $name { sk } } } - impl BlockCipher for $name { - type BlockSize = $block_size; + impl NewBlockCipher for $name { type KeySize = $block_size; - type ParBlocks = U1; fn new(key: &GenericArray) -> $name { Self::with_tweak(key, 0, 0) } + } + impl BlockCipher for $name { + type BlockSize = $block_size; + type ParBlocks = U1; + } + + impl BlockEncrypt for $name { fn encrypt_block(&self, block: &mut GenericArray) { let mut v = [0u64; $n_w]; @@ -162,7 +162,9 @@ macro_rules! impl_threefish( write_u64v_le(block, &v[..]); } + } + impl BlockDecrypt for $name { fn decrypt_block(&self, block: &mut GenericArray) { let mut v = [0u64; $n_w]; @@ -208,8 +210,9 @@ mod test { //! tests from NIST submission use super::{Threefish1024, Threefish256, Threefish512}; - use block_cipher_trait::generic_array::GenericArray; - use block_cipher_trait::BlockCipher; + use cipher::generic_array::GenericArray; + use cipher::{BlockDecrypt, BlockEncrypt, NewBlockCipher}; + use hex_literal::hex; #[test] fn test_256() { diff --git a/ci/script.sh b/ci/script.sh index 6bf55341..268c5483 100644 --- a/ci/script.sh +++ b/ci/script.sh @@ -27,11 +27,25 @@ portable_only() { cross test --target $TARGET --release -p c2-chacha -p ppv-lite86 } +older_msrv_crates() { + cross build --target $TARGET -p ppv-lite86 + cross build --target $TARGET --release -p ppv-lite86 + + if [ ! -z $DISABLE_TESTS ]; then + return + fi + + cross test --target $TARGET -p ppv-lite86 + cross test --target $TARGET --release -p ppv-lite86 +} + # we don't run the "test phase" when doing deploys if [ -z $TRAVIS_TAG ]; then - if [ -z $PORTABLE_ONLY ]; then - main - else + if [ -n "$PORTABLE_ONLY" ]; then portable_only + elif [ -n "$OLDER_MSRV_CRATES" ]; then + older_msrv_crates + else + main fi fi diff --git a/hashes/blake/CHANGELOG.md b/hashes/blake/CHANGELOG.md new file mode 100644 index 00000000..adb13026 --- /dev/null +++ b/hashes/blake/CHANGELOG.md @@ -0,0 +1,9 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.4.1] +### Changed +- Update `ppv-lite86` dependency to fix support for non-x86 platforms. diff --git a/hashes/blake/Cargo.toml b/hashes/blake/Cargo.toml index d38346b0..c0f37d5b 100644 --- a/hashes/blake/Cargo.toml +++ b/hashes/blake/Cargo.toml @@ -1,24 +1,25 @@ [package] name = "blake-hash" -version = "0.3.2" +version = "0.4.1" authors = ["The CryptoCorrosion Contributors"] license = "MIT/Apache-2.0" description = "BLAKE hash functions" repository = "https://github.com/cryptocorrosion/cryptocorrosion" keywords = ["crypto", "blake", "hash", "digest"] categories = ["cryptography", "no-std"] +rust-version = "1.61" [dependencies] -block-buffer = "0.7" -digest = "0.8" -simd = { package = "ppv-lite86", version = "0.2.6", optional = true } +block-buffer = "0.9" +digest = "0.9" +simd = { package = "ppv-lite86", version = "0.2.16", optional = true } [features] default = ["simd", "std"] std = [] [dev-dependencies] -digest = { version = "0.8", features = ["dev"] } +digest = { version = "0.9", features = ["dev"] } [badges] travis-ci = { repository = "cryptocorrosion/cryptocorrosion" } diff --git a/hashes/blake/src/lib.rs b/hashes/blake/src/lib.rs index 3ddf349c..098d088f 100644 --- a/hashes/blake/src/lib.rs +++ b/hashes/blake/src/lib.rs @@ -8,13 +8,12 @@ use std as core; extern crate block_buffer; pub extern crate digest; -#[macro_use] pub extern crate simd; mod consts; -use block_buffer::byteorder::{ByteOrder, BE}; use block_buffer::BlockBuffer; +use core::convert::TryInto; use core::mem; use digest::generic_array::typenum::{PartialDiv, Unsigned, U2}; use digest::generic_array::GenericArray; @@ -73,16 +72,17 @@ fn round64( #[inline(always)] fn diagonalize((a, b, c, d): (X4, X4, X4, X4)) -> (X4, X4, X4, X4) { - (a, b.shuffle3012(), c.shuffle2301(), d.shuffle1230()) + // Since b has the critical data dependency, avoid rotating b to hide latency. + (a.shuffle1230(), b, c.shuffle3012(), d.shuffle2301()) } #[inline(always)] fn undiagonalize((a, b, c, d): (X4, X4, X4, X4)) -> (X4, X4, X4, X4) { - (a, b.shuffle1230(), c.shuffle2301(), d.shuffle3012()) + (a.shuffle3012(), b, c.shuffle1230(), d.shuffle2301()) } macro_rules! define_compressor { - ($compressor:ident, $storage:ident, $word:ident, $Bufsz:ty, $deserializer:path, $uval:expr, $rounds:expr, $round:ident, $X4:ident) => { + ($compressor:ident, $storage:ident, $word:ident, $Bufsz:ty, $uval:expr, $rounds:expr, $round:ident, $X4:ident) => { #[derive(Clone, Copy, Default)] pub struct $compressor { h: [$storage; 2], @@ -100,7 +100,7 @@ macro_rules! define_compressor { .iter_mut() .zip(block.chunks_exact(mem::size_of::<$word>())) { - *mx = $deserializer(b); + *mx = $word::from_be_bytes(b.try_into().unwrap()); } let u = (mach.vec([U[0], U[1], U[2], U[3]]), mach.vec([U[4], U[5], U[6], U[7]])); @@ -114,8 +114,8 @@ macro_rules! define_compressor { let m1 = mach.vec([m1!(0), m1!(2), m1!(4), m1!(6)]); xs = $round::(xs, m0, m1); // diagonal step - let m0 = mach.vec([m0!(8), m0!(10), m0!(12), m0!(14)]); - let m1 = mach.vec([m1!(8), m1!(10), m1!(12), m1!(14)]); + let m0 = mach.vec([m0!(14), m0!(8), m0!(10), m0!(12)]); + let m1 = mach.vec([m1!(14), m1!(8), m1!(10), m1!(12)]); xs = undiagonalize($round::(diagonalize(xs), m0, m1)); } let h: (M::$X4, M::$X4) = (mach.unpack(state.h[0]), mach.unpack(state.h[1])); @@ -156,7 +156,7 @@ macro_rules! define_compressor { macro_rules! define_hasher { ($name:ident, $word:ident, $buf:expr, $Bufsz:ty, $bits:expr, $Bytes:ident, - $serializer:path, $compressor:ident, $iv:expr) => { + $compressor:ident, $iv:expr) => { #[derive(Clone)] pub struct $name { compressor: $compressor, @@ -196,30 +196,30 @@ macro_rules! define_hasher { type BlockSize = $Bytes; } - impl digest::Input for $name { - fn input>(&mut self, data: T) { + impl digest::Update for $name { + fn update(&mut self, data: impl AsRef<[u8]>) { let compressor = &mut self.compressor; let t = &mut self.t; - self.buffer.input(data.as_ref(), |block| { + self.buffer.input_block(data.as_ref(), |block| { Self::increase_count(t, (mem::size_of::<$word>() * 16) as $word); compressor.put_block(block, *t); }); } } - impl digest::FixedOutput for $name { + impl digest::FixedOutputDirty for $name { type OutputSize = $Bytes; - fn fixed_result(self) -> GenericArray { + fn finalize_into_dirty(&mut self, out: &mut GenericArray) { let mut compressor = self.compressor; - let mut buffer = self.buffer; + let buffer = &mut self.buffer; let mut t = self.t; Self::increase_count(&mut t, buffer.position() as $word); let mut msglen = [0u8; $buf / 8]; - $serializer(&mut msglen[..$buf / 16], t.1); - $serializer(&mut msglen[$buf / 16..], t.0); + msglen[..$buf / 16].copy_from_slice(&t.1.to_be_bytes()); + msglen[$buf / 16..].copy_from_slice(&t.0.to_be_bytes()); let footerlen = 1 + 2 * mem::size_of::<$word>(); @@ -237,7 +237,7 @@ macro_rules! define_hasher { let extra_block = buffer.position() + footerlen > $buf; if extra_block { let pad = $buf - buffer.position(); - buffer.input(&PADDING[..pad], |block| compressor.put_block(block, t)); + buffer.input_block(&PADDING[..pad], |block| compressor.put_block(block, t)); debug_assert_eq!(buffer.position(), 0); } @@ -249,12 +249,12 @@ macro_rules! define_hasher { // skip begin-padding byte if continuing padding let x = extra_block as usize; let (start, end) = (x, x + ($buf - footerlen - buffer.position())); - buffer.input(&PADDING[start..end], |_| unreachable!()); - buffer.input(&[magic], |_| unreachable!()); - buffer.input(&msglen, |block| compressor.put_block(block, t)); + buffer.input_block(&PADDING[start..end], |_| unreachable!()); + buffer.input_block(&[magic], |_| unreachable!()); + buffer.input_block(&msglen, |block| compressor.put_block(block, t)); debug_assert_eq!(buffer.position(), 0); - GenericArray::clone_from_slice(&compressor.finalize()[..$Bytes::to_usize()]) + out.copy_from_slice(&compressor.finalize()[..$Bytes::to_usize()]); } } @@ -272,19 +272,19 @@ use consts::{ use digest::generic_array::typenum::{U128, U28, U32, U48, U64}; #[rustfmt::skip] -define_compressor!(Compressor256, vec128_storage, u32, U64, BE::read_u32, BLAKE256_U, 14, round32, u32x4); +define_compressor!(Compressor256, vec128_storage, u32, U64, BLAKE256_U, 14, round32, u32x4); #[rustfmt::skip] -define_hasher!(Blake224, u32, 64, U64, 224, U28, BE::write_u32, Compressor256, BLAKE224_IV); +define_hasher!(Blake224, u32, 64, U64, 224, U28, Compressor256, BLAKE224_IV); #[rustfmt::skip] -define_hasher!(Blake256, u32, 64, U64, 256, U32, BE::write_u32, Compressor256, BLAKE256_IV); +define_hasher!(Blake256, u32, 64, U64, 256, U32, Compressor256, BLAKE256_IV); #[rustfmt::skip] -define_compressor!(Compressor512, vec256_storage, u64, U128, BE::read_u64, BLAKE512_U, 16, round64, u64x4); +define_compressor!(Compressor512, vec256_storage, u64, U128, BLAKE512_U, 16, round64, u64x4); #[rustfmt::skip] -define_hasher!(Blake384, u64, 128, U128, 384, U48, BE::write_u64, Compressor512, BLAKE384_IV); +define_hasher!(Blake384, u64, 128, U128, 384, U48, Compressor512, BLAKE384_IV); #[rustfmt::skip] -define_hasher!(Blake512, u64, 128, U128, 512, U64, BE::write_u64, Compressor512, BLAKE512_IV); +define_hasher!(Blake512, u64, 128, U128, 512, U64, Compressor512, BLAKE512_IV); diff --git a/hashes/blake/tests/lib.rs b/hashes/blake/tests/lib.rs index 856a1861..a52edbf9 100644 --- a/hashes/blake/tests/lib.rs +++ b/hashes/blake/tests/lib.rs @@ -6,5 +6,5 @@ use digest::dev::digest_test; new_test!(blake224, "blake224", blake_hash::Blake224, digest_test); new_test!(blake256, "blake256", blake_hash::Blake256, digest_test); -new_test!(blake384, "blake384", blake_hash::Blake384, digest_test); -new_test!(blake512, "blake512", blake_hash::Blake512, digest_test); +//new_test!(blake384, "blake384", blake_hash::Blake384, digest_test); +//new_test!(blake512, "blake512", blake_hash::Blake512, digest_test); diff --git a/hashes/groestl/Cargo.toml b/hashes/groestl/Cargo.toml index dc0b8469..41b0edbb 100644 --- a/hashes/groestl/Cargo.toml +++ b/hashes/groestl/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "groestl-aesni" -version = "0.2.2" +version = "0.3.1" authors = ["The CryptoCorrosion Contributors"] license = "MIT/Apache-2.0" description = "Hardware-accelerated Groestl hash for x86-64 systems with AES extensions" @@ -8,15 +8,17 @@ documentation = "https://docs.rs/groestl-aesni" keywords = ["crypto", "groestl", "hash", "digest"] categories = ["cryptography", "no-std"] repository = "https://github.com/cryptocorrosion/hashes" -edition = "2018" +edition = "2021" +rust-version = "1.61" [dependencies] -block-buffer = "0.7" -digest = "0.8" +block-buffer = "0.9" +digest = "0.9" lazy_static = { version = "1.2", optional = true } +zerocopy = { version = "0.7", features = ["simd", "derive"] } [dev-dependencies] -digest = { version = "0.8", features = ["dev"] } +digest = { version = "0.9", features = ["dev"] } [features] std = ["lazy_static"] diff --git a/hashes/groestl/src/compressor.rs b/hashes/groestl/src/compressor.rs index d20da74d..fc6b5fd4 100644 --- a/hashes/groestl/src/compressor.rs +++ b/hashes/groestl/src/compressor.rs @@ -2,6 +2,7 @@ use block_buffer::generic_array::typenum::{U128, U64}; use block_buffer::generic_array::GenericArray; use core::arch::x86_64::*; use core::ops::BitXor; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; trait Map2 { type Output; @@ -11,7 +12,8 @@ trait Map2 { Self: Sized; } -#[derive(Copy, Clone)] +#[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)] +#[repr(C)] pub struct X4(__m128i, __m128i, __m128i, __m128i); #[derive(Copy, Clone)] @@ -47,7 +49,7 @@ impl BitXor for X4 { impl Map2 for (X4, X4) { type Output = X4; #[inline(always)] - fn map(self: Self, mut f: F) -> Self::Output + fn map(self, mut f: F) -> Self::Output where F: FnMut(__m128i, __m128i) -> __m128i, { @@ -115,7 +117,7 @@ impl BitXor for X8 { impl Map2 for (X8, X8) { type Output = X8; #[inline(always)] - fn map(self: Self, mut f: F) -> Self::Output + fn map(self, mut f: F) -> Self::Output where F: FnMut(__m128i, __m128i) -> __m128i, { @@ -175,7 +177,8 @@ unsafe fn transpose_a(i: X4) -> X4 { _mm_unpackhi_epi16(i.0, i.1), _mm_unpacklo_epi16(i.2, i.3), _mm_unpackhi_epi16(i.2, i.3), - ).map(|x| _mm_shuffle_epi32(x, 0b1101_1000)); + ) + .map(|x| _mm_shuffle_epi32(x, 0b1101_1000)); X4( _mm_unpacklo_epi32(z.0, z.2), _mm_unpacklo_epi32(z.1, z.3), @@ -367,7 +370,8 @@ unsafe fn transpose_inv(i: X8) -> X8 { _mm_unpackhi_epi64(i.4, i.5), _mm_unpacklo_epi64(i.6, i.7), _mm_unpackhi_epi64(i.6, i.7), - ).map(|x| { + ) + .map(|x| { _mm_shuffle_epi8( x, _mm_set_epi64x(0x0f07_0b03_0e06_0a02, 0x0d05_0901_0c04_0800), @@ -382,7 +386,8 @@ unsafe fn transpose_inv(i: X8) -> X8 { _mm_unpacklo_epi16(i.5, i.7), _mm_unpackhi_epi16(i.4, i.6), _mm_unpackhi_epi16(i.5, i.7), - ).map(|x| _mm_shuffle_epi32(x, 0b1101_1000)); + ) + .map(|x| _mm_shuffle_epi32(x, 0b1101_1000)); X8( _mm_unpacklo_epi32(i.0, i.4), _mm_unpacklo_epi32(i.2, i.6), @@ -446,7 +451,8 @@ unsafe fn rounds_q(mut x: X8) -> X8 { _mm_set_epi64x(0x080b_0e01_0407_0a0d, 0x0003_0609_0c0f_0205), _mm_set_epi64x(0x090c_0f02_0508_0b0e, 0x0104_070a_0d00_0306), _mm_set_epi64x(0x0e01_0407_0a0d_0003, 0x0609_0c0f_0205_080b), - ).shuffle((1, 3, 5, 7, 0, 2, 4, 6)); + ) + .shuffle((1, 3, 5, 7, 0, 2, 4, 6)); let f = _mm_set1_epi64x(0xffff_ffff_ffff_ffffu64 as i64); for q in const_q.chunks_exact(2) { // 2 rounds at a time so we can flip-flop between register sets diff --git a/hashes/groestl/src/lib.rs b/hashes/groestl/src/lib.rs index 212a464f..535ccc2c 100644 --- a/hashes/groestl/src/lib.rs +++ b/hashes/groestl/src/lib.rs @@ -10,7 +10,6 @@ pub extern crate digest; #[macro_use] extern crate lazy_static; -use block_buffer::byteorder::{BigEndian, ByteOrder, LE}; use block_buffer::generic_array::typenum::{ PartialDiv, Unsigned, U1024, U128, U16, U28, U48, U512, U64, U8, }; @@ -19,6 +18,7 @@ use block_buffer::BlockBuffer; use core::fmt::{Debug, Formatter, Result}; use digest::generic_array::GenericArray as DGenericArray; pub use digest::Digest; +use zerocopy::transmute; mod compressor; use crate::compressor::{init1024, init512, of1024, of512, tf1024, tf512}; @@ -27,25 +27,21 @@ use crate::compressor::{init1024, init512, of1024, of512, tf1024, tf512}; struct Align16(T); type Block512 = [u64; 512 / 64]; -union CvBytes512 { - block: Block512, - cv: compressor::X4, -} #[derive(Clone)] struct Compressor512 { cv: compressor::X4, } impl Compressor512 { fn new(block: Block512) -> Self { - let cv = init512(unsafe { CvBytes512 { block }.cv }); + let cv = init512(transmute!(block)); Compressor512 { cv } } fn input(&mut self, data: &BBGenericArray) { tf512(&mut self.cv, data); } - fn finalize(mut self) -> Block512 { + fn finalize_dirty(&mut self) -> Block512 { of512(&mut self.cv); - unsafe { CvBytes512 { cv: self.cv }.block } + transmute!(self.cv) } } @@ -66,7 +62,7 @@ impl Compressor1024 { fn input(&mut self, data: &BBGenericArray) { tf1024(&mut self.cv, data); } - fn finalize(mut self) -> Block1024 { + fn finalize_dirty(&mut self) -> Block1024 { of1024(&mut self.cv); unsafe { CvBytes1024 { cv: self.cv }.block } } @@ -87,16 +83,16 @@ macro_rules! impl_digest { let compressor = $compressor::new(iv.0); Self { buffer: BlockBuffer::default(), - compressor: compressor, + compressor, block_counter: 0, } } - fn finalize(self) -> [u64; $bits::USIZE / 64] { - let mut buffer = self.buffer; - let mut compressor = self.compressor; + fn finalize_dirty(&mut self) -> [u64; $bits::USIZE / 64] { + let buffer = &mut self.buffer; + let compressor = &mut self.compressor; let count = self.block_counter + 1 + (buffer.remaining() <= 8) as u64; - buffer.len64_padding::(count, |b| compressor.input(b)); - compressor.finalize() + buffer.len64_padding_be(count, |b| compressor.input(b)); + compressor.finalize_dirty() } } impl Default for $groestl { @@ -112,25 +108,23 @@ macro_rules! impl_digest { impl digest::BlockInput for $groestl { type BlockSize = <$bits as PartialDiv>::Output; } - impl digest::Input for $groestl { - fn input>(&mut self, data: T) { + impl digest::Update for $groestl { + fn update(&mut self, data: impl AsRef<[u8]>) { let block_counter = &mut self.block_counter; let compressor = &mut self.compressor; - self.buffer.input(data.as_ref(), |b| { + self.buffer.input_block(data.as_ref(), |b| { *block_counter += 1; compressor.input(b) }); } } - impl digest::FixedOutput for $groestl { + impl digest::FixedOutputDirty for $groestl { type OutputSize = <$bits as PartialDiv>::Output; - fn fixed_result(self) -> DGenericArray { - let result = self.finalize(); - let mut out: DGenericArray = DGenericArray::default(); + fn finalize_into_dirty(&mut self, out: &mut DGenericArray) { + let result = self.finalize_dirty(); for (out, &input) in out.chunks_exact_mut(8).zip(&result[$bits::USIZE / 128..]) { - LE::write_u64(out, input); + out.copy_from_slice(&input.to_le_bytes()); } - out } } impl digest::Reset for $groestl { @@ -154,21 +148,19 @@ impl Default for Groestl224 { impl digest::BlockInput for Groestl224 { type BlockSize = U64; } -impl digest::Input for Groestl224 { - fn input>(&mut self, data: T) { - digest::Input::input(&mut self.0, data.as_ref()); +impl digest::Update for Groestl224 { + fn update(&mut self, data: impl AsRef<[u8]>) { + digest::Update::update(&mut self.0, data.as_ref()); } } -impl digest::FixedOutput for Groestl224 { +impl digest::FixedOutputDirty for Groestl224 { type OutputSize = U28; - fn fixed_result(self) -> DGenericArray { - let result = self.0.finalize(); - let mut out: DGenericArray = DGenericArray::default(); - LE::write_u32(&mut out[..4], (result[4] >> 32) as u32); + fn finalize_into_dirty(&mut self, out: &mut DGenericArray) { + let result = self.0.finalize_dirty(); + out[..4].copy_from_slice(&((result[4] >> 32) as u32).to_le_bytes()); for (out, &input) in out[4..].chunks_exact_mut(8).zip(&result[5..8]) { - LE::write_u64(out, input); + out.copy_from_slice(&input.to_le_bytes()); } - out } } impl digest::Reset for Groestl224 { @@ -187,20 +179,18 @@ impl Default for Groestl384 { impl digest::BlockInput for Groestl384 { type BlockSize = ::BlockSize; } -impl digest::Input for Groestl384 { - fn input>(&mut self, data: T) { - digest::Input::input(&mut self.0, data.as_ref()); +impl digest::Update for Groestl384 { + fn update(&mut self, data: impl AsRef<[u8]>) { + digest::Update::update(&mut self.0, data.as_ref()); } } -impl digest::FixedOutput for Groestl384 { +impl digest::FixedOutputDirty for Groestl384 { type OutputSize = U48; - fn fixed_result(self) -> DGenericArray { - let result = self.0.finalize(); - let mut out: DGenericArray = DGenericArray::default(); + fn finalize_into_dirty(&mut self, out: &mut DGenericArray) { + let result = self.0.finalize_dirty(); for (out, &input) in out.chunks_exact_mut(8).zip(&result[10..]) { - LE::write_u64(out, input); + out.copy_from_slice(&input.to_le_bytes()); } - out } } impl digest::Reset for Groestl384 { diff --git a/hashes/jh/Cargo.toml b/hashes/jh/Cargo.toml index e8bb569f..42d02b6a 100644 --- a/hashes/jh/Cargo.toml +++ b/hashes/jh/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "jh-x86_64" -version = "0.2.2" +version = "0.3.1" authors = ["The CryptoCorrosion Contributors"] license = "MIT/Apache-2.0" description = "Portable JH with optimizations for x86-64 cpus" @@ -8,16 +8,18 @@ documentation = "https://docs.rs/jh-x86_64" keywords = ["crypto", "jh", "hash", "digest"] categories = ["cryptography", "no-std"] repository = "https://github.com/cryptocorrosion/cryptocorrosion" -edition = "2018" +edition = "2021" +rust-version = "1.61" [dependencies] -block-buffer = "0.7" -digest = "0.8" -hex-literal = "0.2" +block-buffer = { version = "0.9", features = ["block-padding"] } +digest = "0.9" +hex-literal = "0.3" simd = { package = "ppv-lite86", version = "0.2.6" } +zerocopy = "0.7" [dev-dependencies] -digest = { version = "0.8", features = ["dev"] } +digest = { version = "0.9", features = ["dev"] } [build-dependencies] cc = "1.0.3" diff --git a/hashes/jh/src/compressor.rs b/hashes/jh/src/compressor.rs index 57b02cd7..eaf885d7 100644 --- a/hashes/jh/src/compressor.rs +++ b/hashes/jh/src/compressor.rs @@ -4,6 +4,7 @@ use core::ptr; use digest::generic_array::typenum::U64; use digest::generic_array::GenericArray; use simd::{vec128_storage, AndNot, Machine, Swap64, VZip, Vec2}; +use zerocopy::transmute; const E8_BITSLICE_ROUNDCONSTANT: [[u8; 32]; 42] = [ hex!("72d5dea2df15f8677b84150ab723155781abd6904d5a87f64e9f4fc5c3d12b40"), @@ -199,21 +200,23 @@ dispatch!(mach, M, { }); #[derive(Clone, Copy)] -pub union Compressor { +pub struct Compressor { cv: [vec128_storage; 8], - bytes: [u8; 128], } + impl Compressor { #[inline] pub fn new(bytes: [u8; 128]) -> Self { - Compressor { bytes } + Compressor { + cv: transmute!(bytes), + } } #[inline] pub fn input(&mut self, data: &GenericArray) { - f8(unsafe { &mut self.cv }, data.as_ptr()); + f8(&mut self.cv, data.as_ptr()) } #[inline] pub fn finalize(self) -> [u8; 128] { - unsafe { self.bytes } + transmute!(self.cv) } } diff --git a/hashes/jh/src/lib.rs b/hashes/jh/src/lib.rs index 1cded6db..a04049d5 100644 --- a/hashes/jh/src/lib.rs +++ b/hashes/jh/src/lib.rs @@ -15,11 +15,10 @@ mod consts; pub use digest::Digest; -use block_buffer::byteorder::{BigEndian, ByteOrder}; +use crate::compressor::Compressor; use block_buffer::generic_array::GenericArray as BBGenericArray; use block_buffer::BlockBuffer; use core::fmt::{Debug, Formatter, Result}; -use crate::compressor::Compressor; use digest::generic_array::typenum::{Unsigned, U28, U32, U48, U64}; use digest::generic_array::GenericArray as DGenericArray; @@ -56,33 +55,33 @@ macro_rules! define_hasher { type BlockSize = U64; } - impl digest::Input for $name { - fn input>(&mut self, data: T) { + impl digest::Update for $name { + fn update(&mut self, data: impl AsRef<[u8]>) { let data = data.as_ref(); self.datalen += data.len(); let state = &mut self.state; - self.buffer.input(data, |b| state.input(b)) + self.buffer.input_block(data, |b| state.input(b)) } } - impl digest::FixedOutput for $name { + impl digest::FixedOutputDirty for $name { type OutputSize = $OutputBytes; - fn fixed_result(mut self) -> DGenericArray { + fn finalize_into_dirty(&mut self, out: &mut DGenericArray) { let state = &mut self.state; let buffer = &mut self.buffer; let len = self.datalen as u64 * 8; if buffer.position() == 0 { - buffer.len64_padding::(len, |b| state.input(b)); + buffer.len64_padding_be(len, |b| state.input(b)); } else { use block_buffer::block_padding::Iso7816; state.input(buffer.pad_with::().unwrap()); let mut last = BBGenericArray::default(); - BigEndian::write_u64(&mut last[56..], len); + last[56..].copy_from_slice(&len.to_be_bytes()); state.input(&last); } let finalized = self.state.finalize(); - DGenericArray::clone_from_slice(&finalized[(128 - $OutputBytes::to_usize())..]) + out.copy_from_slice(&finalized[(128 - $OutputBytes::to_usize())..]); } } diff --git a/hashes/skein/CHANGELOG.md b/hashes/skein/CHANGELOG.md new file mode 100644 index 00000000..44b58459 --- /dev/null +++ b/hashes/skein/CHANGELOG.md @@ -0,0 +1,9 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] +### Changed +- Update `cipher` dependency: 0.2 -> 0.3. diff --git a/hashes/skein/Cargo.toml b/hashes/skein/Cargo.toml index e5a85ea8..cb4ec249 100644 --- a/hashes/skein/Cargo.toml +++ b/hashes/skein/Cargo.toml @@ -1,18 +1,20 @@ [package] name = "skein-hash" -version = "0.3.0" +version = "0.3.2" authors = ["The CryptoCorrosion Contributors"] license = "MIT/Apache-2.0" description = "Skein hash functions" repository = "https://github.com/cryptocorrosion/hashes" keywords = ["crypto", "skein", "hash", "digest"] categories = ["cryptography", "no-std"] +edition = "2021" +rust-version = "1.61" [dependencies] -block-buffer = "0.7" -block-padding = "0.1.0" -digest = "0.8" -threefish-cipher = "0.3" +block-buffer = { version = "0.9", features = ["block-padding"] } +digest = "0.9" +threefish-cipher = "0.4" +cipher = "0.3" [dev-dependencies] -digest = { version = "0.8", features = ["dev"] } +digest = { version = "0.9", features = ["dev"] } diff --git a/hashes/skein/src/lib.rs b/hashes/skein/src/lib.rs index edf82eef..0252aac0 100644 --- a/hashes/skein/src/lib.rs +++ b/hashes/skein/src/lib.rs @@ -3,22 +3,22 @@ #![no_std] extern crate block_buffer; -extern crate block_padding; pub extern crate digest; extern crate threefish_cipher; pub use digest::generic_array::GenericArray; pub use digest::Digest; -use block_buffer::byteorder::{ByteOrder, LE}; +use block_buffer::block_padding::ZeroPadding; use block_buffer::BlockBuffer; -use block_padding::ZeroPadding; +use cipher::{BlockCipher, BlockEncrypt}; use digest::generic_array::typenum::{NonZero, PartialDiv, Unsigned, U128, U32, U64, U8}; use digest::generic_array::ArrayLength; -use threefish_cipher::{BlockCipher, Threefish1024, Threefish256, Threefish512}; +use threefish_cipher::{Threefish1024, Threefish256, Threefish512}; /// N word buffer. #[derive(Copy, Clone)] +#[repr(C)] union Block where N: ArrayLength, @@ -44,13 +44,45 @@ where } fn as_byte_array(&self) -> &GenericArray { + // SAFETY: Both fields of this union have the same layout and bit + // validity, so it's okay to treat either field as the other field's + // type. Since the union is `repr(C)`, they both live in the same byte + // range. (One exception: They don't have the same alignment, but the + // alignment of the entire union is the greater of their alignments, so + // this isn't an issue.) unsafe { &self.bytes } } fn as_byte_array_mut(&mut self) -> &mut GenericArray { + // SAFETY: Both fields of this union have the same layout and bit + // validity, so it's okay to treat either field as the other field's + // type. Since the union is `repr(C)`, they both live in the same byte + // range. (One exception: They don't have the same alignment, but the + // alignment of the entire union is the greater of their alignments, so + // this isn't an issue.) unsafe { &mut self.bytes } } + fn as_word_array(&self) -> &GenericArray>::Output> { + // SAFETY: Both fields of this union have the same layout and bit + // validity, so it's okay to treat either field as the other field's + // type. Since the union is `repr(C)`, they both live in the same byte + // range. (One exception: They don't have the same alignment, but the + // alignment of the entire union is the greater of their alignments, so + // this isn't an issue.) + unsafe { &self.words } + } + + fn as_word_array_mut(&mut self) -> &mut GenericArray>::Output> { + // SAFETY: Both fields of this union have the same layout and bit + // validity, so it's okay to treat either field as the other field's + // type. Since the union is `repr(C)`, they both live in the same byte + // range. (One exception: They don't have the same alignment, but the + // alignment of the entire union is the greater of their alignments, so + // this isn't an issue.) + unsafe { &mut self.words } + } + fn from_byte_array(block: &GenericArray) -> Self { Block { bytes: *block } } @@ -82,10 +114,7 @@ where type Output = Block; fn bitxor(mut self, rhs: Block) -> Self::Output { // XOR is endian-agnostic - for (s, r) in unsafe { &mut self.words } - .iter_mut() - .zip(unsafe { &rhs.words }) - { + for (s, r) in self.as_word_array_mut().iter_mut().zip(rhs.as_word_array()) { *s ^= *r; } self @@ -176,9 +205,9 @@ macro_rules! define_hasher { Block::default(), ); let mut cfg = GenericArray::::default(); - LE::write_u64(&mut cfg[..8], SCHEMA_VER); - LE::write_u64(&mut cfg[8..16], N::to_u64() * 8); - LE::write_u64(&mut cfg[16..24], CFG_TREE_INFO_SEQUENTIAL); + cfg[..8].copy_from_slice(&SCHEMA_VER.to_le_bytes()); + cfg[8..16].copy_from_slice(&(N::to_u64() * 8).to_le_bytes()); + cfg[16..24].copy_from_slice(&CFG_TREE_INFO_SEQUENTIAL.to_le_bytes()); Self::process_block(&mut state, &cfg, CFG_STR_LEN); // The chaining vars ctx->X are now initialized for the given hashBitLen. @@ -200,11 +229,11 @@ macro_rules! define_hasher { type BlockSize = <$threefish as BlockCipher>::BlockSize; } - impl digest::Input for $name + impl digest::Update for $name where N: Unsigned + ArrayLength + NonZero + Default, { - fn input>(&mut self, data: T) { + fn update(&mut self, data: impl AsRef<[u8]>) { let buffer = &mut self.buffer; let state = &mut self.state; buffer.input_lazy(data.as_ref(), |block| { @@ -213,32 +242,30 @@ macro_rules! define_hasher { } } - impl digest::FixedOutput for $name + impl digest::FixedOutputDirty for $name where N: Unsigned + ArrayLength + NonZero + Default, { type OutputSize = N; - fn fixed_result(mut self) -> GenericArray { + fn finalize_into_dirty(&mut self, output: &mut GenericArray) { self.state.t.1 |= T1_FLAG_FINAL; let pos = self.buffer.position(); let final_block = self.buffer.pad_with::().unwrap(); Self::process_block(&mut self.state, final_block, pos); // run Threefish in "counter mode" to generate output - let mut output = GenericArray::default(); for (i, chunk) in output.chunks_mut($state_bits / 8).enumerate() { let mut ctr = State::new( T1_FLAG_FIRST | T1_BLK_TYPE_OUT | T1_FLAG_FINAL, self.state.x, ); let mut b = GenericArray::::default(); - LE::write_u64(&mut b[..8], i as u64); + b[..8].copy_from_slice(&(i as u64).to_le_bytes()); Self::process_block(&mut ctr, &b, 8); let n = chunk.len(); chunk.copy_from_slice(&ctr.x.bytes()[..n]); } - output } } diff --git a/stream-ciphers/chacha/CHANGELOG.md b/stream-ciphers/chacha/CHANGELOG.md new file mode 100644 index 00000000..44b58459 --- /dev/null +++ b/stream-ciphers/chacha/CHANGELOG.md @@ -0,0 +1,9 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [Unreleased] +### Changed +- Update `cipher` dependency: 0.2 -> 0.3. diff --git a/stream-ciphers/chacha/Cargo.toml b/stream-ciphers/chacha/Cargo.toml index 0fccf0d4..591c1733 100644 --- a/stream-ciphers/chacha/Cargo.toml +++ b/stream-ciphers/chacha/Cargo.toml @@ -1,29 +1,30 @@ [package] name = "c2-chacha" -version = "0.2.3" +version = "0.3.3" authors = ["The CryptoCorrosion Contributors"] license = "MIT/Apache-2.0" -edition = "2018" +edition = "2021" description = "The ChaCha family of stream ciphers" repository = "https://github.com/cryptocorrosion/cryptocorrosion" keywords = ["chacha", "chacha20", "xchacha20", "cipher", "crypto"] categories = ["cryptography", "no-std"] readme = "README.md" documentation = "https://docs.rs/c2-chacha" +rust-version = "1.61" [dependencies] -byteorder = { version = "1.3", optional = true } -ppv-lite86 = { package = "ppv-lite86", version = "0.2.6", default-features = false } -stream-cipher = { version = "0.3", optional = true } +ppv-lite86 = { package = "ppv-lite86", version = "0.2.14", default-features = false } +cipher = { version = "0.3", optional = true } [dev-dependencies] -hex-literal = "0.2" +hex-literal = "0.3" [features] -default = ["std", "simd", "rustcrypto_api"] +default = ["std", "rustcrypto_api"] std = ["ppv-lite86/std"] -rustcrypto_api = ["stream-cipher", "byteorder"] -simd = ["ppv-lite86/simd"] +rustcrypto_api = ["cipher"] +no_simd = ["ppv-lite86/no_simd"] +simd = [] # deprecated [badges] travis-ci = { repository = "cryptocorrosion/cryptocorrosion" } diff --git a/stream-ciphers/chacha/benches/chacha20.rs b/stream-ciphers/chacha/benches/chacha20.rs index b8d7c228..df70c1db 100644 --- a/stream-ciphers/chacha/benches/chacha20.rs +++ b/stream-ciphers/chacha/benches/chacha20.rs @@ -18,3 +18,15 @@ pub fn stream_10k(b: &mut Bencher) { }); b.bytes = 10240; } + +#[bench] +pub fn stream_narrow_10k(b: &mut Bencher) { + let mut state = ChaCha20::new_var(&[0; 32], &[0; 8]).unwrap(); + let mut result = [0; 192]; + b.iter(|| { + for _ in 0..10 { + state.apply_keystream(&mut result) + } + }); + b.bytes = 1920; +} diff --git a/stream-ciphers/chacha/src/guts.rs b/stream-ciphers/chacha/src/guts.rs index 394aab48..cf0dd000 100644 --- a/stream-ciphers/chacha/src/guts.rs +++ b/stream-ciphers/chacha/src/guts.rs @@ -1,8 +1,10 @@ #[cfg(feature = "rustcrypto_api")] -pub use stream_cipher::generic_array; +pub use cipher::generic_array; pub use ppv_lite86::Machine; -use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4}; +use ppv_lite86::{ + vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext, +}; pub(crate) const BLOCK: usize = 64; pub(crate) const BLOCK64: u64 = BLOCK as u64; @@ -11,14 +13,16 @@ const BUFBLOCKS: u64 = 1 << LOG2_BUFBLOCKS; pub(crate) const BUFSZ64: u64 = BLOCK64 * BUFBLOCKS; pub(crate) const BUFSZ: usize = BUFSZ64 as usize; -#[derive(Clone)] +/// Parameters of a ChaCha stream, including fixed parameters and current position. +#[derive(Clone, PartialEq, Eq)] pub struct ChaCha { pub(crate) b: vec128_storage, pub(crate) c: vec128_storage, pub(crate) d: vec128_storage, } -#[derive(Clone)] +/// Working state of a ChaCha stream. +#[derive(Clone, PartialEq, Eq)] pub struct State { pub(crate) a: V, pub(crate) b: V, @@ -41,23 +45,56 @@ pub(crate) fn round(mut x: State) -> State { #[inline(always)] pub(crate) fn diagonalize(mut x: State) -> State { - x.b = x.b.shuffle_lane_words3012(); - x.c = x.c.shuffle_lane_words2301(); - x.d = x.d.shuffle_lane_words1230(); + // Since b has the critical data dependency, avoid rotating b to hide latency. + // + // The order of these statements is important for performance on pre-AVX2 Intel machines, which + // are throughput-bound and operating near their superscalar limits during refill_wide. The + // permutations here and in undiagonalize have been found in testing on Nehalem to be optimal. + x.a = x.a.shuffle_lane_words1230(); + x.c = x.c.shuffle_lane_words3012(); + x.d = x.d.shuffle_lane_words2301(); x } + #[inline(always)] pub(crate) fn undiagonalize(mut x: State) -> State { - x.b = x.b.shuffle_lane_words1230(); - x.c = x.c.shuffle_lane_words2301(); - x.d = x.d.shuffle_lane_words3012(); + // The order of these statements is magic. See comment in diagonalize. + x.c = x.c.shuffle_lane_words1230(); + x.d = x.d.shuffle_lane_words2301(); + x.a = x.a.shuffle_lane_words3012(); x } impl ChaCha { - #[inline(always)] pub fn new(key: &[u8; 32], nonce: &[u8]) -> Self { - init_chacha(key, nonce) + let ctr_nonce = [ + 0, + if nonce.len() == 12 { + read_u32le(&nonce[0..4]) + } else { + 0 + }, + read_u32le(&nonce[nonce.len() - 8..nonce.len() - 4]), + read_u32le(&nonce[nonce.len() - 4..]), + ]; + let key0 = [ + read_u32le(&key[0..4]), + read_u32le(&key[4..8]), + read_u32le(&key[8..12]), + read_u32le(&key[12..16]), + ]; + let key1 = [ + read_u32le(&key[16..20]), + read_u32le(&key[20..24]), + read_u32le(&key[24..28]), + read_u32le(&key[28..32]), + ]; + + ChaCha { + b: key0.into(), + c: key1.into(), + d: ctr_nonce.into(), + } } #[inline(always)] @@ -120,17 +157,86 @@ impl ChaCha { refill_narrow_rounds(self, drounds) } - #[inline(always)] + #[inline] pub fn set_stream_param(&mut self, param: u32, value: u64) { - set_stream_param(self, param, value) + let mut d: [u32; 4] = self.d.into(); + let p0 = ((param << 1) | 1) as usize; + let p1 = (param << 1) as usize; + d[p0] = (value >> 32) as u32; + d[p1] = value as u32; + self.d = d.into(); } - #[inline(always)] + #[inline] pub fn get_stream_param(&self, param: u32) -> u64 { - get_stream_param(self, param) + let d: [u32; 4] = self.d.into(); + let p0 = ((param << 1) | 1) as usize; + let p1 = (param << 1) as usize; + ((d[p0] as u64) << 32) | d[p1] as u64 + } + + /// Return whether rhs represents the same stream, irrespective of current 32-bit position. + #[inline] + pub fn stream32_eq(&self, rhs: &Self) -> bool { + let self_d: [u32; 4] = self.d.into(); + let rhs_d: [u32; 4] = rhs.d.into(); + self.b == rhs.b + && self.c == rhs.c + && self_d[3] == rhs_d[3] + && self_d[2] == rhs_d[2] + && self_d[1] == rhs_d[1] + } + + /// Return whether rhs represents the same stream, irrespective of current 64-bit position. + #[inline] + pub fn stream64_eq(&self, rhs: &Self) -> bool { + let self_d: [u32; 4] = self.d.into(); + let rhs_d: [u32; 4] = rhs.d.into(); + self.b == rhs.b && self.c == rhs.c && self_d[3] == rhs_d[3] && self_d[2] == rhs_d[2] } } +// This implementation is platform-independent. +#[inline(always)] +#[cfg(target_endian = "big")] +fn add_pos(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 { + let pos0 = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64; + let pos = pos0.wrapping_add(i); + d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0) +} +#[inline(always)] +#[cfg(target_endian = "big")] +fn d0123(m: Mach, d: vec128_storage) -> Mach::u32x4x4 { + let d0: Mach::u32x4 = m.unpack(d); + let mut pos = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64; + pos = pos.wrapping_add(1); + let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); + pos = pos.wrapping_add(1); + let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); + pos = pos.wrapping_add(1); + let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); + Mach::u32x4x4::from_lanes([d0, d1, d2, d3]) +} + +// Pos is packed into the state vectors as a little-endian u64, +// so on LE platforms we can use native vector ops to increment it. +#[inline(always)] +#[cfg(target_endian = "little")] +fn add_pos(m: Mach, d: Mach::u32x4, i: u64) -> Mach::u32x4 { + let d0: Mach::u64x2 = m.unpack(d.into()); + let incr = m.vec([i, 0]); + m.unpack((d0 + incr).into()) +} +#[inline(always)] +#[cfg(target_endian = "little")] +fn d0123(m: Mach, d: vec128_storage) -> Mach::u32x4x4 { + let d0: Mach::u64x2 = m.unpack(d); + let incr = + Mach::u64x2x4::from_lanes([m.vec([0, 0]), m.vec([1, 0]), m.vec([2, 0]), m.vec([3, 0])]); + m.unpack((Mach::u64x2x4::from_lanes([d0, d0, d0, d0]) + incr).into()) +} + +#[allow(clippy::many_single_char_names)] #[inline(always)] fn refill_wide_impl( m: Mach, @@ -139,55 +245,30 @@ fn refill_wide_impl( out: &mut [u8; BUFSZ], ) { let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]); - let mut pos = state.pos64(m); - let d0: Mach::u32x4 = m.unpack(state.d); - pos += 1; - let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - pos += 1; - let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - pos += 1; - let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - let b = m.unpack(state.b); let c = m.unpack(state.c); let mut x = State { a: Mach::u32x4x4::from_lanes([k, k, k, k]), b: Mach::u32x4x4::from_lanes([b, b, b, b]), c: Mach::u32x4x4::from_lanes([c, c, c, c]), - d: m.unpack(Mach::u32x4x4::from_lanes([d0, d1, d2, d3]).into()), + d: d0123(m, state.d), }; for _ in 0..drounds { x = round(x); x = undiagonalize(round(diagonalize(x))); } - let mut pos = state.pos64(m); - let d0: Mach::u32x4 = m.unpack(state.d); - pos += 1; - let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - pos += 1; - let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - pos += 1; - let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - pos += 1; - let d4 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0); - - let (a, b, c, d) = ( - x.a.to_lanes(), - x.b.to_lanes(), - x.c.to_lanes(), - x.d.to_lanes(), - ); + let kk = Mach::u32x4x4::from_lanes([k, k, k, k]); let sb = m.unpack(state.b); + let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]); let sc = m.unpack(state.c); - let sd = [m.unpack(state.d), d1, d2, d3]; - state.d = d4.into(); - let mut words = out.chunks_exact_mut(16); - for ((((&a, &b), &c), &d), &sd) in a.iter().zip(&b).zip(&c).zip(&d).zip(&sd) { - (a + k).write_le(words.next().unwrap()); - (b + sb).write_le(words.next().unwrap()); - (c + sc).write_le(words.next().unwrap()); - (d + sd).write_le(words.next().unwrap()); - } + let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]); + let sd = d0123(m, state.d); + let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd); + results.0.write_le(&mut out[0..64]); + results.1.write_le(&mut out[64..128]); + results.2.write_le(&mut out[128..192]); + results.3.write_le(&mut out[192..256]); + state.d = add_pos(m, sd.to_lanes()[0], 4).into(); } dispatch!(m, Mach, { @@ -196,7 +277,7 @@ dispatch!(m, Mach, { } }); -/// Refill the buffer from a single-block round, updating the block count. +// Refill the buffer from a single-block round, updating the block count. dispatch_light128!(m, Mach, { fn refill_narrow(state: &mut ChaCha, drounds: u32, out: &mut [u8; BLOCK]) { let x = refill_narrow_rounds(state, drounds); @@ -211,8 +292,8 @@ dispatch_light128!(m, Mach, { } }); -/// Single-block, rounds-only; shared by try_apply_keystream for tails shorter than BUFSZ -/// and XChaCha's setup step. +// Single-block, rounds-only; shared by try_apply_keystream for tails shorter than BUFSZ +// and XChaCha's setup step. dispatch!(m, Mach, { fn refill_narrow_rounds(state: &mut ChaCha, drounds: u32) -> State { let k: Mach::u32x4 = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]); @@ -235,50 +316,11 @@ dispatch!(m, Mach, { } }); -dispatch_light128!(m, Mach, { - fn set_stream_param(state: &mut ChaCha, param: u32, value: u64) { - let d: Mach::u32x4 = m.unpack(state.d); - state.d = d - .insert((value >> 32) as u32, (param << 1) | 1) - .insert(value as u32, param << 1) - .into(); - } -}); - -dispatch_light128!(m, Mach, { - fn get_stream_param(state: &ChaCha, param: u32) -> u64 { - let d: Mach::u32x4 = m.unpack(state.d); - ((d.extract((param << 1) | 1) as u64) << 32) | d.extract(param << 1) as u64 - } -}); - fn read_u32le(xs: &[u8]) -> u32 { assert_eq!(xs.len(), 4); u32::from(xs[0]) | (u32::from(xs[1]) << 8) | (u32::from(xs[2]) << 16) | (u32::from(xs[3]) << 24) } -dispatch_light128!(m, Mach, { - fn init_chacha(key: &[u8; 32], nonce: &[u8]) -> ChaCha { - let ctr_nonce = [ - 0, - if nonce.len() == 12 { - read_u32le(&nonce[0..4]) - } else { - 0 - }, - read_u32le(&nonce[nonce.len() - 8..nonce.len() - 4]), - read_u32le(&nonce[nonce.len() - 4..]), - ]; - let key0: Mach::u32x4 = m.read_le(&key[..16]); - let key1: Mach::u32x4 = m.read_le(&key[16..]); - ChaCha { - b: key0.into(), - c: key1.into(), - d: ctr_nonce.into(), - } - } -}); - dispatch_light128!(m, Mach, { fn init_chacha_x(key: &[u8; 32], nonce: &[u8; 24], rounds: u32) -> ChaCha { let key0: Mach::u32x4 = m.read_le(&key[..16]); @@ -297,3 +339,25 @@ dispatch_light128!(m, Mach, { state } }); + +#[cfg(test)] +mod tests { + use super::*; + + /// Basic check that streamXX_eq is block-count invariant + #[test] + fn test_stream_eq() { + let key = hex!("fa44478c59ca70538e3549096ce8b523232c50d9e8e8d10c203ef6c8d07098a5"); + let nonce = hex!("8d3a0d6d7827c00701020304"); + let mut a = ChaCha::new(&key, &nonce); + let b = a.clone(); + let mut out = [0u8; BLOCK]; + assert!(a == b); + assert!(a.stream32_eq(&b)); + assert!(a.stream64_eq(&b)); + a.refill(0, &mut out); + assert!(a != b); + assert!(a.stream32_eq(&b)); + assert!(a.stream64_eq(&b)); + } +} diff --git a/stream-ciphers/chacha/src/lib.rs b/stream-ciphers/chacha/src/lib.rs index 363a1a7b..f064a271 100644 --- a/stream-ciphers/chacha/src/lib.rs +++ b/stream-ciphers/chacha/src/lib.rs @@ -4,6 +4,8 @@ //! //! Stream-cipher usage: //! ``` +//! #[cfg(features = "std")] +//! fn demo() { //! extern crate c2_chacha; //! //! use c2_chacha::stream_cipher::{NewStreamCipher, SyncStreamCipher, SyncStreamCipherSeek}; @@ -26,6 +28,7 @@ //! for chunk in buffer.chunks_mut(3) { //! cipher.apply_keystream(chunk); //! } +//! } //! ``` #![cfg_attr(not(feature = "std"), no_std)] @@ -42,4 +45,6 @@ pub mod guts; #[cfg(feature = "rustcrypto_api")] mod rustcrypto_impl; #[cfg(feature = "rustcrypto_api")] -pub use self::rustcrypto_impl::{stream_cipher, ChaCha12, ChaCha20, ChaCha8, Ietf, XChaCha20}; +pub use self::rustcrypto_impl::{ + ChaCha12, ChaCha20, ChaCha8, Ietf, XChaCha12, XChaCha20, XChaCha8, +}; diff --git a/stream-ciphers/chacha/src/rustcrypto_impl.rs b/stream-ciphers/chacha/src/rustcrypto_impl.rs index ce74a63f..2e4e2188 100644 --- a/stream-ciphers/chacha/src/rustcrypto_impl.rs +++ b/stream-ciphers/chacha/src/rustcrypto_impl.rs @@ -1,10 +1,10 @@ -use byteorder::{ByteOrder, LE}; -use core::cmp; use crate::guts::generic_array::typenum::{Unsigned, U10, U12, U24, U32, U4, U6, U8}; use crate::guts::generic_array::{ArrayLength, GenericArray}; use crate::guts::{ChaCha, Machine, BLOCK, BLOCK64, BUFSZ}; -pub use stream_cipher; -use stream_cipher::{LoopError, NewStreamCipher, SyncStreamCipher, SyncStreamCipherSeek}; +use cipher::errors::{LoopError, OverflowError}; +use cipher::{NewCipher, SeekNum, StreamCipher, StreamCipherSeek}; +use core::cmp; +use core::convert::TryInto; const BIG_LEN: u64 = 0; const SMALL_LEN: u64 = 1 << 32; @@ -176,7 +176,7 @@ impl ChaChaAny { } } -impl NewStreamCipher for ChaChaAny +impl NewCipher for ChaChaAny where NonceSize: Unsigned + ArrayLength + Default, Rounds: Default, @@ -192,7 +192,7 @@ where } } -impl NewStreamCipher for ChaChaAny { +impl NewCipher for ChaChaAny { type KeySize = U32; type NonceSize = U24; #[inline] @@ -204,18 +204,20 @@ impl NewStreamCipher for ChaChaAny { } } -impl SyncStreamCipherSeek for ChaChaAny { +impl StreamCipherSeek for ChaChaAny { #[inline] - fn current_pos(&self) -> u64 { + fn try_current_pos(&self) -> Result { unimplemented!() } #[inline(always)] - fn seek(&mut self, ct: u64) { - Self::seek(self, ct) + fn try_seek(&mut self, pos: T) -> Result<(), LoopError> { + pos.try_into() + .map_err(|_| LoopError) + .map(|ct| Self::seek(self, ct)) } } -impl SyncStreamCipher for ChaChaAny { +impl StreamCipher for ChaChaAny { #[inline] fn try_apply_keystream(&mut self, data: &mut [u8]) -> Result<(), LoopError> { Self::try_apply_keystream(self, data).map_err(|_| LoopError) @@ -241,12 +243,12 @@ dispatch_light128!(m, Mach, { let ctr_nonce = [ 0, if nonce.len() == 12 { - LE::read_u32(&nonce[0..4]) + u32::from_le_bytes(nonce[0..4].try_into().unwrap()) } else { 0 }, - LE::read_u32(&nonce[nonce.len() - 8..nonce.len() - 4]), - LE::read_u32(&nonce[nonce.len() - 4..]), + u32::from_le_bytes(nonce[nonce.len() - 8..nonce.len() - 4].try_into().unwrap()), + u32::from_le_bytes(nonce[nonce.len() - 4..].try_into().unwrap()), ]; let key0: Mach::u32x4 = m.read_le(&key[..16]); let key1: Mach::u32x4 = m.read_le(&key[16..]); @@ -276,8 +278,8 @@ dispatch_light128!(m, Mach, { let ctr_nonce1 = [ 0, 0, - LE::read_u32(&nonce[16..20]), - LE::read_u32(&nonce[20..24]), + u32::from_le_bytes(nonce[16..20].try_into().unwrap()), + u32::from_le_bytes(nonce[20..24].try_into().unwrap()), ]; state.b = x.a; state.c = x.d; @@ -288,12 +290,18 @@ dispatch_light128!(m, Mach, { /// IETF RFC 7539 ChaCha. Unsuitable for messages longer than 256 GiB. pub type Ietf = ChaChaAny; -/// ChaCha20, as used in several standards; from Bernstein's original publication. -pub type ChaCha20 = ChaChaAny; -/// Similar to ChaCha20, but with fewer rounds for higher performance. -pub type ChaCha12 = ChaChaAny; /// Similar to ChaCha20, but with fewer rounds for higher performance. pub type ChaCha8 = ChaChaAny; +/// Similar to ChaCha20, but with fewer rounds for higher performance. +pub type ChaCha12 = ChaChaAny; +/// ChaCha20, as used in several standards; from Bernstein's original publication. +pub type ChaCha20 = ChaChaAny; +/// Constructed analogously to XChaCha20, but with fewer rounds for higher performance; +/// mixes during initialization to support both a long nonce and a full-length (64-bit) block counter. +pub type XChaCha8 = ChaChaAny; +/// Constructed analogously to XChaCha20, but with fewer rounds for higher performance; +/// mixes during initialization to support both a long nonce and a full-length (64-bit) block counter. +pub type XChaCha12 = ChaChaAny; /// Constructed analogously to XSalsa20; mixes during initialization to support both a long nonce /// and a full-length (64-bit) block counter. pub type XChaCha20 = ChaChaAny; diff --git a/test_alt_simd.sh b/test_alt_simd.sh deleted file mode 100755 index 52c05d81..00000000 --- a/test_alt_simd.sh +++ /dev/null @@ -1,20 +0,0 @@ -#!/bin/sh - -if [ -n "$FAILFAST" ]; then set -e; fi - -# no SIMD yet: -# - hashes/threefish -# - block-ciphers/skein - -# not ported to crypto-simd API yet: -# - hashes/groestl - -echo BACKEND ppv-null -cd hashes/blake; cargo test --no-default-features; cd ../.. -cd hashes/jh; cargo test --no-default-features; cd ../.. -cd stream-ciphers/chacha; cargo test --no-default-features; cd ../.. - -echo BACKEND packed_simd -cd hashes/blake; cargo test --no-default-features --features packed_simd,std; cd ../.. -cd hashes/jh; cargo test -p jh-x86_64 --no-default-features --features packed_simd,std; cd ../.. -cd stream-ciphers/chacha; cargo test --no-default-features --features packed_simd,std; cd ../.. diff --git a/test_stable.sh b/test_stable.sh index b0048dc9..fda2e9d7 100755 --- a/test_stable.sh +++ b/test_stable.sh @@ -1,4 +1,4 @@ #!/bin/sh +cargo +1.32.0-x86_64-unknown-linux-gnu test cargo +stable test -cargo +1.31.1-x86_64-unknown-linux-gnu test diff --git a/unreleased-changes.sh b/unreleased-changes.sh new file mode 100755 index 00000000..e7d82105 --- /dev/null +++ b/unreleased-changes.sh @@ -0,0 +1,23 @@ +#!/bin/sh + +CRATE_TAGS=$(git tag | sed 's/-[^-]*$//' | uniq) +CARGOS=$(echo */*/Cargo.toml) + +for TAG in $CRATE_TAGS; do + LATEST_TAG=$(git tag | grep $TAG | sort -V | tail -n1) + FOUND_CARGO= + for CARGO in $CARGOS; do + # fixups for tag names that don't match crate names + CRATE=$(echo $TAG | sed -e 's/^jh$/jh-x86_64/' -e 's/^blake$/blake-hash/' -e 's/^groestl$/groestl-aesni/') + if grep -q "name = \"$CRATE\"" $CARGO; then + FOUND_CARGO=1 + DIR=$(dirname $CARGO) + git log --color=always --stat $LATEST_TAG..HEAD -- $DIR | less -R + break + fi + done + if [ -z "$FOUND_CARGO" ]; then + echo "Couldn't find a Cargo.toml for $TAG!" + fi +done + diff --git a/utils-simd/crypto-simd/Cargo.toml b/utils-simd/crypto-simd/Cargo.toml index 9b3858ef..b07c3e7c 100644 --- a/utils-simd/crypto-simd/Cargo.toml +++ b/utils-simd/crypto-simd/Cargo.toml @@ -2,12 +2,13 @@ name = "crypto-simd" version = "0.2.0" authors = ["The CryptoCorrosion Contributors"] -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" description = "Crypto-oriented SIMD wrapper abstracting over multiple backends" repository = "https://github.com/cryptocorrosion/cryptocorrosion" keywords = ["crypto", "simd"] categories = ["cryptography", "no-std"] +rust-version = "1.61" [dependencies] packed_simd_crate = { package = "packed_simd", version = "0.3", optional = true } diff --git a/utils-simd/ppv-lite86/CHANGELOG.md b/utils-simd/ppv-lite86/CHANGELOG.md new file mode 100644 index 00000000..6e34be39 --- /dev/null +++ b/utils-simd/ppv-lite86/CHANGELOG.md @@ -0,0 +1,10 @@ +# Changelog +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), +and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). + +## [0.2.16] +### Added +- add [u64; 4] conversion for generic vec256, to support BLAKE on non-x86. +- impl `From` (rather than just `Into`) for conversions between `*_storage` types and arrays. diff --git a/utils-simd/ppv-lite86/Cargo.toml b/utils-simd/ppv-lite86/Cargo.toml index b1c0c0f5..feb8cc2c 100644 --- a/utils-simd/ppv-lite86/Cargo.toml +++ b/utils-simd/ppv-lite86/Cargo.toml @@ -1,20 +1,23 @@ [package] name = "ppv-lite86" -version = "0.2.6" +version = "0.2.20" authors = ["The CryptoCorrosion Contributors"] -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" description = "Implementation of the crypto-simd API for x86" repository = "https://github.com/cryptocorrosion/cryptocorrosion" keywords = ["crypto", "simd", "x86"] categories = ["cryptography", "no-std"] +rust-version = "1.61" [dependencies] +zerocopy = { version = "0.7", features = ["simd", "derive"] } [badges] travis-ci = { repository = "cryptocorrosion/cryptocorrosion" } [features] -default = ["std", "simd"] +default = ["std"] std = [] -simd = [] +simd = [] # deprecated +no_simd = [] diff --git a/utils-simd/ppv-lite86/src/generic.rs b/utils-simd/ppv-lite86/src/generic.rs index 2d0a74cf..8989482a 100644 --- a/utils-simd/ppv-lite86/src/generic.rs +++ b/utils-simd/ppv-lite86/src/generic.rs @@ -1,22 +1,54 @@ #![allow(non_camel_case_types)] -use core::ops::*; use crate::soft::{x2, x4}; use crate::types::*; +use core::ops::*; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; -#[derive(Clone, Copy)] +#[repr(C)] +#[derive(Clone, Copy, FromBytes, AsBytes, FromZeroes)] pub union vec128_storage { d: [u32; 4], q: [u64; 2], - o: [u128; 1], } impl From<[u32; 4]> for vec128_storage { - #[inline] + #[inline(always)] fn from(d: [u32; 4]) -> Self { Self { d } } } -#[derive(Clone, Copy)] +impl From for [u32; 4] { + #[inline(always)] + fn from(d: vec128_storage) -> Self { + unsafe { d.d } + } +} +impl From<[u64; 2]> for vec128_storage { + #[inline(always)] + fn from(q: [u64; 2]) -> Self { + Self { q } + } +} +impl From for [u64; 2] { + #[inline(always)] + fn from(q: vec128_storage) -> Self { + unsafe { q.q } + } +} +impl Default for vec128_storage { + #[inline(always)] + fn default() -> Self { + Self { q: [0, 0] } + } +} +impl Eq for vec128_storage {} +impl PartialEq for vec128_storage { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { self.q == rhs.q } + } +} +#[derive(Clone, Copy, PartialEq, Eq, Default)] pub struct vec256_storage { v128: [vec128_storage; 2], } @@ -30,7 +62,23 @@ impl vec256_storage { self.v128 } } -#[derive(Clone, Copy)] +impl From for [u64; 4] { + #[inline(always)] + fn from(q: vec256_storage) -> Self { + let [a, b]: [u64; 2] = q.v128[0].into(); + let [c, d]: [u64; 2] = q.v128[1].into(); + [a, b, c, d] + } +} +impl From<[u64; 4]> for vec256_storage { + #[inline(always)] + fn from([a, b, c, d]: [u64; 4]) -> Self { + Self { + v128: [[a, b].into(), [c, d].into()], + } + } +} +#[derive(Clone, Copy, PartialEq, Eq, Default)] pub struct vec512_storage { v128: [vec128_storage; 4], } @@ -45,6 +93,7 @@ impl vec512_storage { } } +#[inline(always)] fn dmap(t: T, f: F) -> T where T: Store + Into, @@ -78,6 +127,7 @@ where unsafe { T::unpack(d) } } +#[inline(always)] fn qmap(t: T, f: F) -> T where T: Store + Into, @@ -91,6 +141,7 @@ where unsafe { T::unpack(q) } } +#[inline(always)] fn qmap2(a: T, b: T, f: F) -> T where T: Store + Into, @@ -106,17 +157,29 @@ where unsafe { T::unpack(q) } } +#[inline(always)] +fn o_of_q(q: [u64; 2]) -> u128 { + u128::from(q[0]) | (u128::from(q[1]) << 64) +} + +#[inline(always)] +fn q_of_o(o: u128) -> [u64; 2] { + [o as u64, (o >> 64) as u64] +} + +#[inline(always)] fn omap(a: T, f: F) -> T where T: Store + Into, F: Fn(u128) -> u128, { let a: vec128_storage = a.into(); - let ao = unsafe { a.o }; - let o = vec128_storage { o: [f(ao[0])] }; + let ao = o_of_q(unsafe { a.q }); + let o = vec128_storage { q: q_of_o(f(ao)) }; unsafe { T::unpack(o) } } +#[inline(always)] fn omap2(a: T, b: T, f: F) -> T where T: Store + Into, @@ -124,10 +187,10 @@ where { let a: vec128_storage = a.into(); let b: vec128_storage = b.into(); - let ao = unsafe { a.o }; - let bo = unsafe { b.o }; + let ao = o_of_q(unsafe { a.q }); + let bo = o_of_q(unsafe { b.q }); let o = vec128_storage { - o: [f(ao[0], bo[0])], + q: q_of_o(f(ao, bo)), }; unsafe { T::unpack(o) } } @@ -200,39 +263,39 @@ macro_rules! impl_bitops { } impl Swap64 for $vec { - #[inline] + #[inline(always)] fn swap1(self) -> Self { qmap(self, |x| { ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1) }) } - #[inline] + #[inline(always)] fn swap2(self) -> Self { qmap(self, |x| { ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2) }) } - #[inline] + #[inline(always)] fn swap4(self) -> Self { qmap(self, |x| { ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4) }) } - #[inline] + #[inline(always)] fn swap8(self) -> Self { qmap(self, |x| { ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8) }) } - #[inline] + #[inline(always)] fn swap16(self) -> Self { dmap(self, |x| x.rotate_left(16)) } - #[inline] + #[inline(always)] fn swap32(self) -> Self { qmap(self, |x| x.rotate_left(32)) } - #[inline] + #[inline(always)] fn swap64(self) -> Self { omap(self, |x| (x << 64) | (x >> 64)) } @@ -244,82 +307,83 @@ impl_bitops!(u64x2_generic); impl_bitops!(u128x1_generic); impl RotateEachWord32 for u32x4_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { dmap(self, |x| x.rotate_right(7)) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { dmap(self, |x| x.rotate_right(8)) } - #[inline] + #[inline(always)] fn rotate_each_word_right11(self) -> Self { dmap(self, |x| x.rotate_right(11)) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { dmap(self, |x| x.rotate_right(12)) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { dmap(self, |x| x.rotate_right(16)) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { dmap(self, |x| x.rotate_right(20)) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { dmap(self, |x| x.rotate_right(24)) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { dmap(self, |x| x.rotate_right(25)) } } impl RotateEachWord32 for u64x2_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { qmap(self, |x| x.rotate_right(7)) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { qmap(self, |x| x.rotate_right(8)) } - #[inline] + #[inline(always)] fn rotate_each_word_right11(self) -> Self { qmap(self, |x| x.rotate_right(11)) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { qmap(self, |x| x.rotate_right(12)) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { qmap(self, |x| x.rotate_right(16)) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { qmap(self, |x| x.rotate_right(20)) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { qmap(self, |x| x.rotate_right(24)) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { qmap(self, |x| x.rotate_right(25)) } } impl RotateEachWord64 for u64x2_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right32(self) -> Self { qmap(self, |x| x.rotate_right(32)) } } // workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web) +#[inline(always)] fn rotate_u128_right(x: u128, i: u32) -> u128 { (x >> i) | (x << (128 - i)) } @@ -330,41 +394,41 @@ fn test_rotate_u128() { } impl RotateEachWord32 for u128x1_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right7(self) -> Self { Self([rotate_u128_right(self.0[0], 7)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right8(self) -> Self { Self([rotate_u128_right(self.0[0], 8)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right11(self) -> Self { Self([rotate_u128_right(self.0[0], 11)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right12(self) -> Self { Self([rotate_u128_right(self.0[0], 12)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right16(self) -> Self { Self([rotate_u128_right(self.0[0], 16)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right20(self) -> Self { Self([rotate_u128_right(self.0[0], 20)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right24(self) -> Self { Self([rotate_u128_right(self.0[0], 24)]) } - #[inline] + #[inline(always)] fn rotate_each_word_right25(self) -> Self { Self([rotate_u128_right(self.0[0], 25)]) } } impl RotateEachWord64 for u128x1_generic { - #[inline] + #[inline(always)] fn rotate_each_word_right32(self) -> Self { Self([rotate_u128_right(self.0[0], 32)]) } @@ -383,17 +447,20 @@ impl Machine for GenericMachine { type u32x4x4 = u32x4x4_generic; type u64x2x4 = u64x2x4_generic; type u128x4 = u128x4_generic; - #[inline] + #[inline(always)] unsafe fn instance() -> Self { Self } } -#[derive(Copy, Clone, Debug, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)] +#[repr(transparent)] pub struct u32x4_generic([u32; 4]); -#[derive(Copy, Clone, Debug, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)] +#[repr(transparent)] pub struct u64x2_generic([u64; 2]); -#[derive(Copy, Clone, Debug, PartialEq)] +#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)] +#[repr(transparent)] pub struct u128x1_generic([u128; 1]); impl From for vec128_storage { @@ -411,7 +478,7 @@ impl From for vec128_storage { impl From for vec128_storage { #[inline(always)] fn from(o: u128x1_generic) -> Self { - Self { o: o.0 } + Self { q: q_of_o(o.0[0]) } } } @@ -430,7 +497,7 @@ impl Store for u64x2_generic { impl Store for u128x1_generic { #[inline(always)] unsafe fn unpack(s: vec128_storage) -> Self { - Self(s.o) + Self([o_of_q(s.q); 1]) } } @@ -498,53 +565,45 @@ impl BSwap for u128x1_generic { impl StoreBytes for u32x4_generic { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { - assert_eq!(input.len(), 16); - let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16])); + let x = u32x4_generic::read_from(input).unwrap(); dmap(x, |x| x.to_le()) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - assert_eq!(input.len(), 16); - let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16])); + let x = u32x4_generic::read_from(input).unwrap(); dmap(x, |x| x.to_be()) } #[inline(always)] fn write_le(self, out: &mut [u8]) { - assert_eq!(out.len(), 16); let x = dmap(self, |x| x.to_le()); - unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) } + x.write_to(out).unwrap(); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - assert_eq!(out.len(), 16); let x = dmap(self, |x| x.to_be()); - unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) } + x.write_to(out).unwrap(); } } impl StoreBytes for u64x2_generic { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { - assert_eq!(input.len(), 16); - let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16])); + let x = u64x2_generic::read_from(input).unwrap(); qmap(x, |x| x.to_le()) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - assert_eq!(input.len(), 16); - let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16])); + let x = u64x2_generic::read_from(input).unwrap(); qmap(x, |x| x.to_be()) } #[inline(always)] fn write_le(self, out: &mut [u8]) { - assert_eq!(out.len(), 16); let x = qmap(self, |x| x.to_le()); - unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) } + x.write_to(out).unwrap(); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - assert_eq!(out.len(), 16); let x = qmap(self, |x| x.to_be()); - unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) } + x.write_to(out).unwrap(); } } @@ -560,6 +619,22 @@ pub type u32x4x4_generic = x4; pub type u64x2x4_generic = x4; pub type u128x4_generic = x4; +impl Vector<[u32; 16]> for u32x4x4_generic { + fn to_scalars(self) -> [u32; 16] { + let [a, b, c, d] = self.0; + let a = a.0; + let b = b.0; + let c = c.0; + let d = d.0; + [ + a[0], a[1], a[2], a[3], // + b[0], b[1], b[2], b[3], // + c[0], c[1], c[2], c[3], // + d[0], d[1], d[2], d[3], // + ] + } +} + impl MultiLane<[u32; 4]> for u32x4_generic { #[inline(always)] fn to_lanes(self) -> [u32; 4] { @@ -700,7 +775,7 @@ impl u128x4 for u128x4_generic {} #[macro_export] macro_rules! dispatch { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -717,7 +792,7 @@ macro_rules! dispatch { #[macro_export] macro_rules! dispatch_light128 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -734,7 +809,7 @@ macro_rules! dispatch_light128 { #[macro_export] macro_rules! dispatch_light256 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] @@ -751,7 +826,7 @@ macro_rules! dispatch_light256 { #[macro_export] macro_rules! dispatch_light512 { ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => { - #[inline] + #[inline(always)] $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret { let $mach = unsafe { $crate::generic::GenericMachine::instance() }; #[inline(always)] diff --git a/utils-simd/ppv-lite86/src/lib.rs b/utils-simd/ppv-lite86/src/lib.rs index 43dc5d86..311df97b 100644 --- a/utils-simd/ppv-lite86/src/lib.rs +++ b/utils-simd/ppv-lite86/src/lib.rs @@ -9,14 +9,34 @@ mod soft; mod types; pub use self::types::*; -#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))] +#[cfg(all( + target_arch = "x86_64", + target_feature = "sse2", + not(feature = "no_simd"), + not(miri) +))] pub mod x86_64; -#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))] +#[cfg(all( + target_arch = "x86_64", + target_feature = "sse2", + not(feature = "no_simd"), + not(miri) +))] use self::x86_64 as arch; -#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))] +#[cfg(any( + feature = "no_simd", + miri, + not(target_arch = "x86_64"), + all(target_arch = "x86_64", not(target_feature = "sse2")) +))] pub mod generic; -#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))] +#[cfg(any( + feature = "no_simd", + miri, + not(target_arch = "x86_64"), + all(target_arch = "x86_64", not(target_feature = "sse2")) +))] use self::generic as arch; pub use self::arch::{vec128_storage, vec256_storage, vec512_storage}; diff --git a/utils-simd/ppv-lite86/src/soft.rs b/utils-simd/ppv-lite86/src/soft.rs index d12dac52..b2cf0e19 100644 --- a/utils-simd/ppv-lite86/src/soft.rs +++ b/utils-simd/ppv-lite86/src/soft.rs @@ -1,11 +1,13 @@ //! Implement 256- and 512- bit in terms of 128-bit, for machines without native wide SIMD. -use core::marker::PhantomData; -use core::ops::*; use crate::types::*; use crate::{vec128_storage, vec256_storage, vec512_storage}; +use core::marker::PhantomData; +use core::ops::*; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; -#[derive(Copy, Clone, Default)] +#[derive(Copy, Clone, Default, FromBytes, AsBytes, FromZeroes)] +#[repr(transparent)] #[allow(non_camel_case_types)] pub struct x2(pub [W; 2], PhantomData); impl x2 { @@ -175,28 +177,53 @@ impl BSwap for x2 { impl StoreBytes for x2 { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { - let input = input.split_at(16); + let input = input.split_at(input.len() / 2); x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)]) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - x2::unsafe_read_le(input).bswap() + let input = input.split_at(input.len() / 2); + x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)]) } #[inline(always)] fn write_le(self, out: &mut [u8]) { - let out = out.split_at_mut(16); + let out = out.split_at_mut(out.len() / 2); self.0[0].write_le(out.0); self.0[1].write_le(out.1); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - let out = out.split_at_mut(16); + let out = out.split_at_mut(out.len() / 2); self.0[0].write_be(out.0); self.0[1].write_be(out.1); } } +impl LaneWords4 for x2 { + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words2301(), + self.0[1].shuffle_lane_words2301(), + ]) + } + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words1230(), + self.0[1].shuffle_lane_words1230(), + ]) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new([ + self.0[0].shuffle_lane_words3012(), + self.0[1].shuffle_lane_words3012(), + ]) + } +} -#[derive(Copy, Clone, Default)] +#[derive(Copy, Clone, Default, FromBytes, AsBytes, FromZeroes)] +#[repr(transparent)] #[allow(non_camel_case_types)] pub struct x4(pub [W; 4]); impl x4 { @@ -238,7 +265,12 @@ macro_rules! fwd_unop_x4 { ($fn:ident) => { #[inline(always)] fn $fn(self) -> Self { - x4([self.0[0].$fn(), self.0[1].$fn(), self.0[2].$fn(), self.0[3].$fn()]) + x4([ + self.0[0].$fn(), + self.0[1].$fn(), + self.0[2].$fn(), + self.0[3].$fn(), + ]) } }; } @@ -305,6 +337,20 @@ impl Vec4 for x4 { self } } +impl Vec4Ext for x4 { + #[inline(always)] + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) + where + Self: Sized, + { + ( + x4([a.0[0], b.0[0], c.0[0], d.0[0]]), + x4([a.0[1], b.0[1], c.0[1], d.0[1]]), + x4([a.0[2], b.0[2], c.0[2], d.0[2]]), + x4([a.0[3], b.0[3], c.0[3], d.0[3]]), + ) + } +} impl> Store for x4 { #[inline(always)] unsafe fn unpack(p: vec512_storage) -> Self { @@ -363,30 +409,39 @@ impl BSwap for x4 { impl StoreBytes for x4 { #[inline(always)] unsafe fn unsafe_read_le(input: &[u8]) -> Self { + let n = input.len() / 4; x4([ - W::unsafe_read_le(&input[0..16]), - W::unsafe_read_le(&input[16..32]), - W::unsafe_read_le(&input[32..48]), - W::unsafe_read_le(&input[48..64]), + W::unsafe_read_le(&input[..n]), + W::unsafe_read_le(&input[n..n * 2]), + W::unsafe_read_le(&input[n * 2..n * 3]), + W::unsafe_read_le(&input[n * 3..]), ]) } #[inline(always)] unsafe fn unsafe_read_be(input: &[u8]) -> Self { - x4::unsafe_read_le(input).bswap() + let n = input.len() / 4; + x4([ + W::unsafe_read_be(&input[..n]), + W::unsafe_read_be(&input[n..n * 2]), + W::unsafe_read_be(&input[n * 2..n * 3]), + W::unsafe_read_be(&input[n * 3..]), + ]) } #[inline(always)] fn write_le(self, out: &mut [u8]) { - self.0[0].write_le(&mut out[0..16]); - self.0[1].write_le(&mut out[16..32]); - self.0[2].write_le(&mut out[32..48]); - self.0[3].write_le(&mut out[48..64]); + let n = out.len() / 4; + self.0[0].write_le(&mut out[..n]); + self.0[1].write_le(&mut out[n..n * 2]); + self.0[2].write_le(&mut out[n * 2..n * 3]); + self.0[3].write_le(&mut out[n * 3..]); } #[inline(always)] fn write_be(self, out: &mut [u8]) { - self.0[0].write_be(&mut out[0..16]); - self.0[1].write_be(&mut out[16..32]); - self.0[2].write_be(&mut out[32..48]); - self.0[3].write_be(&mut out[48..64]); + let n = out.len() / 4; + self.0[0].write_be(&mut out[..n]); + self.0[1].write_be(&mut out[n..n * 2]); + self.0[2].write_be(&mut out[n * 2..n * 3]); + self.0[3].write_be(&mut out[n * 3..]); } } impl LaneWords4 for x4 { diff --git a/utils-simd/ppv-lite86/src/types.rs b/utils-simd/ppv-lite86/src/types.rs index 119b6bb8..f9f3bf1c 100644 --- a/utils-simd/ppv-lite86/src/types.rs +++ b/utils-simd/ppv-lite86/src/types.rs @@ -1,3 +1,4 @@ +#![allow(non_camel_case_types)] use core::ops::{Add, AddAssign, BitAnd, BitOr, BitXor, BitXorAssign, Not}; pub trait AndNot { @@ -44,182 +45,188 @@ pub trait RotateEachWord64 { pub trait RotateEachWord128 {} -#[allow(non_camel_case_types)] -mod types { - //! Vector type naming scheme: - //! uN[xP]xL - //! Unsigned; N-bit words * P bits per lane * L lanes - //! - //! A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of - //! wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and - //! slow inter-lane operations. +// Vector type naming scheme: +// uN[xP]xL +// Unsigned; N-bit words * P bits per lane * L lanes +// +// A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of +// wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and +// slow inter-lane operations. - use crate::arch::{vec128_storage, vec256_storage, vec512_storage}; - use crate::{ArithOps, BitOps128, BitOps32, BitOps64, Machine, Store, StoreBytes}; +use crate::arch::{vec128_storage, vec256_storage, vec512_storage}; - pub trait UnsafeFrom { - unsafe fn unsafe_from(t: T) -> Self; - } +#[allow(clippy::missing_safety_doc)] +pub trait UnsafeFrom { + unsafe fn unsafe_from(t: T) -> Self; +} - /// A vector composed of two elements, which may be words or themselves vectors. - pub trait Vec2 { - fn extract(self, i: u32) -> W; - fn insert(self, w: W, i: u32) -> Self; - } +/// A vector composed of two elements, which may be words or themselves vectors. +pub trait Vec2 { + fn extract(self, i: u32) -> W; + fn insert(self, w: W, i: u32) -> Self; +} - /// A vector composed of four elements, which may be words or themselves vectors. - pub trait Vec4 { - fn extract(self, i: u32) -> W; - fn insert(self, w: W, i: u32) -> Self; - } +/// A vector composed of four elements, which may be words or themselves vectors. +pub trait Vec4 { + fn extract(self, i: u32) -> W; + fn insert(self, w: W, i: u32) -> Self; +} +/// Vec4 functions which may not be implemented yet for all Vec4 types. +/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage, +/// import Vec4Ext only together with Vec4, and don't qualify its methods. +pub trait Vec4Ext { + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) + where + Self: Sized; +} +pub trait Vector { + fn to_scalars(self) -> T; +} - // TODO: multiples of 4 should inherit this - /// A vector composed of four words; depending on their size, operations may cross lanes. - pub trait Words4 { - fn shuffle1230(self) -> Self; - fn shuffle2301(self) -> Self; - fn shuffle3012(self) -> Self; - } +// TODO: multiples of 4 should inherit this +/// A vector composed of four words; depending on their size, operations may cross lanes. +pub trait Words4 { + fn shuffle1230(self) -> Self; + fn shuffle2301(self) -> Self; + fn shuffle3012(self) -> Self; +} - /// A vector composed one or more lanes each composed of four words. - pub trait LaneWords4 { - fn shuffle_lane_words1230(self) -> Self; - fn shuffle_lane_words2301(self) -> Self; - fn shuffle_lane_words3012(self) -> Self; - } +/// A vector composed one or more lanes each composed of four words. +pub trait LaneWords4 { + fn shuffle_lane_words1230(self) -> Self; + fn shuffle_lane_words2301(self) -> Self; + fn shuffle_lane_words3012(self) -> Self; +} - // TODO: make this a part of BitOps - /// Exchange neigboring ranges of bits of the specified size - pub trait Swap64 { - fn swap1(self) -> Self; - fn swap2(self) -> Self; - fn swap4(self) -> Self; - fn swap8(self) -> Self; - fn swap16(self) -> Self; - fn swap32(self) -> Self; - fn swap64(self) -> Self; - } +// TODO: make this a part of BitOps +/// Exchange neigboring ranges of bits of the specified size +pub trait Swap64 { + fn swap1(self) -> Self; + fn swap2(self) -> Self; + fn swap4(self) -> Self; + fn swap8(self) -> Self; + fn swap16(self) -> Self; + fn swap32(self) -> Self; + fn swap64(self) -> Self; +} - pub trait u32x4: - BitOps32 - + Store - + ArithOps - + Vec4 - + Words4 - + LaneWords4 - + StoreBytes - + MultiLane<[u32; 4]> - + Into - { +pub trait u32x4: + BitOps32 + + Store + + ArithOps + + Vec4 + + Words4 + + LaneWords4 + + StoreBytes + + MultiLane<[u32; 4]> + + Into +{ } - pub trait u64x2: - BitOps64 - + Store - + ArithOps - + Vec2 - + MultiLane<[u64; 2]> - + Into - { +pub trait u64x2: + BitOps64 + Store + ArithOps + Vec2 + MultiLane<[u64; 2]> + Into +{ } - pub trait u128x1: - BitOps128 + Store + Swap64 + MultiLane<[u128; 1]> + Into - { +pub trait u128x1: + BitOps128 + Store + Swap64 + MultiLane<[u128; 1]> + Into +{ } - pub trait u32x4x2: - BitOps32 - + Store - + Vec2 - + MultiLane<[M::u32x4; 2]> - + ArithOps - + Into - { +pub trait u32x4x2: + BitOps32 + + Store + + Vec2 + + MultiLane<[M::u32x4; 2]> + + ArithOps + + Into + + StoreBytes +{ } - pub trait u64x2x2: - BitOps64 - + Store - + Vec2 - + MultiLane<[M::u64x2; 2]> - + ArithOps - + StoreBytes - + Into - { +pub trait u64x2x2: + BitOps64 + + Store + + Vec2 + + MultiLane<[M::u64x2; 2]> + + ArithOps + + StoreBytes + + Into +{ } - pub trait u64x4: - BitOps64 - + Store - + Vec4 - + MultiLane<[u64; 4]> - + ArithOps - + Words4 - + StoreBytes - + Into - { +pub trait u64x4: + BitOps64 + + Store + + Vec4 + + MultiLane<[u64; 4]> + + ArithOps + + Words4 + + StoreBytes + + Into +{ } - pub trait u128x2: - BitOps128 - + Store - + Vec2 - + MultiLane<[M::u128x1; 2]> - + Swap64 - + Into - { +pub trait u128x2: + BitOps128 + + Store + + Vec2 + + MultiLane<[M::u128x1; 2]> + + Swap64 + + Into +{ } - pub trait u32x4x4: - BitOps32 - + Store - + Vec4 - + MultiLane<[M::u32x4; 4]> - + ArithOps - + LaneWords4 - + Into - { +pub trait u32x4x4: + BitOps32 + + Store + + Vec4 + + Vec4Ext + + Vector<[u32; 16]> + + MultiLane<[M::u32x4; 4]> + + ArithOps + + LaneWords4 + + Into + + StoreBytes +{ } - pub trait u64x2x4: - BitOps64 - + Store - + Vec4 - + MultiLane<[M::u64x2; 4]> - + ArithOps - + Into - { +pub trait u64x2x4: + BitOps64 + + Store + + Vec4 + + MultiLane<[M::u64x2; 4]> + + ArithOps + + Into +{ } - // TODO: Words4 - pub trait u128x4: - BitOps128 - + Store - + Vec4 - + MultiLane<[M::u128x1; 4]> - + Swap64 - + Into - { +// TODO: Words4 +pub trait u128x4: + BitOps128 + + Store + + Vec4 + + MultiLane<[M::u128x1; 4]> + + Swap64 + + Into +{ } - /// A vector composed of multiple 128-bit lanes. - pub trait MultiLane { - /// Split a multi-lane vector into single-lane vectors. - fn to_lanes(self) -> Lanes; - /// Build a multi-lane vector from individual lanes. - fn from_lanes(lanes: Lanes) -> Self; - } +/// A vector composed of multiple 128-bit lanes. +pub trait MultiLane { + /// Split a multi-lane vector into single-lane vectors. + fn to_lanes(self) -> Lanes; + /// Build a multi-lane vector from individual lanes. + fn from_lanes(lanes: Lanes) -> Self; +} - /// Combine single vectors into a multi-lane vector. - pub trait VZip { - fn vzip(self) -> V; - } +/// Combine single vectors into a multi-lane vector. +pub trait VZip { + fn vzip(self) -> V; +} - impl VZip for T - where - V: MultiLane, - { - #[inline(always)] - fn vzip(self) -> V { - V::from_lanes(self) - } +impl VZip for T +where + V: MultiLane, +{ + #[inline(always)] + fn vzip(self) -> V { + V::from_lanes(self) } } -pub use self::types::*; pub trait Machine: Sized + Copy { type u32x4: u32x4; @@ -264,15 +271,27 @@ pub trait Machine: Sized + Copy { unsafe { V::unsafe_read_be(input) } } + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. unsafe fn instance() -> Self; } pub trait Store { + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. unsafe fn unpack(p: S) -> Self; } pub trait StoreBytes { + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. unsafe fn unsafe_read_le(input: &[u8]) -> Self; + /// # Safety + /// Caller must ensure the type of Self is appropriate for the hardware of the execution + /// environment. unsafe fn unsafe_read_be(input: &[u8]) -> Self; fn write_le(self, out: &mut [u8]); fn write_be(self, out: &mut [u8]); diff --git a/utils-simd/ppv-lite86/src/x86_64/mod.rs b/utils-simd/ppv-lite86/src/x86_64/mod.rs index 39d3b900..9d22c0d6 100644 --- a/utils-simd/ppv-lite86/src/x86_64/mod.rs +++ b/utils-simd/ppv-lite86/src/x86_64/mod.rs @@ -1,7 +1,8 @@ // crate minimums: sse2, x86_64 -use core::arch::x86_64::{__m128i, __m256i}; use crate::types::*; +use core::arch::x86_64::{__m128i, __m256i}; +use zerocopy::{AsBytes, FromBytes, FromZeroes}; mod sse2; @@ -79,7 +80,7 @@ where type u64x2 = sse2::u64x2_sse2; type u128x1 = sse2::u128x1_sse2; - type u32x4x2 = sse2::u32x4x2_sse2; + type u32x4x2 = sse2::avx2::u32x4x2_avx2; type u64x2x2 = sse2::u64x2x2_sse2; type u64x4 = sse2::u64x4_sse2; type u128x2 = sse2::u128x2_sse2; @@ -106,7 +107,8 @@ pub type AVX2 = Avx2Machine; /// Converting into and out of this type should be essentially free, although it may be more /// aligned than a particular impl requires. #[allow(non_camel_case_types)] -#[derive(Copy, Clone)] +#[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)] +#[repr(C)] pub union vec128_storage { u32x4: [u32; 4], u64x2: [u64; 2], @@ -119,16 +121,16 @@ impl Store for vec128_storage { p } } -impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage { +impl<'a> From<&'a vec128_storage> for &'a [u32; 4] { #[inline(always)] - fn into(self) -> &'a [u32; 4] { - unsafe { &self.u32x4 } + fn from(x: &'a vec128_storage) -> Self { + unsafe { &x.u32x4 } } } -impl Into for [u32; 4] { +impl From<[u32; 4]> for vec128_storage { #[inline(always)] - fn into(self) -> vec128_storage { - vec128_storage { u32x4: self } + fn from(u32x4: [u32; 4]) -> Self { + vec128_storage { u32x4 } } } impl Default for vec128_storage { @@ -137,6 +139,13 @@ impl Default for vec128_storage { vec128_storage { u128x1: [0] } } } +impl Eq for vec128_storage {} +impl PartialEq for vec128_storage { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { self.u128x1 == rhs.u128x1 } + } +} #[allow(non_camel_case_types)] #[derive(Copy, Clone)] @@ -147,10 +156,10 @@ pub union vec256_storage { sse2: [vec128_storage; 2], avx: __m256i, } -impl Into for [u64; 4] { +impl From<[u64; 4]> for vec256_storage { #[inline(always)] - fn into(self) -> vec256_storage { - vec256_storage { u64x4: self } + fn from(u64x4: [u64; 4]) -> Self { + vec256_storage { u64x4 } } } impl Default for vec256_storage { @@ -160,13 +169,22 @@ impl Default for vec256_storage { } } impl vec256_storage { + #[inline(always)] pub fn new128(xs: [vec128_storage; 2]) -> Self { Self { sse2: xs } } + #[inline(always)] pub fn split128(self) -> [vec128_storage; 2] { unsafe { self.sse2 } } } +impl Eq for vec256_storage {} +impl PartialEq for vec256_storage { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { self.sse2 == rhs.sse2 } + } +} #[allow(non_camel_case_types)] #[derive(Copy, Clone)] @@ -186,20 +204,29 @@ impl Default for vec512_storage { } } impl vec512_storage { + #[inline(always)] pub fn new128(xs: [vec128_storage; 4]) -> Self { Self { sse2: xs } } + #[inline(always)] pub fn split128(self) -> [vec128_storage; 4] { unsafe { self.sse2 } } } +impl Eq for vec512_storage {} +impl PartialEq for vec512_storage { + #[inline(always)] + fn eq(&self, rhs: &Self) -> bool { + unsafe { self.avx == rhs.avx } + } +} macro_rules! impl_into { ($storage:ident, $array:ty, $name:ident) => { - impl Into<$array> for $storage { + impl From<$storage> for $array { #[inline(always)] - fn into(self) -> $array { - unsafe { self.$name } + fn from(vec: $storage) -> Self { + unsafe { vec.$name } } } }; diff --git a/utils-simd/ppv-lite86/src/x86_64/sse2.rs b/utils-simd/ppv-lite86/src/x86_64/sse2.rs index 81021a99..4b95911d 100644 --- a/utils-simd/ppv-lite86/src/x86_64/sse2.rs +++ b/utils-simd/ppv-lite86/src/x86_64/sse2.rs @@ -9,6 +9,7 @@ use core::marker::PhantomData; use core::ops::{ Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not, }; +use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes}; macro_rules! impl_binop { ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => { @@ -39,7 +40,8 @@ macro_rules! impl_binop_assign { macro_rules! def_vec { ($vec:ident, $word:ident) => { #[allow(non_camel_case_types)] - #[derive(Copy, Clone)] + #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)] + #[repr(transparent)] pub struct $vec { x: __m128i, s3: PhantomData, @@ -166,49 +168,44 @@ macro_rules! impl_bitops128 { macro_rules! rotr_32_s3 { ($name:ident, $k0:expr, $k1:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_shuffle_epi8( - self.x, - _mm_set_epi64x($k0, $k1), - ) - }) + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) } }; } macro_rules! rotr_32 { ($name:ident, $i:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_or_si128( - _mm_srli_epi32(self.x, $i as i32), - _mm_slli_epi32(self.x, 32 - $i as i32), - ) - }) - } + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_epi32(self.x, $i as i32), + _mm_slli_epi32(self.x, 32 - $i as i32), + ) + }) + } }; } impl RotateEachWord32 for u32x4_sse2 { rotr_32!(rotate_each_word_right7, 7); rotr_32_s3!( rotate_each_word_right8, - 0x0c0f0e0d_080b0a09, - 0x04070605_00030201 + 0x0c0f_0e0d_080b_0a09, + 0x0407_0605_0003_0201 ); rotr_32!(rotate_each_word_right11, 11); rotr_32!(rotate_each_word_right12, 12); rotr_32_s3!( rotate_each_word_right16, - 0x0d0c0f0e_09080b0a, - 0x05040706_01000302 + 0x0d0c_0f0e_0908_0b0a, + 0x0504_0706_0100_0302 ); rotr_32!(rotate_each_word_right20, 20); rotr_32_s3!( rotate_each_word_right24, - 0x0e0d0c0f_0a09080b, - 0x06050407_02010003 + 0x0e0d_0c0f_0a09_080b, + 0x0605_0407_0201_0003 ); rotr_32!(rotate_each_word_right25, 25); } @@ -228,28 +225,23 @@ impl RotateEachWord32 for u32x4_sse2 { macro_rules! rotr_64_s3 { ($name:ident, $k0:expr, $k1:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_shuffle_epi8( - self.x, - _mm_set_epi64x($k0, $k1), - ) - }) + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) }) } }; } macro_rules! rotr_64 { ($name:ident, $i:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_or_si128( - _mm_srli_epi64(self.x, $i as i32), - _mm_slli_epi64(self.x, 64 - $i as i32), - ) - }) - } + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_epi64(self.x, $i as i32), + _mm_slli_epi64(self.x, 64 - $i as i32), + ) + }) + } }; } impl RotateEachWord32 for u64x2_sse2 { @@ -296,15 +288,15 @@ impl RotateEachWord64 for u64x2_sse2 { macro_rules! rotr_128 { ($name:ident, $i:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - _mm_or_si128( - _mm_srli_si128(self.x, $i as i32), - _mm_slli_si128(self.x, 128 - $i as i32), - ) - }) - } + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm_or_si128( + _mm_srli_si128(self.x, $i as i32), + _mm_slli_si128(self.x, 128 - $i as i32), + ) + }) + } }; } // TODO: completely unoptimized @@ -411,7 +403,7 @@ impl MultiLane<[u128; 1]> for u128x1_sse2 { } #[inline(always)] fn from_lanes(xs: [u128; 1]) -> Self { - unimplemented!() + unimplemented!("{:?}", xs) } } @@ -780,7 +772,7 @@ impl BSwap for u128x1_sse2 { impl BSwap for u128x1_sse2 { #[inline(always)] fn bswap(self) -> Self { - Self::new(unsafe { unimplemented!() }) + unimplemented!() } } @@ -890,6 +882,13 @@ pub type u64x2x4_sse2 = x4>; #[allow(non_camel_case_types)] pub type u128x4_sse2 = x4>; +impl Vector<[u32; 16]> for u32x4x4_sse2 { + #[inline(always)] + fn to_scalars(self) -> [u32; 16] { + transmute!(self) + } +} + impl u32x4x2> for u32x4x2_sse2 where u32x4_sse2: RotateEachWord32 + BSwap, @@ -993,6 +992,8 @@ where Machine86: Machine, u32x4x4_sse2: MultiLane<[ as Machine>::u32x4; 4]>, u32x4x4_sse2: Vec4< as Machine>::u32x4>, + u32x4x4_sse2: Vec4Ext< as Machine>::u32x4>, + u32x4x4_sse2: Vector<[u32; 16]>, { } impl u64x2x4> for u64x2x4_sse2 @@ -1014,14 +1015,6 @@ where { } -impl u32x4x4> for u32x4x4_sse2 -where - u32x4_sse2: RotateEachWord32 + BSwap, - Avx2Machine: Machine, - u32x4x4_sse2: MultiLane<[ as Machine>::u32x4; 4]>, - u32x4x4_sse2: Vec4< as Machine>::u32x4>, -{ -} impl u64x2x4> for u64x2x4_sse2 where u64x2_sse2: RotateEachWord64 + RotateEachWord32 + BSwap, @@ -1078,6 +1071,7 @@ impl PartialEq for x2 { } } +#[allow(unused)] #[inline(always)] unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool { let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110); @@ -1136,13 +1130,14 @@ where } #[cfg(test)] +#[cfg(target_arch = "x86_64")] mod test { use super::*; use crate::x86_64::{SSE2, SSE41, SSSE3}; use crate::Machine; #[test] - #[cfg(target_arch = "x86_64")] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] fn test_bswap32_s2_vs_s3() { let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100]; let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203]; @@ -1160,12 +1155,12 @@ mod test { x_s3.bswap() }; - assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s2, transmute!(x_s3)); assert_eq!(x_s2, s2.vec(ys)); } #[test] - #[cfg(target_arch = "x86_64")] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] fn test_bswap64_s2_vs_s3() { let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100]; let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607]; @@ -1184,11 +1179,11 @@ mod test { }; assert_eq!(x_s2, s2.vec(ys)); - assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s3, transmute!(x_s3)); } #[test] - #[cfg(target_arch = "x86_64")] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] fn test_shuffle32_s2_vs_s3() { let xs = [0x0, 0x1, 0x2, 0x3]; let ys = [0x2, 0x3, 0x0, 0x1]; @@ -1206,7 +1201,7 @@ mod test { x_s3.shuffle2301() }; assert_eq!(x_s2, s2.vec(ys)); - assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s3, transmute!(x_s3)); let x_s2 = { let x_s2: ::u32x4 = s2.vec(xs); @@ -1217,16 +1212,16 @@ mod test { x_s3.shuffle3012() }; assert_eq!(x_s2, s2.vec(zs)); - assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s3, transmute!(x_s3)); let x_s2 = x_s2.shuffle1230(); let x_s3 = x_s3.shuffle1230(); assert_eq!(x_s2, s2.vec(xs)); - assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s3, transmute!(x_s3)); } #[test] - #[cfg(target_arch = "x86_64")] + #[cfg_attr(not(target_feature = "ssse3"), ignore)] fn test_shuffle64_s2_vs_s3() { let xs = [0x0, 0x1, 0x2, 0x3]; let ys = [0x2, 0x3, 0x0, 0x1]; @@ -1244,7 +1239,7 @@ mod test { x_s3.shuffle2301() }; assert_eq!(x_s2, s2.vec(ys)); - assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s3, transmute!(x_s3)); let x_s2 = { let x_s2: ::u64x4 = s2.vec(xs); @@ -1255,16 +1250,16 @@ mod test { x_s3.shuffle3012() }; assert_eq!(x_s2, s2.vec(zs)); - assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s3, transmute!(x_s3)); let x_s2 = x_s2.shuffle1230(); let x_s3 = x_s3.shuffle1230(); assert_eq!(x_s2, s2.vec(xs)); - assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) }); + assert_eq!(x_s3, transmute!(x_s3)); } + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] #[test] - #[cfg(target_arch = "x86_64")] fn test_lanes_u32x4() { let xs = [0x1, 0x2, 0x3, 0x4]; @@ -1295,7 +1290,7 @@ mod test { } #[test] - #[cfg(target_arch = "x86_64")] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] fn test_lanes_u64x2() { let xs = [0x1, 0x2]; @@ -1326,7 +1321,6 @@ mod test { } #[test] - #[cfg(target_arch = "x86_64")] fn test_vec4_u32x4_s2() { let xs = [1, 2, 3, 4]; let s2 = unsafe { SSE2::instance() }; @@ -1342,7 +1336,7 @@ mod test { } #[test] - #[cfg(target_arch = "x86_64")] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] fn test_vec4_u32x4_s4() { let xs = [1, 2, 3, 4]; let s4 = unsafe { SSE41::instance() }; @@ -1358,7 +1352,6 @@ mod test { } #[test] - #[cfg(target_arch = "x86_64")] fn test_vec2_u64x2_s2() { let xs = [0x1, 0x2]; let s2 = unsafe { SSE2::instance() }; @@ -1370,7 +1363,7 @@ mod test { } #[test] - #[cfg(target_arch = "x86_64")] + #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)] fn test_vec4_u64x2_s4() { let xs = [0x1, 0x2]; let s4 = unsafe { SSE41::instance() }; @@ -1384,65 +1377,80 @@ mod test { pub mod avx2 { #![allow(non_camel_case_types)] - use crate::soft::x4; + use crate::soft::{x2, x4}; use crate::types::*; - use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2}; + use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0}; use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4}; use core::arch::x86_64::*; use core::marker::PhantomData; use core::ops::*; + use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes}; - #[derive(Copy, Clone)] - pub struct u32x4x4_avx2 { - x: [__m256i; 2], + #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)] + #[repr(transparent)] + pub struct u32x4x2_avx2 { + x: __m256i, ni: PhantomData, } - impl u32x4x4_avx2 { + impl u32x4x2_avx2 { #[inline(always)] - fn new(x: [__m256i; 2]) -> Self { + fn new(x: __m256i) -> Self { Self { x, ni: PhantomData } } } - impl u32x4x4> for u32x4x4_avx2 where NI: Copy {} - impl Store for u32x4x4_avx2 { + impl u32x4x2> for u32x4x2_avx2 where NI: Copy {} + impl Store for u32x4x2_avx2 { #[inline(always)] - unsafe fn unpack(p: vec512_storage) -> Self { - Self::new([p.avx[0].avx, p.avx[1].avx]) + unsafe fn unpack(p: vec256_storage) -> Self { + Self::new(p.avx) } } - impl MultiLane<[u32x4_sse2; 4]> for u32x4x4_avx2 { + impl StoreBytes for u32x4x2_avx2 { #[inline(always)] - fn to_lanes(self) -> [u32x4_sse2; 4] { + unsafe fn unsafe_read_le(input: &[u8]) -> Self { + assert_eq!(input.len(), 32); + Self::new(_mm256_loadu_si256(input.as_ptr() as *const _)) + } + #[inline(always)] + unsafe fn unsafe_read_be(input: &[u8]) -> Self { + Self::unsafe_read_le(input).bswap() + } + #[inline(always)] + fn write_le(self, out: &mut [u8]) { unsafe { - [ - u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), - u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), - ] + assert_eq!(out.len(), 32); + _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x) } } #[inline(always)] - fn from_lanes(x: [u32x4_sse2; 4]) -> Self { - Self::new(unsafe { + fn write_be(self, out: &mut [u8]) { + self.bswap().write_le(out) + } + } + impl MultiLane<[u32x4_sse2; 2]> for u32x4x2_avx2 { + #[inline(always)] + fn to_lanes(self) -> [u32x4_sse2; 2] { + unsafe { [ - _mm256_setr_m128i(x[0].x, x[1].x), - _mm256_setr_m128i(x[2].x, x[3].x), + u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), + u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), ] - }) + } + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2; 2]) -> Self { + Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) }) } } - impl Vec4> for u32x4x4_avx2 { + impl Vec2> for u32x4x2_avx2 { #[inline(always)] fn extract(self, i: u32) -> u32x4_sse2 { unsafe { match i { - 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)), - 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)), - 2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)), - 3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)), + 0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)), + 1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)), _ => panic!(), } } @@ -1451,61 +1459,21 @@ pub mod avx2 { fn insert(self, w: u32x4_sse2, i: u32) -> Self { Self::new(unsafe { match i { - 0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]], - 1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]], - 2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)], - 3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)], + 0 => _mm256_inserti128_si256(self.x, w.x, 0), + 1 => _mm256_inserti128_si256(self.x, w.x, 1), _ => panic!(), } }) } } - impl LaneWords4 for u32x4x4_avx2 { - #[inline(always)] - fn shuffle_lane_words1230(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b1001_0011), - _mm256_shuffle_epi32(self.x[1], 0b1001_0011), - ] - }) - } - #[inline(always)] - fn shuffle_lane_words2301(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b0100_1110), - _mm256_shuffle_epi32(self.x[1], 0b0100_1110), - ] - }) - } - #[inline(always)] - fn shuffle_lane_words3012(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi32(self.x[0], 0b0011_1001), - _mm256_shuffle_epi32(self.x[1], 0b0011_1001), - ] - }) - } - } - impl BitOps32 for u32x4x4_avx2 where NI: Copy {} - impl ArithOps for u32x4x4_avx2 where NI: Copy {} + impl BitOps32 for u32x4x2_avx2 where NI: Copy {} + impl ArithOps for u32x4x2_avx2 where NI: Copy {} macro_rules! shuf_lane_bytes { ($name:ident, $k0:expr, $k1:expr) => { - #[inline(always)] - fn $name(self) -> Self { - Self::new(unsafe { - [ - _mm256_shuffle_epi8( - self.x[0], - _mm256_set_epi64x($k0, $k1, $k0, $k1), - ), - _mm256_shuffle_epi8( - self.x[1], - _mm256_set_epi64x($k0, $k1, $k0, $k1), - ) - ] + #[inline(always)] + fn $name(self) -> Self { + Self::new(unsafe { + _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1)) }) } }; @@ -1515,52 +1483,41 @@ pub mod avx2 { #[inline(always)] fn $name(self) -> Self { Self::new(unsafe { - [ - _mm256_or_si256( - _mm256_srli_epi32(self.x[0], $i as i32), - _mm256_slli_epi32(self.x[0], 32 - $i as i32), - ), - _mm256_or_si256( - _mm256_srli_epi32(self.x[1], $i as i32), - _mm256_slli_epi32(self.x[1], 32 - $i as i32), - ) - ] + _mm256_or_si256( + _mm256_srli_epi32(self.x, $i as i32), + _mm256_slli_epi32(self.x, 32 - $i as i32), + ) }) } }; } - impl RotateEachWord32 for u32x4x4_avx2 { + impl RotateEachWord32 for u32x4x2_avx2 { rotr_32!(rotate_each_word_right7, 7); shuf_lane_bytes!( rotate_each_word_right8, - 0x0c0f0e0d_080b0a09, - 0x04070605_00030201 + 0x0c0f_0e0d_080b_0a09, + 0x0407_0605_0003_0201 ); rotr_32!(rotate_each_word_right11, 11); rotr_32!(rotate_each_word_right12, 12); shuf_lane_bytes!( rotate_each_word_right16, - 0x0d0c0f0e_09080b0a, - 0x05040706_01000302 + 0x0d0c_0f0e_0908_0b0a, + 0x0504_0706_0100_0302 ); rotr_32!(rotate_each_word_right20, 20); shuf_lane_bytes!( rotate_each_word_right24, - 0x0e0d0c0f_0a09080b, - 0x06050407_02010003 + 0x0e0d_0c0f_0a09_080b, + 0x0605_0407_0201_0003 ); rotr_32!(rotate_each_word_right25, 25); } - impl BitOps0 for u32x4x4_avx2 where NI: Copy {} - impl From> for vec512_storage { + impl BitOps0 for u32x4x2_avx2 where NI: Copy {} + impl From> for vec256_storage { #[inline(always)] - fn from(x: u32x4x4_avx2) -> Self { - Self { - avx: [ - vec256_storage { avx: x.x[0] }, - vec256_storage { avx: x.x[1] }, - ], - } + fn from(x: u32x4x2_avx2) -> Self { + Self { avx: x.x } } } @@ -1577,55 +1534,172 @@ pub mod avx2 { } }; } - impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor); - impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor); - impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand); - impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add); + impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor); + impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor); + impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand); + impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add); - macro_rules! impl_bitop_x2 { + macro_rules! impl_bitop { ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => { impl $Op for $vec { type Output = Self; #[inline(always)] fn $op_fn(self, rhs: Self) -> Self::Output { - Self::new(unsafe { - [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])] - }) + Self::new(unsafe { $impl_fn(self.x, rhs.x) }) } } }; } - impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256); - impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256); - impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256); - impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256); - impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32); + impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256); + impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256); + impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256); + impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256); + impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32); - impl Not for u32x4x4_avx2 { + impl Not for u32x4x2_avx2 { type Output = Self; #[inline(always)] fn not(self) -> Self::Output { unsafe { let f = _mm256_set1_epi8(-0x7f); - Self::new([f, f]) ^ self + Self::new(f) ^ self } } } - impl BSwap for u32x4x4_avx2 { + impl BSwap for u32x4x2_avx2 { shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203); } - impl From>> for u32x4x4_avx2 + impl From, G0>> for u32x4x2_avx2 where NI: Copy, { + #[inline(always)] + fn from(x: x2, G0>) -> Self { + Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) }) + } + } + + impl LaneWords4 for u32x4x2_avx2 { + #[inline(always)] + fn shuffle_lane_words1230(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) }) + } + #[inline(always)] + fn shuffle_lane_words2301(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) }) + } + #[inline(always)] + fn shuffle_lane_words3012(self) -> Self { + Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) }) + } + } + + /////////////////////////////////////////////////////////////////////////////////////////// + + pub type u32x4x4_avx2 = x2, G0>; + impl u32x4x4> for u32x4x4_avx2 {} + + impl Store for u32x4x4_avx2 { + #[inline(always)] + unsafe fn unpack(p: vec512_storage) -> Self { + Self::new([ + u32x4x2_avx2::unpack(p.avx[0]), + u32x4x2_avx2::unpack(p.avx[1]), + ]) + } + } + impl MultiLane<[u32x4_sse2; 4]> for u32x4x4_avx2 { + #[inline(always)] + fn to_lanes(self) -> [u32x4_sse2; 4] { + let [a, b] = self.0[0].to_lanes(); + let [c, d] = self.0[1].to_lanes(); + [a, b, c, d] + } + #[inline(always)] + fn from_lanes(x: [u32x4_sse2; 4]) -> Self { + let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]); + let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]); + Self::new([ab, cd]) + } + } + impl Vec4> for u32x4x4_avx2 { + #[inline(always)] + fn extract(self, i: u32) -> u32x4_sse2 { + match i { + 0 => self.0[0].extract(0), + 1 => self.0[0].extract(1), + 2 => self.0[1].extract(0), + 3 => self.0[1].extract(1), + _ => panic!(), + } + } + #[inline(always)] + fn insert(self, w: u32x4_sse2, i: u32) -> Self { + Self::new(match i { + 0 | 1 => [self.0[0].insert(w, i), self.0[1]], + 2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)], + _ => panic!(), + }) + } + } + impl Vec4Ext> for u32x4x4_avx2 { + #[inline(always)] + fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) { + /* + * a00:a01 a10:a11 + * b00:b01 b10:b11 + * c00:c01 c10:c11 + * d00:d01 d10:d11 + * => + * a00:b00 c00:d00 + * a01:b01 c01:d01 + * a10:b10 c10:d10 + * a11:b11 c11:d11 + */ + unsafe { + let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20)); + let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31)); + let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20)); + let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31)); + let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20)); + let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31)); + let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20)); + let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31)); + ( + Self::new([ab00, cd00]), + Self::new([ab01, cd01]), + Self::new([ab10, cd10]), + Self::new([ab11, cd11]), + ) + } + } + } + impl Vector<[u32; 16]> for u32x4x4_avx2 { + #[inline(always)] + fn to_scalars(self) -> [u32; 16] { + transmute!(self) + } + } + impl From> for vec512_storage { + #[inline(always)] + fn from(x: u32x4x4_avx2) -> Self { + Self { + avx: [ + vec256_storage { avx: x.0[0].x }, + vec256_storage { avx: x.0[1].x }, + ], + } + } + } + impl From>> for u32x4x4_avx2 { #[inline(always)] fn from(x: x4>) -> Self { Self::new(unsafe { [ - _mm256_setr_m128i(x.0[0].x, x.0[1].x), - _mm256_setr_m128i(x.0[2].x, x.0[3].x), + u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)), + u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)), ] }) } diff --git a/utils-simd/ppv-null/Cargo.toml b/utils-simd/ppv-null/Cargo.toml index ce778a74..d18b096e 100644 --- a/utils-simd/ppv-null/Cargo.toml +++ b/utils-simd/ppv-null/Cargo.toml @@ -2,12 +2,13 @@ name = "ppv-null" version = "0.2.0" authors = ["The CryptoCorrosion Contributors"] -edition = "2018" +edition = "2021" license = "MIT/Apache-2.0" description = "Safe, portable, non-SIMD implementation of the crypto-simd API" repository = "https://github.com/cryptocorrosion/cryptocorrosion" keywords = ["crypto", "simd"] categories = ["cryptography", "no-std"] +rust-version = "1.61" [dependencies] crypto-simd = "0.1"