diff --git a/.github/FUNDING.yml b/.github/FUNDING.yml
new file mode 100644
index 00000000..6b83dc1f
--- /dev/null
+++ b/.github/FUNDING.yml
@@ -0,0 +1,3 @@
+# These are supported funding model platforms
+
+github: kazcw
diff --git a/.github/workflows/check-clippy.yaml b/.github/workflows/check-clippy.yaml
new file mode 100644
index 00000000..bdc77a89
--- /dev/null
+++ b/.github/workflows/check-clippy.yaml
@@ -0,0 +1,18 @@
+name: Clippy
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+env:
+  CARGO_TERM_COLOR: always
+
+jobs:
+  clippy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: cargo-clippy
+        run: cargo clippy
diff --git a/.github/workflows/check-rustfmt.yaml b/.github/workflows/check-rustfmt.yaml
new file mode 100644
index 00000000..fb582971
--- /dev/null
+++ b/.github/workflows/check-rustfmt.yaml
@@ -0,0 +1,15 @@
+name: Rustfmt
+
+on:
+  push:
+    branches: [ "master" ]
+  pull_request:
+    branches: [ "master" ]
+
+jobs:
+  format:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: cargo-fmt
+        run: cargo fmt -- --check
diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
new file mode 100644
index 00000000..3cb563a0
--- /dev/null
+++ b/.github/workflows/tests.yaml
@@ -0,0 +1,168 @@
+name: Tests
+
+on:
+  push:
+    branches: [ master ]
+  pull_request:
+    branches: [ master ]
+
+permissions:
+  contents: read
+
+jobs:
+  #check-doc:
+  #  name: Check doc
+  #  runs-on: ubuntu-latest
+  #  env:
+  #    RUSTDOCFLAGS: "-Dwarnings --cfg docsrs -Zunstable-options --generate-link-to-definition"
+  #  steps:
+  #    - uses: actions/checkout@v4
+  #    - name: Install toolchain
+  #      uses: dtolnay/rust-toolchain@master
+  #      with:
+  #        toolchain: nightly
+  #    - name: Workspace docs
+  #      run: cargo doc --all-features --no-deps
+
+  test:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: ubuntu-latest
+            target: x86_64-unknown-linux-gnu
+            toolchain: stable
+          - os: macos-latest
+            target: x86_64-apple-darwin
+            toolchain: stable
+            # TODO: also aarch64 / M1
+          - os: windows-latest
+            target: x86_64-pc-windows-gnu
+            toolchain: stable
+          - os: windows-latest
+            target: x86_64-pc-windows-msvc
+            toolchain: beta
+            # Test both windows-gnu and windows-msvc; use beta rust on one
+          - os: ubuntu-latest
+            target: x86_64-unknown-linux-gnu
+            variant: MSRV
+            toolchain: 1.61.0
+          # FIXME: some failures down the dependency tree
+          #- os: ubuntu-latest
+          #  target: x86_64-unknown-linux-gnu
+          #  toolchain: nightly
+          #  variant: minimal_versions
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: MSRV
+        if: ${{ matrix.variant == 'MSRV' }}
+        run: cp Cargo.lock.msrv Cargo.lock
+      - name: Install toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          target: ${{ matrix.target }}
+          toolchain: ${{ matrix.toolchain }}
+      - run: ${{ matrix.deps }}
+      - name: Maybe minimal versions
+        if: ${{ matrix.variant == 'minimal_versions' }}
+        run: |
+          cargo generate-lockfile -Z minimal-versions
+      - name: Test
+        run: |
+          cargo test --target ${{ matrix.target }}
+
+  test-cross:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - os: ubuntu-latest
+            target: powerpc-unknown-linux-gnu
+            toolchain: stable
+
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install toolchain
+        uses: dtolnay/rust-toolchain@master
+        with:
+          target: ${{ matrix.target }}
+          toolchain: ${{ matrix.toolchain }}
+      - name: Cache cargo plugins
+        uses: actions/cache@v4
+        with:
+          path: ~/.cargo/bin/
+          key: ${{ runner.os }}-cargo-plugins
+      - name: Install cross
+        run: cargo install cross || true
+      - name: Test
+        run: |
+          cross test --no-fail-fast --target ${{ matrix.target }} -p c2-chacha
+          cross test --no-fail-fast --target ${{ matrix.target }} -p ppv-lite86
+          cross test --no-fail-fast --target ${{ matrix.target }} -p ppv-null
+          cross test --no-fail-fast --target ${{ matrix.target }} -p crypto-simd
+          cross test --no-fail-fast --target ${{ matrix.target }} -p threefish-cipher
+          cross test --no-fail-fast --target ${{ matrix.target }} -p blake-hash
+          cross test --no-fail-fast --target ${{ matrix.target }} -p skein-hash
+          # Failing on PPC
+          # cross test --no-fail-fast --target ${{ matrix.target }} -p jh-x86_64
+          # groestl-aesni: not cross-tested as it only supports specific hardware.
+
+  test-miri:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install toolchain
+        run: |
+          rustup toolchain install nightly --component miri
+          rustup override set nightly
+          cargo miri setup
+      - name: Test
+        run: |
+          cargo miri test -p c2-chacha
+          cargo miri test -p ppv-lite86
+          cargo miri test -p ppv-null
+          cargo miri test -p crypto-simd
+          cargo miri test -p threefish-cipher
+          cargo miri test -p blake-hash
+          cargo miri test -p skein-hash
+          # groestl-aesni: not tested as it only supports specific hardware.
+          # jh-x86_64: should work under miri but runs too slowly.
+
+  test-no-std:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install toolchain
+        uses: dtolnay/rust-toolchain@nightly
+        with:
+            target: thumbv6m-none-eabi
+      - name: Chacha, build only
+        run: cargo build -p c2-chacha --target=thumbv6m-none-eabi --no-default-features
+
+  test-ios:
+    runs-on: macos-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Install toolchain
+        uses: dtolnay/rust-toolchain@nightly
+        with:
+            target: aarch64-apple-ios
+      - name: Chacha, build only
+        run: cargo build -p c2-chacha --target=aarch64-apple-ios
+
+  test-686:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Deps
+        run: sudo apt-get update ; sudo apt install gcc-multilib
+      - name: Install toolchain
+        uses: dtolnay/rust-toolchain@nightly
+        with:
+          target: i686-unknown-linux-gnu
+          toolchain: nightly
+      - name: Chacha
+        run: cargo test -p c2-chacha --target=i686-unknown-linux-gnu
diff --git a/.travis.yml b/.travis.yml
index 9752ea52..4b54219e 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -75,8 +75,13 @@ matrix:
       rust: nightly
     - env: TARGET=x86_64-unknown-linux-gnu
       rust: stable
+    - env: TARGET=x86_64-unknown-linux-gnu OLDER_MSRV_CRATES=1 DISABLE_TESTS=1
+      rust: 1.32.0
     - env: TARGET=x86_64-unknown-linux-gnu
-      rust: 1.31.1
+      rust: 1.41.0
+
+    # machine-specific tests are skipped based on static feature detection
+    - env: TARGET=x86_64-unknown-linux-gnu RUSTFLAGS="-C target-cpu=native"
 
 before_install:
   - set -e
diff --git a/Cargo.lock.msrv b/Cargo.lock.msrv
new file mode 100644
index 00000000..b0aaa00a
--- /dev/null
+++ b/Cargo.lock.msrv
@@ -0,0 +1,230 @@
+# This file is automatically @generated by Cargo.
+# It is not intended for manual editing.
+version = 3
+
+[[package]]
+name = "autocfg"
+version = "1.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0"
+
+[[package]]
+name = "blake-hash"
+version = "0.4.1"
+dependencies = [
+ "block-buffer",
+ "digest",
+ "ppv-lite86",
+]
+
+[[package]]
+name = "blobby"
+version = "0.1.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6fe5f8c2940b65859ece4b3b2ba02d2b12c87cab455fd42dee2556a187bb2cf6"
+dependencies = [
+ "byteorder",
+]
+
+[[package]]
+name = "block-buffer"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4152116fd6e9dadb291ae18fc1ec3575ed6d84c29642d97890f4b4a3417297e4"
+dependencies = [
+ "block-padding",
+ "generic-array",
+]
+
+[[package]]
+name = "block-padding"
+version = "0.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8d696c370c750c948ada61c69a0ee2cbbb9c50b1019ddb86d9317157a99c2cae"
+
+[[package]]
+name = "byteorder"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
+
+[[package]]
+name = "c2-chacha"
+version = "0.3.3"
+dependencies = [
+ "cipher",
+ "hex-literal",
+ "ppv-lite86",
+]
+
+[[package]]
+name = "cc"
+version = "1.1.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "26a5c3fd7bfa1ce3897a3a3501d362b2d87b7f2583ebcb4a949ec25911025cbc"
+
+[[package]]
+name = "cfg-if"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
+
+[[package]]
+name = "cipher"
+version = "0.3.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7ee52072ec15386f770805afd189a01c8841be8696bed250fa2f13c4c0d6dfb7"
+dependencies = [
+ "generic-array",
+]
+
+[[package]]
+name = "crypto-simd"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "28a0eee94b5af99ac4441823c99f59b1ef92a6a4b9723b4c6ad95e8cd4c994b2"
+
+[[package]]
+name = "crypto-simd"
+version = "0.2.0"
+dependencies = [
+ "packed_simd",
+]
+
+[[package]]
+name = "digest"
+version = "0.9.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d3dd60d1080a57a05ab032377049e0591415d2b31afd7028356dbf3cc6dcb066"
+dependencies = [
+ "blobby",
+ "generic-array",
+]
+
+[[package]]
+name = "generic-array"
+version = "0.14.7"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "85649ca51fd72272d7821adaf274ad91c288277713d9c18820d8499a7ff69e9a"
+dependencies = [
+ "typenum",
+ "version_check",
+]
+
+[[package]]
+name = "groestl-aesni"
+version = "0.3.0"
+dependencies = [
+ "block-buffer",
+ "digest",
+ "lazy_static",
+]
+
+[[package]]
+name = "hex-literal"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "d70693199b3cf4552f3fa720b54163927a3ebed2aef240efaf556033ab336a11"
+dependencies = [
+ "hex-literal-impl",
+ "proc-macro-hack",
+]
+
+[[package]]
+name = "hex-literal-impl"
+version = "0.2.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "59448fc2f82a5fb6907f78c3d69d843e82ff5b051923313cc4438cb0c7b745a8"
+dependencies = [
+ "proc-macro-hack",
+]
+
+[[package]]
+name = "jh-x86_64"
+version = "0.3.0"
+dependencies = [
+ "block-buffer",
+ "cc",
+ "digest",
+ "hex-literal",
+ "ppv-lite86",
+]
+
+[[package]]
+name = "lazy_static"
+version = "1.5.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
+
+[[package]]
+name = "libm"
+version = "0.2.8"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058"
+
+[[package]]
+name = "num-traits"
+version = "0.2.19"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "071dfc062690e90b734c0b2273ce72ad0ffa95f0c74596bc250dcfd960262841"
+dependencies = [
+ "autocfg",
+ "libm",
+]
+
+[[package]]
+name = "packed_simd"
+version = "0.3.9"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1f9f08af0c877571712e2e3e686ad79efad9657dbf0f7c3c8ba943ff6c38932d"
+dependencies = [
+ "cfg-if",
+ "num-traits",
+]
+
+[[package]]
+name = "ppv-lite86"
+version = "0.2.17"
+
+[[package]]
+name = "ppv-null"
+version = "0.2.0"
+dependencies = [
+ "crypto-simd 0.1.1",
+]
+
+[[package]]
+name = "proc-macro-hack"
+version = "0.5.20+deprecated"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "dc375e1527247fe1a97d8b7156678dfe7c1af2fc075c9a4db3690ecd2a148068"
+
+[[package]]
+name = "skein-hash"
+version = "0.3.1"
+dependencies = [
+ "block-buffer",
+ "cipher",
+ "digest",
+ "threefish-cipher",
+]
+
+[[package]]
+name = "threefish-cipher"
+version = "0.4.0"
+dependencies = [
+ "cipher",
+ "hex-literal",
+]
+
+[[package]]
+name = "typenum"
+version = "1.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825"
+
+[[package]]
+name = "version_check"
+version = "0.9.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a"
diff --git a/Cargo.toml b/Cargo.toml
index fb9780d0..174e5c36 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -10,6 +10,7 @@ members = [
     "utils-simd/ppv-lite86",
     "utils-simd/ppv-null",
 ]
+resolver = "2"
 
 [patch.crates-io]
 c2-chacha = { path = "stream-ciphers/chacha" }
diff --git a/LICENSE-APACHE b/LICENSE-APACHE
new file mode 100644
index 00000000..1eb32153
--- /dev/null
+++ b/LICENSE-APACHE
@@ -0,0 +1,201 @@
+                              Apache License
+                        Version 2.0, January 2004
+                     http://www.apache.org/licenses/
+
+TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+1. Definitions.
+
+   "License" shall mean the terms and conditions for use, reproduction,
+   and distribution as defined by Sections 1 through 9 of this document.
+
+   "Licensor" shall mean the copyright owner or entity authorized by
+   the copyright owner that is granting the License.
+
+   "Legal Entity" shall mean the union of the acting entity and all
+   other entities that control, are controlled by, or are under common
+   control with that entity. For the purposes of this definition,
+   "control" means (i) the power, direct or indirect, to cause the
+   direction or management of such entity, whether by contract or
+   otherwise, or (ii) ownership of fifty percent (50%) or more of the
+   outstanding shares, or (iii) beneficial ownership of such entity.
+
+   "You" (or "Your") shall mean an individual or Legal Entity
+   exercising permissions granted by this License.
+
+   "Source" form shall mean the preferred form for making modifications,
+   including but not limited to software source code, documentation
+   source, and configuration files.
+
+   "Object" form shall mean any form resulting from mechanical
+   transformation or translation of a Source form, including but
+   not limited to compiled object code, generated documentation,
+   and conversions to other media types.
+
+   "Work" shall mean the work of authorship, whether in Source or
+   Object form, made available under the License, as indicated by a
+   copyright notice that is included in or attached to the work
+   (an example is provided in the Appendix below).
+
+   "Derivative Works" shall mean any work, whether in Source or Object
+   form, that is based on (or derived from) the Work and for which the
+   editorial revisions, annotations, elaborations, or other modifications
+   represent, as a whole, an original work of authorship. For the purposes
+   of this License, Derivative Works shall not include works that remain
+   separable from, or merely link (or bind by name) to the interfaces of,
+   the Work and Derivative Works thereof.
+
+   "Contribution" shall mean any work of authorship, including
+   the original version of the Work and any modifications or additions
+   to that Work or Derivative Works thereof, that is intentionally
+   submitted to Licensor for inclusion in the Work by the copyright owner
+   or by an individual or Legal Entity authorized to submit on behalf of
+   the copyright owner. For the purposes of this definition, "submitted"
+   means any form of electronic, verbal, or written communication sent
+   to the Licensor or its representatives, including but not limited to
+   communication on electronic mailing lists, source code control systems,
+   and issue tracking systems that are managed by, or on behalf of, the
+   Licensor for the purpose of discussing and improving the Work, but
+   excluding communication that is conspicuously marked or otherwise
+   designated in writing by the copyright owner as "Not a Contribution."
+
+   "Contributor" shall mean Licensor and any individual or Legal Entity
+   on behalf of whom a Contribution has been received by Licensor and
+   subsequently incorporated within the Work.
+
+2. Grant of Copyright License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   copyright license to reproduce, prepare Derivative Works of,
+   publicly display, publicly perform, sublicense, and distribute the
+   Work and such Derivative Works in Source or Object form.
+
+3. Grant of Patent License. Subject to the terms and conditions of
+   this License, each Contributor hereby grants to You a perpetual,
+   worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+   (except as stated in this section) patent license to make, have made,
+   use, offer to sell, sell, import, and otherwise transfer the Work,
+   where such license applies only to those patent claims licensable
+   by such Contributor that are necessarily infringed by their
+   Contribution(s) alone or by combination of their Contribution(s)
+   with the Work to which such Contribution(s) was submitted. If You
+   institute patent litigation against any entity (including a
+   cross-claim or counterclaim in a lawsuit) alleging that the Work
+   or a Contribution incorporated within the Work constitutes direct
+   or contributory patent infringement, then any patent licenses
+   granted to You under this License for that Work shall terminate
+   as of the date such litigation is filed.
+
+4. Redistribution. You may reproduce and distribute copies of the
+   Work or Derivative Works thereof in any medium, with or without
+   modifications, and in Source or Object form, provided that You
+   meet the following conditions:
+
+   (a) You must give any other recipients of the Work or
+       Derivative Works a copy of this License; and
+
+   (b) You must cause any modified files to carry prominent notices
+       stating that You changed the files; and
+
+   (c) You must retain, in the Source form of any Derivative Works
+       that You distribute, all copyright, patent, trademark, and
+       attribution notices from the Source form of the Work,
+       excluding those notices that do not pertain to any part of
+       the Derivative Works; and
+
+   (d) If the Work includes a "NOTICE" text file as part of its
+       distribution, then any Derivative Works that You distribute must
+       include a readable copy of the attribution notices contained
+       within such NOTICE file, excluding those notices that do not
+       pertain to any part of the Derivative Works, in at least one
+       of the following places: within a NOTICE text file distributed
+       as part of the Derivative Works; within the Source form or
+       documentation, if provided along with the Derivative Works; or,
+       within a display generated by the Derivative Works, if and
+       wherever such third-party notices normally appear. The contents
+       of the NOTICE file are for informational purposes only and
+       do not modify the License. You may add Your own attribution
+       notices within Derivative Works that You distribute, alongside
+       or as an addendum to the NOTICE text from the Work, provided
+       that such additional attribution notices cannot be construed
+       as modifying the License.
+
+   You may add Your own copyright statement to Your modifications and
+   may provide additional or different license terms and conditions
+   for use, reproduction, or distribution of Your modifications, or
+   for any such Derivative Works as a whole, provided Your use,
+   reproduction, and distribution of the Work otherwise complies with
+   the conditions stated in this License.
+
+5. Submission of Contributions. Unless You explicitly state otherwise,
+   any Contribution intentionally submitted for inclusion in the Work
+   by You to the Licensor shall be under the terms and conditions of
+   this License, without any additional terms or conditions.
+   Notwithstanding the above, nothing herein shall supersede or modify
+   the terms of any separate license agreement you may have executed
+   with Licensor regarding such Contributions.
+
+6. Trademarks. This License does not grant permission to use the trade
+   names, trademarks, service marks, or product names of the Licensor,
+   except as required for reasonable and customary use in describing the
+   origin of the Work and reproducing the content of the NOTICE file.
+
+7. Disclaimer of Warranty. Unless required by applicable law or
+   agreed to in writing, Licensor provides the Work (and each
+   Contributor provides its Contributions) on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+   implied, including, without limitation, any warranties or conditions
+   of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+   PARTICULAR PURPOSE. You are solely responsible for determining the
+   appropriateness of using or redistributing the Work and assume any
+   risks associated with Your exercise of permissions under this License.
+
+8. Limitation of Liability. In no event and under no legal theory,
+   whether in tort (including negligence), contract, or otherwise,
+   unless required by applicable law (such as deliberate and grossly
+   negligent acts) or agreed to in writing, shall any Contributor be
+   liable to You for damages, including any direct, indirect, special,
+   incidental, or consequential damages of any character arising as a
+   result of this License or out of the use or inability to use the
+   Work (including but not limited to damages for loss of goodwill,
+   work stoppage, computer failure or malfunction, or any and all
+   other commercial damages or losses), even if such Contributor
+   has been advised of the possibility of such damages.
+
+9. Accepting Warranty or Additional Liability. While redistributing
+   the Work or Derivative Works thereof, You may choose to offer,
+   and charge a fee for, acceptance of support, warranty, indemnity,
+   or other liability obligations and/or rights consistent with this
+   License. However, in accepting such obligations, You may act only
+   on Your own behalf and on Your sole responsibility, not on behalf
+   of any other Contributor, and only if You agree to indemnify,
+   defend, and hold each Contributor harmless for any liability
+   incurred by, or claims asserted against, such Contributor by reason
+   of your accepting any such warranty or additional liability.
+
+END OF TERMS AND CONDITIONS
+
+APPENDIX: How to apply the Apache License to your work.
+
+   To apply the Apache License to your work, attach the following
+   boilerplate notice, with the fields enclosed by brackets "[]"
+   replaced with your own identifying information. (Don't include
+   the brackets!)  The text should be enclosed in the appropriate
+   comment syntax for the file format. We also recommend that a
+   file or class name and description of purpose be included on the
+   same "printed page" as the copyright notice for easier
+   identification within third-party archives.
+
+Copyright 2019 The CryptoCorrosion Contributors
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
diff --git a/LICENSE-MIT b/LICENSE-MIT
new file mode 100644
index 00000000..d78c961b
--- /dev/null
+++ b/LICENSE-MIT
@@ -0,0 +1,25 @@
+Copyright (c) 2019 The CryptoCorrosion Contributors
+
+Permission is hereby granted, free of charge, to any
+person obtaining a copy of this software and associated
+documentation files (the "Software"), to deal in the
+Software without restriction, including without
+limitation the rights to use, copy, modify, merge,
+publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software
+is furnished to do so, subject to the following
+conditions:
+
+The above copyright notice and this permission notice
+shall be included in all copies or substantial portions
+of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
+ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
+TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
+PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
+SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
+IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
index 65ab2511..9f2d6e89 100644
--- a/README.md
+++ b/README.md
@@ -6,9 +6,7 @@ The main interface to these crates is the RustCrypto traits.
 
 All crates are no-std compatible.
 
-Minimum Rust version: 1.31.
-
-[![Build Status](https://travis-ci.org/cryptocorrosion/cryptocorrosion.svg?branch=master)](https://travis-ci.org/cryptocorrosion/cryptocorrosion)
+Minimum Rust version: 1.61.0
 
 ## Supported algorithms
 
@@ -36,26 +34,6 @@ runtime CPU detection is not yet supported.
 | ---------- | ---------- | ------------------ |
 | ChaCha     | c2-chacha  | :heavy_check_mark: |
 
-## SIMD
-
-Many of the crates in this project include optimized SIMD implementations,
-enabled by default on x86-64 by the "simd" feature. The fastest implementation
-available for your hardware will be automatically selected at runtime, except
-in no-std builds.
-
-For other hardware platforms, e.g. ARM: an alternative, portable SIMD backend
-based on the packed\_simd crate is available for recent nightly Rust; you can
-enable it as "packed\_simd". 
-
-If you'd prefer to minimize usage of `unsafe` code: disable the "simd" feature
-to switch to a generic implementation.
-
-| feature        | crate        | no `unsafe`        | rust version   | build time? | performance   |
-| -------------- | ------------ | ------------------ | -------------- | ----------- | ------------- |
-| simd (default) | ppv\_lite86  | :x:                | 1.27           | fast        | fast          |
-| (no simd)      | ppv\_null    | :heavy_check_mark: |                | fast        | slow          |
-| packed\_simd   | packed\_simd |                    | recent nightly | slow        | fast          |
-
 ## License
 
 All crates licensed under either of
diff --git a/block-ciphers/threefish/CHANGELOG.md b/block-ciphers/threefish/CHANGELOG.md
new file mode 100644
index 00000000..44b58459
--- /dev/null
+++ b/block-ciphers/threefish/CHANGELOG.md
@@ -0,0 +1,9 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+### Changed
+- Update `cipher` dependency: 0.2 -> 0.3.
diff --git a/block-ciphers/threefish/Cargo.toml b/block-ciphers/threefish/Cargo.toml
index 122e0b04..192cd77c 100644
--- a/block-ciphers/threefish/Cargo.toml
+++ b/block-ciphers/threefish/Cargo.toml
@@ -1,20 +1,20 @@
 [package]
 name = "threefish-cipher"
-version = "0.3.1"
+version = "0.4.0"
 authors = ["The Rust-Crypto Project Developers", "The Cryptocorrosion Contributors"]
 license = "MIT/Apache-2.0"
 description = "Threefish block cipher"
 documentation = "https://docs.rs/threefish"
 repository = "https://github.com/cryptocorrosion/cryptocorrosion"
 keywords = ["crypto", "threefish", "gost", "block-cipher"]
+edition = "2021"
+rust-version = "1.61"
 
 [dependencies]
-generic-array = "0.12"
-byteorder = { version = "1", default-features = false }
-block-cipher-trait = "0.6"
+cipher = "0.3"
 
 [dev-dependencies]
-hex-literal = "0.2"
+hex-literal = "0.3"
 
 [features]
 no_unroll = []
diff --git a/block-ciphers/threefish/src/lib.rs b/block-ciphers/threefish/src/lib.rs
index e5c46d59..19d911ed 100644
--- a/block-ciphers/threefish/src/lib.rs
+++ b/block-ciphers/threefish/src/lib.rs
@@ -1,20 +1,15 @@
 #![no_std]
 #![allow(non_upper_case_globals)]
-extern crate block_cipher_trait;
-extern crate byteorder;
-extern crate generic_array;
-#[cfg(test)]
-#[macro_use]
-extern crate hex_literal;
+
+use core::convert::TryInto;
 use core::ops::BitXor;
 
 mod consts;
 use consts::{C240, P_1024, P_256, P_512, R_1024, R_256, R_512};
 
-use block_cipher_trait::generic_array::typenum::{U1, U128, U32, U64};
-use block_cipher_trait::generic_array::GenericArray;
-pub use block_cipher_trait::BlockCipher;
-use byteorder::{ByteOrder, LE};
+use cipher::generic_array::typenum::{U1, U128, U32, U64};
+use cipher::generic_array::GenericArray;
+use cipher::{BlockCipher, BlockDecrypt, BlockEncrypt, NewBlockCipher};
 
 fn mix(r: u32, x: (u64, u64)) -> (u64, u64) {
     let y0 = x.0.wrapping_add(x.1);
@@ -30,13 +25,13 @@ fn inv_mix(r: u32, y: (u64, u64)) -> (u64, u64) {
 
 fn read_u64v_le(ns: &mut [u64], buf: &[u8]) {
     for (c, n) in buf.chunks_exact(8).zip(ns) {
-        *n = LE::read_u64(c);
+        *n = u64::from_le_bytes(c.try_into().unwrap());
     }
 }
 
 fn write_u64v_le(buf: &mut [u8], ns: &[u64]) {
     for (c, n) in buf.chunks_exact_mut(8).zip(ns) {
-        LE::write_u64(c, *n);
+        c.copy_from_slice(&n.to_le_bytes());
     }
 }
 
@@ -116,19 +111,24 @@ macro_rules! impl_threefish(
                     }
                 }
 
-                $name { sk: sk }
+                $name { sk }
             }
         }
 
-        impl BlockCipher for $name {
-            type BlockSize = $block_size;
+        impl NewBlockCipher for $name {
             type KeySize = $block_size;
-            type ParBlocks = U1;
 
             fn new(key: &GenericArray<u8, $block_size>) -> $name {
                 Self::with_tweak(key, 0, 0)
             }
+        }
 
+        impl BlockCipher for $name {
+            type BlockSize = $block_size;
+            type ParBlocks = U1;
+        }
+
+        impl BlockEncrypt for $name {
             fn encrypt_block(&self, block: &mut GenericArray<u8, Self::BlockSize>)
             {
                 let mut v = [0u64; $n_w];
@@ -162,7 +162,9 @@ macro_rules! impl_threefish(
 
                 write_u64v_le(block, &v[..]);
             }
+        }
 
+        impl BlockDecrypt for $name {
             fn decrypt_block(&self, block: &mut GenericArray<u8, Self::BlockSize>)
             {
                 let mut v = [0u64; $n_w];
@@ -208,8 +210,9 @@ mod test {
     //! tests from NIST submission
 
     use super::{Threefish1024, Threefish256, Threefish512};
-    use block_cipher_trait::generic_array::GenericArray;
-    use block_cipher_trait::BlockCipher;
+    use cipher::generic_array::GenericArray;
+    use cipher::{BlockDecrypt, BlockEncrypt, NewBlockCipher};
+    use hex_literal::hex;
 
     #[test]
     fn test_256() {
diff --git a/ci/script.sh b/ci/script.sh
index 6bf55341..268c5483 100644
--- a/ci/script.sh
+++ b/ci/script.sh
@@ -27,11 +27,25 @@ portable_only() {
     cross test --target $TARGET --release -p c2-chacha -p ppv-lite86
 }
 
+older_msrv_crates() {
+    cross build --target $TARGET -p ppv-lite86
+    cross build --target $TARGET --release -p ppv-lite86
+
+    if [ ! -z $DISABLE_TESTS ]; then
+        return
+    fi
+
+    cross test --target $TARGET -p ppv-lite86
+    cross test --target $TARGET --release -p ppv-lite86
+}
+
 # we don't run the "test phase" when doing deploys
 if [ -z $TRAVIS_TAG ]; then
-    if [ -z $PORTABLE_ONLY ]; then
-        main
-    else
+    if [ -n "$PORTABLE_ONLY" ]; then
         portable_only
+    elif [ -n "$OLDER_MSRV_CRATES" ]; then
+        older_msrv_crates
+    else
+        main
     fi
 fi
diff --git a/hashes/blake/CHANGELOG.md b/hashes/blake/CHANGELOG.md
new file mode 100644
index 00000000..adb13026
--- /dev/null
+++ b/hashes/blake/CHANGELOG.md
@@ -0,0 +1,9 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.4.1]
+### Changed
+- Update `ppv-lite86` dependency to fix support for non-x86 platforms.
diff --git a/hashes/blake/Cargo.toml b/hashes/blake/Cargo.toml
index d38346b0..c0f37d5b 100644
--- a/hashes/blake/Cargo.toml
+++ b/hashes/blake/Cargo.toml
@@ -1,24 +1,25 @@
 [package]
 name = "blake-hash"
-version = "0.3.2"
+version = "0.4.1"
 authors = ["The CryptoCorrosion Contributors"]
 license = "MIT/Apache-2.0"
 description = "BLAKE hash functions"
 repository = "https://github.com/cryptocorrosion/cryptocorrosion"
 keywords = ["crypto", "blake", "hash", "digest"]
 categories = ["cryptography", "no-std"]
+rust-version = "1.61"
 
 [dependencies]
-block-buffer = "0.7"
-digest = "0.8"
-simd = { package = "ppv-lite86", version = "0.2.6", optional = true }
+block-buffer = "0.9"
+digest = "0.9"
+simd = { package = "ppv-lite86", version = "0.2.16", optional = true }
 
 [features]
 default = ["simd", "std"]
 std = []
 
 [dev-dependencies]
-digest = { version = "0.8", features = ["dev"] }
+digest = { version = "0.9", features = ["dev"] }
 
 [badges]
 travis-ci = { repository = "cryptocorrosion/cryptocorrosion" }
diff --git a/hashes/blake/src/lib.rs b/hashes/blake/src/lib.rs
index 3ddf349c..098d088f 100644
--- a/hashes/blake/src/lib.rs
+++ b/hashes/blake/src/lib.rs
@@ -8,13 +8,12 @@ use std as core;
 
 extern crate block_buffer;
 pub extern crate digest;
-#[macro_use]
 pub extern crate simd;
 
 mod consts;
 
-use block_buffer::byteorder::{ByteOrder, BE};
 use block_buffer::BlockBuffer;
+use core::convert::TryInto;
 use core::mem;
 use digest::generic_array::typenum::{PartialDiv, Unsigned, U2};
 use digest::generic_array::GenericArray;
@@ -73,16 +72,17 @@ fn round64<M: Machine>(
 
 #[inline(always)]
 fn diagonalize<X4: Words4>((a, b, c, d): (X4, X4, X4, X4)) -> (X4, X4, X4, X4) {
-    (a, b.shuffle3012(), c.shuffle2301(), d.shuffle1230())
+    // Since b has the critical data dependency, avoid rotating b to hide latency.
+    (a.shuffle1230(), b, c.shuffle3012(), d.shuffle2301())
 }
 
 #[inline(always)]
 fn undiagonalize<X4: Words4>((a, b, c, d): (X4, X4, X4, X4)) -> (X4, X4, X4, X4) {
-    (a, b.shuffle1230(), c.shuffle2301(), d.shuffle3012())
+    (a.shuffle3012(), b, c.shuffle1230(), d.shuffle2301())
 }
 
 macro_rules! define_compressor {
-    ($compressor:ident, $storage:ident, $word:ident, $Bufsz:ty, $deserializer:path, $uval:expr, $rounds:expr, $round:ident, $X4:ident) => {
+    ($compressor:ident, $storage:ident, $word:ident, $Bufsz:ty, $uval:expr, $rounds:expr, $round:ident, $X4:ident) => {
         #[derive(Clone, Copy, Default)]
         pub struct $compressor {
             h: [$storage; 2],
@@ -100,7 +100,7 @@ macro_rules! define_compressor {
                     .iter_mut()
                     .zip(block.chunks_exact(mem::size_of::<$word>()))
                 {
-                    *mx = $deserializer(b);
+                    *mx = $word::from_be_bytes(b.try_into().unwrap());
                 }
 
                 let u = (mach.vec([U[0], U[1], U[2], U[3]]), mach.vec([U[4], U[5], U[6], U[7]]));
@@ -114,8 +114,8 @@ macro_rules! define_compressor {
                     let m1 = mach.vec([m1!(0), m1!(2), m1!(4), m1!(6)]);
                     xs = $round::<M>(xs, m0, m1);
                     // diagonal step
-                    let m0 = mach.vec([m0!(8), m0!(10), m0!(12), m0!(14)]);
-                    let m1 = mach.vec([m1!(8), m1!(10), m1!(12), m1!(14)]);
+                    let m0 = mach.vec([m0!(14), m0!(8), m0!(10), m0!(12)]);
+                    let m1 = mach.vec([m1!(14), m1!(8), m1!(10), m1!(12)]);
                     xs = undiagonalize($round::<M>(diagonalize(xs), m0, m1));
                 }
                 let h: (M::$X4, M::$X4) = (mach.unpack(state.h[0]), mach.unpack(state.h[1]));
@@ -156,7 +156,7 @@ macro_rules! define_compressor {
 
 macro_rules! define_hasher {
     ($name:ident, $word:ident, $buf:expr, $Bufsz:ty, $bits:expr, $Bytes:ident,
-     $serializer:path, $compressor:ident, $iv:expr) => {
+     $compressor:ident, $iv:expr) => {
         #[derive(Clone)]
         pub struct $name {
             compressor: $compressor,
@@ -196,30 +196,30 @@ macro_rules! define_hasher {
             type BlockSize = $Bytes;
         }
 
-        impl digest::Input for $name {
-            fn input<T: AsRef<[u8]>>(&mut self, data: T) {
+        impl digest::Update for $name {
+            fn update(&mut self, data: impl AsRef<[u8]>) {
                 let compressor = &mut self.compressor;
                 let t = &mut self.t;
-                self.buffer.input(data.as_ref(), |block| {
+                self.buffer.input_block(data.as_ref(), |block| {
                     Self::increase_count(t, (mem::size_of::<$word>() * 16) as $word);
                     compressor.put_block(block, *t);
                 });
             }
         }
 
-        impl digest::FixedOutput for $name {
+        impl digest::FixedOutputDirty for $name {
             type OutputSize = $Bytes;
 
-            fn fixed_result(self) -> GenericArray<u8, $Bytes> {
+            fn finalize_into_dirty(&mut self, out: &mut GenericArray<u8, Self::OutputSize>) {
                 let mut compressor = self.compressor;
-                let mut buffer = self.buffer;
+                let buffer = &mut self.buffer;
                 let mut t = self.t;
 
                 Self::increase_count(&mut t, buffer.position() as $word);
 
                 let mut msglen = [0u8; $buf / 8];
-                $serializer(&mut msglen[..$buf / 16], t.1);
-                $serializer(&mut msglen[$buf / 16..], t.0);
+                msglen[..$buf / 16].copy_from_slice(&t.1.to_be_bytes());
+                msglen[$buf / 16..].copy_from_slice(&t.0.to_be_bytes());
 
                 let footerlen = 1 + 2 * mem::size_of::<$word>();
 
@@ -237,7 +237,7 @@ macro_rules! define_hasher {
                 let extra_block = buffer.position() + footerlen > $buf;
                 if extra_block {
                     let pad = $buf - buffer.position();
-                    buffer.input(&PADDING[..pad], |block| compressor.put_block(block, t));
+                    buffer.input_block(&PADDING[..pad], |block| compressor.put_block(block, t));
                     debug_assert_eq!(buffer.position(), 0);
                 }
 
@@ -249,12 +249,12 @@ macro_rules! define_hasher {
                 // skip begin-padding byte if continuing padding
                 let x = extra_block as usize;
                 let (start, end) = (x, x + ($buf - footerlen - buffer.position()));
-                buffer.input(&PADDING[start..end], |_| unreachable!());
-                buffer.input(&[magic], |_| unreachable!());
-                buffer.input(&msglen, |block| compressor.put_block(block, t));
+                buffer.input_block(&PADDING[start..end], |_| unreachable!());
+                buffer.input_block(&[magic], |_| unreachable!());
+                buffer.input_block(&msglen, |block| compressor.put_block(block, t));
                 debug_assert_eq!(buffer.position(), 0);
 
-                GenericArray::clone_from_slice(&compressor.finalize()[..$Bytes::to_usize()])
+                out.copy_from_slice(&compressor.finalize()[..$Bytes::to_usize()]);
             }
         }
 
@@ -272,19 +272,19 @@ use consts::{
 use digest::generic_array::typenum::{U128, U28, U32, U48, U64};
 
 #[rustfmt::skip]
-define_compressor!(Compressor256, vec128_storage, u32, U64, BE::read_u32, BLAKE256_U, 14, round32, u32x4);
+define_compressor!(Compressor256, vec128_storage, u32, U64, BLAKE256_U, 14, round32, u32x4);
 
 #[rustfmt::skip]
-define_hasher!(Blake224, u32, 64, U64, 224, U28, BE::write_u32, Compressor256, BLAKE224_IV);
+define_hasher!(Blake224, u32, 64, U64, 224, U28, Compressor256, BLAKE224_IV);
 
 #[rustfmt::skip]
-define_hasher!(Blake256, u32, 64, U64, 256, U32, BE::write_u32, Compressor256, BLAKE256_IV);
+define_hasher!(Blake256, u32, 64, U64, 256, U32, Compressor256, BLAKE256_IV);
 
 #[rustfmt::skip]
-define_compressor!(Compressor512, vec256_storage, u64, U128, BE::read_u64, BLAKE512_U, 16, round64, u64x4);
+define_compressor!(Compressor512, vec256_storage, u64, U128, BLAKE512_U, 16, round64, u64x4);
 
 #[rustfmt::skip]
-define_hasher!(Blake384, u64, 128, U128, 384, U48, BE::write_u64, Compressor512, BLAKE384_IV);
+define_hasher!(Blake384, u64, 128, U128, 384, U48, Compressor512, BLAKE384_IV);
 
 #[rustfmt::skip]
-define_hasher!(Blake512, u64, 128, U128, 512, U64, BE::write_u64, Compressor512, BLAKE512_IV);
+define_hasher!(Blake512, u64, 128, U128, 512, U64, Compressor512, BLAKE512_IV);
diff --git a/hashes/blake/tests/lib.rs b/hashes/blake/tests/lib.rs
index 856a1861..a52edbf9 100644
--- a/hashes/blake/tests/lib.rs
+++ b/hashes/blake/tests/lib.rs
@@ -6,5 +6,5 @@ use digest::dev::digest_test;
 
 new_test!(blake224, "blake224", blake_hash::Blake224, digest_test);
 new_test!(blake256, "blake256", blake_hash::Blake256, digest_test);
-new_test!(blake384, "blake384", blake_hash::Blake384, digest_test);
-new_test!(blake512, "blake512", blake_hash::Blake512, digest_test);
+//new_test!(blake384, "blake384", blake_hash::Blake384, digest_test);
+//new_test!(blake512, "blake512", blake_hash::Blake512, digest_test);
diff --git a/hashes/groestl/Cargo.toml b/hashes/groestl/Cargo.toml
index dc0b8469..41b0edbb 100644
--- a/hashes/groestl/Cargo.toml
+++ b/hashes/groestl/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "groestl-aesni"
-version = "0.2.2"
+version = "0.3.1"
 authors = ["The CryptoCorrosion Contributors"]
 license = "MIT/Apache-2.0"
 description = "Hardware-accelerated Groestl hash for x86-64 systems with AES extensions"
@@ -8,15 +8,17 @@ documentation = "https://docs.rs/groestl-aesni"
 keywords = ["crypto", "groestl", "hash", "digest"]
 categories = ["cryptography", "no-std"]
 repository = "https://github.com/cryptocorrosion/hashes"
-edition = "2018"
+edition = "2021"
+rust-version = "1.61"
 
 [dependencies]
-block-buffer = "0.7"
-digest = "0.8"
+block-buffer = "0.9"
+digest = "0.9"
 lazy_static = { version = "1.2", optional = true }
+zerocopy = { version = "0.7", features = ["simd", "derive"] }
 
 [dev-dependencies]
-digest = { version = "0.8", features = ["dev"] }
+digest = { version = "0.9", features = ["dev"] }
 
 [features]
 std = ["lazy_static"]
diff --git a/hashes/groestl/src/compressor.rs b/hashes/groestl/src/compressor.rs
index d20da74d..fc6b5fd4 100644
--- a/hashes/groestl/src/compressor.rs
+++ b/hashes/groestl/src/compressor.rs
@@ -2,6 +2,7 @@ use block_buffer::generic_array::typenum::{U128, U64};
 use block_buffer::generic_array::GenericArray;
 use core::arch::x86_64::*;
 use core::ops::BitXor;
+use zerocopy::{AsBytes, FromBytes, FromZeroes};
 
 trait Map2 {
     type Output;
@@ -11,7 +12,8 @@ trait Map2 {
         Self: Sized;
 }
 
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
+#[repr(C)]
 pub struct X4(__m128i, __m128i, __m128i, __m128i);
 
 #[derive(Copy, Clone)]
@@ -47,7 +49,7 @@ impl BitXor for X4 {
 impl Map2 for (X4, X4) {
     type Output = X4;
     #[inline(always)]
-    fn map<F>(self: Self, mut f: F) -> Self::Output
+    fn map<F>(self, mut f: F) -> Self::Output
     where
         F: FnMut(__m128i, __m128i) -> __m128i,
     {
@@ -115,7 +117,7 @@ impl BitXor for X8 {
 impl Map2 for (X8, X8) {
     type Output = X8;
     #[inline(always)]
-    fn map<F>(self: Self, mut f: F) -> Self::Output
+    fn map<F>(self, mut f: F) -> Self::Output
     where
         F: FnMut(__m128i, __m128i) -> __m128i,
     {
@@ -175,7 +177,8 @@ unsafe fn transpose_a(i: X4) -> X4 {
         _mm_unpackhi_epi16(i.0, i.1),
         _mm_unpacklo_epi16(i.2, i.3),
         _mm_unpackhi_epi16(i.2, i.3),
-    ).map(|x| _mm_shuffle_epi32(x, 0b1101_1000));
+    )
+    .map(|x| _mm_shuffle_epi32(x, 0b1101_1000));
     X4(
         _mm_unpacklo_epi32(z.0, z.2),
         _mm_unpacklo_epi32(z.1, z.3),
@@ -367,7 +370,8 @@ unsafe fn transpose_inv(i: X8) -> X8 {
         _mm_unpackhi_epi64(i.4, i.5),
         _mm_unpacklo_epi64(i.6, i.7),
         _mm_unpackhi_epi64(i.6, i.7),
-    ).map(|x| {
+    )
+    .map(|x| {
         _mm_shuffle_epi8(
             x,
             _mm_set_epi64x(0x0f07_0b03_0e06_0a02, 0x0d05_0901_0c04_0800),
@@ -382,7 +386,8 @@ unsafe fn transpose_inv(i: X8) -> X8 {
         _mm_unpacklo_epi16(i.5, i.7),
         _mm_unpackhi_epi16(i.4, i.6),
         _mm_unpackhi_epi16(i.5, i.7),
-    ).map(|x| _mm_shuffle_epi32(x, 0b1101_1000));
+    )
+    .map(|x| _mm_shuffle_epi32(x, 0b1101_1000));
     X8(
         _mm_unpacklo_epi32(i.0, i.4),
         _mm_unpacklo_epi32(i.2, i.6),
@@ -446,7 +451,8 @@ unsafe fn rounds_q(mut x: X8) -> X8 {
         _mm_set_epi64x(0x080b_0e01_0407_0a0d, 0x0003_0609_0c0f_0205),
         _mm_set_epi64x(0x090c_0f02_0508_0b0e, 0x0104_070a_0d00_0306),
         _mm_set_epi64x(0x0e01_0407_0a0d_0003, 0x0609_0c0f_0205_080b),
-    ).shuffle((1, 3, 5, 7, 0, 2, 4, 6));
+    )
+    .shuffle((1, 3, 5, 7, 0, 2, 4, 6));
     let f = _mm_set1_epi64x(0xffff_ffff_ffff_ffffu64 as i64);
     for q in const_q.chunks_exact(2) {
         // 2 rounds at a time so we can flip-flop between register sets
diff --git a/hashes/groestl/src/lib.rs b/hashes/groestl/src/lib.rs
index 212a464f..535ccc2c 100644
--- a/hashes/groestl/src/lib.rs
+++ b/hashes/groestl/src/lib.rs
@@ -10,7 +10,6 @@ pub extern crate digest;
 #[macro_use]
 extern crate lazy_static;
 
-use block_buffer::byteorder::{BigEndian, ByteOrder, LE};
 use block_buffer::generic_array::typenum::{
     PartialDiv, Unsigned, U1024, U128, U16, U28, U48, U512, U64, U8,
 };
@@ -19,6 +18,7 @@ use block_buffer::BlockBuffer;
 use core::fmt::{Debug, Formatter, Result};
 use digest::generic_array::GenericArray as DGenericArray;
 pub use digest::Digest;
+use zerocopy::transmute;
 
 mod compressor;
 use crate::compressor::{init1024, init512, of1024, of512, tf1024, tf512};
@@ -27,25 +27,21 @@ use crate::compressor::{init1024, init512, of1024, of512, tf1024, tf512};
 struct Align16<T>(T);
 
 type Block512 = [u64; 512 / 64];
-union CvBytes512 {
-    block: Block512,
-    cv: compressor::X4,
-}
 #[derive(Clone)]
 struct Compressor512 {
     cv: compressor::X4,
 }
 impl Compressor512 {
     fn new(block: Block512) -> Self {
-        let cv = init512(unsafe { CvBytes512 { block }.cv });
+        let cv = init512(transmute!(block));
         Compressor512 { cv }
     }
     fn input(&mut self, data: &BBGenericArray<u8, U64>) {
         tf512(&mut self.cv, data);
     }
-    fn finalize(mut self) -> Block512 {
+    fn finalize_dirty(&mut self) -> Block512 {
         of512(&mut self.cv);
-        unsafe { CvBytes512 { cv: self.cv }.block }
+        transmute!(self.cv)
     }
 }
 
@@ -66,7 +62,7 @@ impl Compressor1024 {
     fn input(&mut self, data: &BBGenericArray<u8, U128>) {
         tf1024(&mut self.cv, data);
     }
-    fn finalize(mut self) -> Block1024 {
+    fn finalize_dirty(&mut self) -> Block1024 {
         of1024(&mut self.cv);
         unsafe { CvBytes1024 { cv: self.cv }.block }
     }
@@ -87,16 +83,16 @@ macro_rules! impl_digest {
                 let compressor = $compressor::new(iv.0);
                 Self {
                     buffer: BlockBuffer::default(),
-                    compressor: compressor,
+                    compressor,
                     block_counter: 0,
                 }
             }
-            fn finalize(self) -> [u64; $bits::USIZE / 64] {
-                let mut buffer = self.buffer;
-                let mut compressor = self.compressor;
+            fn finalize_dirty(&mut self) -> [u64; $bits::USIZE / 64] {
+                let buffer = &mut self.buffer;
+                let compressor = &mut self.compressor;
                 let count = self.block_counter + 1 + (buffer.remaining() <= 8) as u64;
-                buffer.len64_padding::<BigEndian, _>(count, |b| compressor.input(b));
-                compressor.finalize()
+                buffer.len64_padding_be(count, |b| compressor.input(b));
+                compressor.finalize_dirty()
             }
         }
         impl Default for $groestl {
@@ -112,25 +108,23 @@ macro_rules! impl_digest {
         impl digest::BlockInput for $groestl {
             type BlockSize = <$bits as PartialDiv<U8>>::Output;
         }
-        impl digest::Input for $groestl {
-            fn input<T: AsRef<[u8]>>(&mut self, data: T) {
+        impl digest::Update for $groestl {
+            fn update(&mut self, data: impl AsRef<[u8]>) {
                 let block_counter = &mut self.block_counter;
                 let compressor = &mut self.compressor;
-                self.buffer.input(data.as_ref(), |b| {
+                self.buffer.input_block(data.as_ref(), |b| {
                     *block_counter += 1;
                     compressor.input(b)
                 });
             }
         }
-        impl digest::FixedOutput for $groestl {
+        impl digest::FixedOutputDirty for $groestl {
             type OutputSize = <$bits as PartialDiv<U16>>::Output;
-            fn fixed_result(self) -> DGenericArray<u8, Self::OutputSize> {
-                let result = self.finalize();
-                let mut out: DGenericArray<u8, Self::OutputSize> = DGenericArray::default();
+            fn finalize_into_dirty(&mut self, out: &mut DGenericArray<u8, Self::OutputSize>) {
+                let result = self.finalize_dirty();
                 for (out, &input) in out.chunks_exact_mut(8).zip(&result[$bits::USIZE / 128..]) {
-                    LE::write_u64(out, input);
+                    out.copy_from_slice(&input.to_le_bytes());
                 }
-                out
             }
         }
         impl digest::Reset for $groestl {
@@ -154,21 +148,19 @@ impl Default for Groestl224 {
 impl digest::BlockInput for Groestl224 {
     type BlockSize = U64;
 }
-impl digest::Input for Groestl224 {
-    fn input<T: AsRef<[u8]>>(&mut self, data: T) {
-        digest::Input::input(&mut self.0, data.as_ref());
+impl digest::Update for Groestl224 {
+    fn update(&mut self, data: impl AsRef<[u8]>) {
+        digest::Update::update(&mut self.0, data.as_ref());
     }
 }
-impl digest::FixedOutput for Groestl224 {
+impl digest::FixedOutputDirty for Groestl224 {
     type OutputSize = U28;
-    fn fixed_result(self) -> DGenericArray<u8, U28> {
-        let result = self.0.finalize();
-        let mut out: DGenericArray<u8, U28> = DGenericArray::default();
-        LE::write_u32(&mut out[..4], (result[4] >> 32) as u32);
+    fn finalize_into_dirty(&mut self, out: &mut DGenericArray<u8, Self::OutputSize>) {
+        let result = self.0.finalize_dirty();
+        out[..4].copy_from_slice(&((result[4] >> 32) as u32).to_le_bytes());
         for (out, &input) in out[4..].chunks_exact_mut(8).zip(&result[5..8]) {
-            LE::write_u64(out, input);
+            out.copy_from_slice(&input.to_le_bytes());
         }
-        out
     }
 }
 impl digest::Reset for Groestl224 {
@@ -187,20 +179,18 @@ impl Default for Groestl384 {
 impl digest::BlockInput for Groestl384 {
     type BlockSize = <Groestl512 as digest::BlockInput>::BlockSize;
 }
-impl digest::Input for Groestl384 {
-    fn input<T: AsRef<[u8]>>(&mut self, data: T) {
-        digest::Input::input(&mut self.0, data.as_ref());
+impl digest::Update for Groestl384 {
+    fn update(&mut self, data: impl AsRef<[u8]>) {
+        digest::Update::update(&mut self.0, data.as_ref());
     }
 }
-impl digest::FixedOutput for Groestl384 {
+impl digest::FixedOutputDirty for Groestl384 {
     type OutputSize = U48;
-    fn fixed_result(self) -> DGenericArray<u8, U48> {
-        let result = self.0.finalize();
-        let mut out: DGenericArray<u8, U48> = DGenericArray::default();
+    fn finalize_into_dirty(&mut self, out: &mut DGenericArray<u8, Self::OutputSize>) {
+        let result = self.0.finalize_dirty();
         for (out, &input) in out.chunks_exact_mut(8).zip(&result[10..]) {
-            LE::write_u64(out, input);
+            out.copy_from_slice(&input.to_le_bytes());
         }
-        out
     }
 }
 impl digest::Reset for Groestl384 {
diff --git a/hashes/jh/Cargo.toml b/hashes/jh/Cargo.toml
index e8bb569f..42d02b6a 100644
--- a/hashes/jh/Cargo.toml
+++ b/hashes/jh/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "jh-x86_64"
-version = "0.2.2"
+version = "0.3.1"
 authors = ["The CryptoCorrosion Contributors"]
 license = "MIT/Apache-2.0"
 description = "Portable JH with optimizations for x86-64 cpus"
@@ -8,16 +8,18 @@ documentation = "https://docs.rs/jh-x86_64"
 keywords = ["crypto", "jh", "hash", "digest"]
 categories = ["cryptography", "no-std"]
 repository = "https://github.com/cryptocorrosion/cryptocorrosion"
-edition = "2018"
+edition = "2021"
+rust-version = "1.61"
 
 [dependencies]
-block-buffer = "0.7"
-digest = "0.8"
-hex-literal = "0.2"
+block-buffer = { version = "0.9", features = ["block-padding"] }
+digest = "0.9"
+hex-literal = "0.3"
 simd = { package = "ppv-lite86", version = "0.2.6" }
+zerocopy = "0.7"
 
 [dev-dependencies]
-digest = { version = "0.8", features = ["dev"] }
+digest = { version = "0.9", features = ["dev"] }
 
 [build-dependencies]
 cc = "1.0.3"
diff --git a/hashes/jh/src/compressor.rs b/hashes/jh/src/compressor.rs
index 57b02cd7..eaf885d7 100644
--- a/hashes/jh/src/compressor.rs
+++ b/hashes/jh/src/compressor.rs
@@ -4,6 +4,7 @@ use core::ptr;
 use digest::generic_array::typenum::U64;
 use digest::generic_array::GenericArray;
 use simd::{vec128_storage, AndNot, Machine, Swap64, VZip, Vec2};
+use zerocopy::transmute;
 
 const E8_BITSLICE_ROUNDCONSTANT: [[u8; 32]; 42] = [
     hex!("72d5dea2df15f8677b84150ab723155781abd6904d5a87f64e9f4fc5c3d12b40"),
@@ -199,21 +200,23 @@ dispatch!(mach, M, {
 });
 
 #[derive(Clone, Copy)]
-pub union Compressor {
+pub struct Compressor {
     cv: [vec128_storage; 8],
-    bytes: [u8; 128],
 }
+
 impl Compressor {
     #[inline]
     pub fn new(bytes: [u8; 128]) -> Self {
-        Compressor { bytes }
+        Compressor {
+            cv: transmute!(bytes),
+        }
     }
     #[inline]
     pub fn input(&mut self, data: &GenericArray<u8, U64>) {
-        f8(unsafe { &mut self.cv }, data.as_ptr());
+        f8(&mut self.cv, data.as_ptr())
     }
     #[inline]
     pub fn finalize(self) -> [u8; 128] {
-        unsafe { self.bytes }
+        transmute!(self.cv)
     }
 }
diff --git a/hashes/jh/src/lib.rs b/hashes/jh/src/lib.rs
index 1cded6db..a04049d5 100644
--- a/hashes/jh/src/lib.rs
+++ b/hashes/jh/src/lib.rs
@@ -15,11 +15,10 @@ mod consts;
 
 pub use digest::Digest;
 
-use block_buffer::byteorder::{BigEndian, ByteOrder};
+use crate::compressor::Compressor;
 use block_buffer::generic_array::GenericArray as BBGenericArray;
 use block_buffer::BlockBuffer;
 use core::fmt::{Debug, Formatter, Result};
-use crate::compressor::Compressor;
 use digest::generic_array::typenum::{Unsigned, U28, U32, U48, U64};
 use digest::generic_array::GenericArray as DGenericArray;
 
@@ -56,33 +55,33 @@ macro_rules! define_hasher {
             type BlockSize = U64;
         }
 
-        impl digest::Input for $name {
-            fn input<T: AsRef<[u8]>>(&mut self, data: T) {
+        impl digest::Update for $name {
+            fn update(&mut self, data: impl AsRef<[u8]>) {
                 let data = data.as_ref();
                 self.datalen += data.len();
                 let state = &mut self.state;
-                self.buffer.input(data, |b| state.input(b))
+                self.buffer.input_block(data, |b| state.input(b))
             }
         }
 
-        impl digest::FixedOutput for $name {
+        impl digest::FixedOutputDirty for $name {
             type OutputSize = $OutputBytes;
 
-            fn fixed_result(mut self) -> DGenericArray<u8, $OutputBytes> {
+            fn finalize_into_dirty(&mut self, out: &mut DGenericArray<u8, Self::OutputSize>) {
                 let state = &mut self.state;
                 let buffer = &mut self.buffer;
                 let len = self.datalen as u64 * 8;
                 if buffer.position() == 0 {
-                    buffer.len64_padding::<BigEndian, _>(len, |b| state.input(b));
+                    buffer.len64_padding_be(len, |b| state.input(b));
                 } else {
                     use block_buffer::block_padding::Iso7816;
                     state.input(buffer.pad_with::<Iso7816>().unwrap());
                     let mut last = BBGenericArray::default();
-                    BigEndian::write_u64(&mut last[56..], len);
+                    last[56..].copy_from_slice(&len.to_be_bytes());
                     state.input(&last);
                 }
                 let finalized = self.state.finalize();
-                DGenericArray::clone_from_slice(&finalized[(128 - $OutputBytes::to_usize())..])
+                out.copy_from_slice(&finalized[(128 - $OutputBytes::to_usize())..]);
             }
         }
 
diff --git a/hashes/skein/CHANGELOG.md b/hashes/skein/CHANGELOG.md
new file mode 100644
index 00000000..44b58459
--- /dev/null
+++ b/hashes/skein/CHANGELOG.md
@@ -0,0 +1,9 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+### Changed
+- Update `cipher` dependency: 0.2 -> 0.3.
diff --git a/hashes/skein/Cargo.toml b/hashes/skein/Cargo.toml
index e5a85ea8..cb4ec249 100644
--- a/hashes/skein/Cargo.toml
+++ b/hashes/skein/Cargo.toml
@@ -1,18 +1,20 @@
 [package]
 name = "skein-hash"
-version = "0.3.0"
+version = "0.3.2"
 authors = ["The CryptoCorrosion Contributors"]
 license = "MIT/Apache-2.0"
 description = "Skein hash functions"
 repository = "https://github.com/cryptocorrosion/hashes"
 keywords = ["crypto", "skein", "hash", "digest"]
 categories = ["cryptography", "no-std"]
+edition = "2021"
+rust-version = "1.61"
 
 [dependencies]
-block-buffer = "0.7"
-block-padding = "0.1.0"
-digest = "0.8"
-threefish-cipher = "0.3"
+block-buffer = { version = "0.9", features = ["block-padding"] }
+digest = "0.9"
+threefish-cipher = "0.4"
+cipher = "0.3"
 
 [dev-dependencies]
-digest = { version = "0.8", features = ["dev"] }
+digest = { version = "0.9", features = ["dev"] }
diff --git a/hashes/skein/src/lib.rs b/hashes/skein/src/lib.rs
index edf82eef..0252aac0 100644
--- a/hashes/skein/src/lib.rs
+++ b/hashes/skein/src/lib.rs
@@ -3,22 +3,22 @@
 #![no_std]
 
 extern crate block_buffer;
-extern crate block_padding;
 pub extern crate digest;
 extern crate threefish_cipher;
 
 pub use digest::generic_array::GenericArray;
 pub use digest::Digest;
 
-use block_buffer::byteorder::{ByteOrder, LE};
+use block_buffer::block_padding::ZeroPadding;
 use block_buffer::BlockBuffer;
-use block_padding::ZeroPadding;
+use cipher::{BlockCipher, BlockEncrypt};
 use digest::generic_array::typenum::{NonZero, PartialDiv, Unsigned, U128, U32, U64, U8};
 use digest::generic_array::ArrayLength;
-use threefish_cipher::{BlockCipher, Threefish1024, Threefish256, Threefish512};
+use threefish_cipher::{Threefish1024, Threefish256, Threefish512};
 
 /// N word buffer.
 #[derive(Copy, Clone)]
+#[repr(C)]
 union Block<N>
 where
     N: ArrayLength<u8>,
@@ -44,13 +44,45 @@ where
     }
 
     fn as_byte_array(&self) -> &GenericArray<u8, N> {
+        // SAFETY: Both fields of this union have the same layout and bit
+        // validity, so it's okay to treat either field as the other field's
+        // type. Since the union is `repr(C)`, they both live in the same byte
+        // range. (One exception: They don't have the same alignment, but the
+        // alignment of the entire union is the greater of their alignments, so
+        // this isn't an issue.)
         unsafe { &self.bytes }
     }
 
     fn as_byte_array_mut(&mut self) -> &mut GenericArray<u8, N> {
+        // SAFETY: Both fields of this union have the same layout and bit
+        // validity, so it's okay to treat either field as the other field's
+        // type. Since the union is `repr(C)`, they both live in the same byte
+        // range. (One exception: They don't have the same alignment, but the
+        // alignment of the entire union is the greater of their alignments, so
+        // this isn't an issue.)
         unsafe { &mut self.bytes }
     }
 
+    fn as_word_array(&self) -> &GenericArray<u64, <N as PartialDiv<U8>>::Output> {
+        // SAFETY: Both fields of this union have the same layout and bit
+        // validity, so it's okay to treat either field as the other field's
+        // type. Since the union is `repr(C)`, they both live in the same byte
+        // range. (One exception: They don't have the same alignment, but the
+        // alignment of the entire union is the greater of their alignments, so
+        // this isn't an issue.)
+        unsafe { &self.words }
+    }
+
+    fn as_word_array_mut(&mut self) -> &mut GenericArray<u64, <N as PartialDiv<U8>>::Output> {
+        // SAFETY: Both fields of this union have the same layout and bit
+        // validity, so it's okay to treat either field as the other field's
+        // type. Since the union is `repr(C)`, they both live in the same byte
+        // range. (One exception: They don't have the same alignment, but the
+        // alignment of the entire union is the greater of their alignments, so
+        // this isn't an issue.)
+        unsafe { &mut self.words }
+    }
+
     fn from_byte_array(block: &GenericArray<u8, N>) -> Self {
         Block { bytes: *block }
     }
@@ -82,10 +114,7 @@ where
     type Output = Block<N>;
     fn bitxor(mut self, rhs: Block<N>) -> Self::Output {
         // XOR is endian-agnostic
-        for (s, r) in unsafe { &mut self.words }
-            .iter_mut()
-            .zip(unsafe { &rhs.words })
-        {
+        for (s, r) in self.as_word_array_mut().iter_mut().zip(rhs.as_word_array()) {
             *s ^= *r;
         }
         self
@@ -176,9 +205,9 @@ macro_rules! define_hasher {
                     Block::default(),
                 );
                 let mut cfg = GenericArray::<u8, $state_bytes>::default();
-                LE::write_u64(&mut cfg[..8], SCHEMA_VER);
-                LE::write_u64(&mut cfg[8..16], N::to_u64() * 8);
-                LE::write_u64(&mut cfg[16..24], CFG_TREE_INFO_SEQUENTIAL);
+                cfg[..8].copy_from_slice(&SCHEMA_VER.to_le_bytes());
+                cfg[8..16].copy_from_slice(&(N::to_u64() * 8).to_le_bytes());
+                cfg[16..24].copy_from_slice(&CFG_TREE_INFO_SEQUENTIAL.to_le_bytes());
                 Self::process_block(&mut state, &cfg, CFG_STR_LEN);
 
                 // The chaining vars ctx->X are now initialized for the given hashBitLen.
@@ -200,11 +229,11 @@ macro_rules! define_hasher {
             type BlockSize = <$threefish as BlockCipher>::BlockSize;
         }
 
-        impl<N> digest::Input for $name<N>
+        impl<N> digest::Update for $name<N>
         where
             N: Unsigned + ArrayLength<u8> + NonZero + Default,
         {
-            fn input<T: AsRef<[u8]>>(&mut self, data: T) {
+            fn update(&mut self, data: impl AsRef<[u8]>) {
                 let buffer = &mut self.buffer;
                 let state = &mut self.state;
                 buffer.input_lazy(data.as_ref(), |block| {
@@ -213,32 +242,30 @@ macro_rules! define_hasher {
             }
         }
 
-        impl<N> digest::FixedOutput for $name<N>
+        impl<N> digest::FixedOutputDirty for $name<N>
         where
             N: Unsigned + ArrayLength<u8> + NonZero + Default,
         {
             type OutputSize = N;
 
-            fn fixed_result(mut self) -> GenericArray<u8, N> {
+            fn finalize_into_dirty(&mut self, output: &mut GenericArray<u8, Self::OutputSize>) {
                 self.state.t.1 |= T1_FLAG_FINAL;
                 let pos = self.buffer.position();
                 let final_block = self.buffer.pad_with::<ZeroPadding>().unwrap();
                 Self::process_block(&mut self.state, final_block, pos);
 
                 // run Threefish in "counter mode" to generate output
-                let mut output = GenericArray::default();
                 for (i, chunk) in output.chunks_mut($state_bits / 8).enumerate() {
                     let mut ctr = State::new(
                         T1_FLAG_FIRST | T1_BLK_TYPE_OUT | T1_FLAG_FINAL,
                         self.state.x,
                     );
                     let mut b = GenericArray::<u8, $state_bytes>::default();
-                    LE::write_u64(&mut b[..8], i as u64);
+                    b[..8].copy_from_slice(&(i as u64).to_le_bytes());
                     Self::process_block(&mut ctr, &b, 8);
                     let n = chunk.len();
                     chunk.copy_from_slice(&ctr.x.bytes()[..n]);
                 }
-                output
             }
         }
 
diff --git a/stream-ciphers/chacha/CHANGELOG.md b/stream-ciphers/chacha/CHANGELOG.md
new file mode 100644
index 00000000..44b58459
--- /dev/null
+++ b/stream-ciphers/chacha/CHANGELOG.md
@@ -0,0 +1,9 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+### Changed
+- Update `cipher` dependency: 0.2 -> 0.3.
diff --git a/stream-ciphers/chacha/Cargo.toml b/stream-ciphers/chacha/Cargo.toml
index 0fccf0d4..591c1733 100644
--- a/stream-ciphers/chacha/Cargo.toml
+++ b/stream-ciphers/chacha/Cargo.toml
@@ -1,29 +1,30 @@
 [package]
 name = "c2-chacha"
-version = "0.2.3"
+version = "0.3.3"
 authors = ["The CryptoCorrosion Contributors"]
 license = "MIT/Apache-2.0"
-edition = "2018"
+edition = "2021"
 description = "The ChaCha family of stream ciphers"
 repository = "https://github.com/cryptocorrosion/cryptocorrosion"
 keywords = ["chacha", "chacha20", "xchacha20", "cipher", "crypto"]
 categories = ["cryptography", "no-std"]
 readme = "README.md"
 documentation = "https://docs.rs/c2-chacha"
+rust-version = "1.61"
 
 [dependencies]
-byteorder = { version = "1.3", optional = true }
-ppv-lite86 = { package = "ppv-lite86", version = "0.2.6", default-features = false }
-stream-cipher = { version = "0.3", optional = true }
+ppv-lite86 = { package = "ppv-lite86", version = "0.2.14", default-features = false }
+cipher = { version = "0.3", optional = true }
 
 [dev-dependencies]
-hex-literal = "0.2"
+hex-literal = "0.3"
 
 [features]
-default = ["std", "simd", "rustcrypto_api"]
+default = ["std", "rustcrypto_api"]
 std = ["ppv-lite86/std"]
-rustcrypto_api = ["stream-cipher", "byteorder"]
-simd = ["ppv-lite86/simd"]
+rustcrypto_api = ["cipher"]
+no_simd = ["ppv-lite86/no_simd"]
+simd = [] # deprecated
 
 [badges]
 travis-ci = { repository = "cryptocorrosion/cryptocorrosion" }
diff --git a/stream-ciphers/chacha/benches/chacha20.rs b/stream-ciphers/chacha/benches/chacha20.rs
index b8d7c228..df70c1db 100644
--- a/stream-ciphers/chacha/benches/chacha20.rs
+++ b/stream-ciphers/chacha/benches/chacha20.rs
@@ -18,3 +18,15 @@ pub fn stream_10k(b: &mut Bencher) {
     });
     b.bytes = 10240;
 }
+
+#[bench]
+pub fn stream_narrow_10k(b: &mut Bencher) {
+    let mut state = ChaCha20::new_var(&[0; 32], &[0; 8]).unwrap();
+    let mut result = [0; 192];
+    b.iter(|| {
+        for _ in 0..10 {
+            state.apply_keystream(&mut result)
+        }
+    });
+    b.bytes = 1920;
+}
diff --git a/stream-ciphers/chacha/src/guts.rs b/stream-ciphers/chacha/src/guts.rs
index 394aab48..cf0dd000 100644
--- a/stream-ciphers/chacha/src/guts.rs
+++ b/stream-ciphers/chacha/src/guts.rs
@@ -1,8 +1,10 @@
 #[cfg(feature = "rustcrypto_api")]
-pub use stream_cipher::generic_array;
+pub use cipher::generic_array;
 
 pub use ppv_lite86::Machine;
-use ppv_lite86::{vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4};
+use ppv_lite86::{
+    vec128_storage, ArithOps, BitOps32, LaneWords4, MultiLane, StoreBytes, Vec4, Vec4Ext,
+};
 
 pub(crate) const BLOCK: usize = 64;
 pub(crate) const BLOCK64: u64 = BLOCK as u64;
@@ -11,14 +13,16 @@ const BUFBLOCKS: u64 = 1 << LOG2_BUFBLOCKS;
 pub(crate) const BUFSZ64: u64 = BLOCK64 * BUFBLOCKS;
 pub(crate) const BUFSZ: usize = BUFSZ64 as usize;
 
-#[derive(Clone)]
+/// Parameters of a ChaCha stream, including fixed parameters and current position.
+#[derive(Clone, PartialEq, Eq)]
 pub struct ChaCha {
     pub(crate) b: vec128_storage,
     pub(crate) c: vec128_storage,
     pub(crate) d: vec128_storage,
 }
 
-#[derive(Clone)]
+/// Working state of a ChaCha stream.
+#[derive(Clone, PartialEq, Eq)]
 pub struct State<V> {
     pub(crate) a: V,
     pub(crate) b: V,
@@ -41,23 +45,56 @@ pub(crate) fn round<V: ArithOps + BitOps32>(mut x: State<V>) -> State<V> {
 
 #[inline(always)]
 pub(crate) fn diagonalize<V: LaneWords4>(mut x: State<V>) -> State<V> {
-    x.b = x.b.shuffle_lane_words3012();
-    x.c = x.c.shuffle_lane_words2301();
-    x.d = x.d.shuffle_lane_words1230();
+    // Since b has the critical data dependency, avoid rotating b to hide latency.
+    //
+    // The order of these statements is important for performance on pre-AVX2 Intel machines, which
+    // are throughput-bound and operating near their superscalar limits during refill_wide. The
+    // permutations here and in undiagonalize have been found in testing on Nehalem to be optimal.
+    x.a = x.a.shuffle_lane_words1230();
+    x.c = x.c.shuffle_lane_words3012();
+    x.d = x.d.shuffle_lane_words2301();
     x
 }
+
 #[inline(always)]
 pub(crate) fn undiagonalize<V: LaneWords4>(mut x: State<V>) -> State<V> {
-    x.b = x.b.shuffle_lane_words1230();
-    x.c = x.c.shuffle_lane_words2301();
-    x.d = x.d.shuffle_lane_words3012();
+    // The order of these statements is magic. See comment in diagonalize.
+    x.c = x.c.shuffle_lane_words1230();
+    x.d = x.d.shuffle_lane_words2301();
+    x.a = x.a.shuffle_lane_words3012();
     x
 }
 
 impl ChaCha {
-    #[inline(always)]
     pub fn new(key: &[u8; 32], nonce: &[u8]) -> Self {
-        init_chacha(key, nonce)
+        let ctr_nonce = [
+            0,
+            if nonce.len() == 12 {
+                read_u32le(&nonce[0..4])
+            } else {
+                0
+            },
+            read_u32le(&nonce[nonce.len() - 8..nonce.len() - 4]),
+            read_u32le(&nonce[nonce.len() - 4..]),
+        ];
+        let key0 = [
+            read_u32le(&key[0..4]),
+            read_u32le(&key[4..8]),
+            read_u32le(&key[8..12]),
+            read_u32le(&key[12..16]),
+        ];
+        let key1 = [
+            read_u32le(&key[16..20]),
+            read_u32le(&key[20..24]),
+            read_u32le(&key[24..28]),
+            read_u32le(&key[28..32]),
+        ];
+
+        ChaCha {
+            b: key0.into(),
+            c: key1.into(),
+            d: ctr_nonce.into(),
+        }
     }
 
     #[inline(always)]
@@ -120,17 +157,86 @@ impl ChaCha {
         refill_narrow_rounds(self, drounds)
     }
 
-    #[inline(always)]
+    #[inline]
     pub fn set_stream_param(&mut self, param: u32, value: u64) {
-        set_stream_param(self, param, value)
+        let mut d: [u32; 4] = self.d.into();
+        let p0 = ((param << 1) | 1) as usize;
+        let p1 = (param << 1) as usize;
+        d[p0] = (value >> 32) as u32;
+        d[p1] = value as u32;
+        self.d = d.into();
     }
 
-    #[inline(always)]
+    #[inline]
     pub fn get_stream_param(&self, param: u32) -> u64 {
-        get_stream_param(self, param)
+        let d: [u32; 4] = self.d.into();
+        let p0 = ((param << 1) | 1) as usize;
+        let p1 = (param << 1) as usize;
+        ((d[p0] as u64) << 32) | d[p1] as u64
+    }
+
+    /// Return whether rhs represents the same stream, irrespective of current 32-bit position.
+    #[inline]
+    pub fn stream32_eq(&self, rhs: &Self) -> bool {
+        let self_d: [u32; 4] = self.d.into();
+        let rhs_d: [u32; 4] = rhs.d.into();
+        self.b == rhs.b
+            && self.c == rhs.c
+            && self_d[3] == rhs_d[3]
+            && self_d[2] == rhs_d[2]
+            && self_d[1] == rhs_d[1]
+    }
+
+    /// Return whether rhs represents the same stream, irrespective of current 64-bit position.
+    #[inline]
+    pub fn stream64_eq(&self, rhs: &Self) -> bool {
+        let self_d: [u32; 4] = self.d.into();
+        let rhs_d: [u32; 4] = rhs.d.into();
+        self.b == rhs.b && self.c == rhs.c && self_d[3] == rhs_d[3] && self_d[2] == rhs_d[2]
     }
 }
 
+// This implementation is platform-independent.
+#[inline(always)]
+#[cfg(target_endian = "big")]
+fn add_pos<Mach: Machine>(_m: Mach, d0: Mach::u32x4, i: u64) -> Mach::u32x4 {
+    let pos0 = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64;
+    let pos = pos0.wrapping_add(i);
+    d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0)
+}
+#[inline(always)]
+#[cfg(target_endian = "big")]
+fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
+    let d0: Mach::u32x4 = m.unpack(d);
+    let mut pos = ((d0.extract(1) as u64) << 32) | d0.extract(0) as u64;
+    pos = pos.wrapping_add(1);
+    let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
+    pos = pos.wrapping_add(1);
+    let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
+    pos = pos.wrapping_add(1);
+    let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
+    Mach::u32x4x4::from_lanes([d0, d1, d2, d3])
+}
+
+// Pos is packed into the state vectors as a little-endian u64,
+// so on LE platforms we can use native vector ops to increment it.
+#[inline(always)]
+#[cfg(target_endian = "little")]
+fn add_pos<Mach: Machine>(m: Mach, d: Mach::u32x4, i: u64) -> Mach::u32x4 {
+    let d0: Mach::u64x2 = m.unpack(d.into());
+    let incr = m.vec([i, 0]);
+    m.unpack((d0 + incr).into())
+}
+#[inline(always)]
+#[cfg(target_endian = "little")]
+fn d0123<Mach: Machine>(m: Mach, d: vec128_storage) -> Mach::u32x4x4 {
+    let d0: Mach::u64x2 = m.unpack(d);
+    let incr =
+        Mach::u64x2x4::from_lanes([m.vec([0, 0]), m.vec([1, 0]), m.vec([2, 0]), m.vec([3, 0])]);
+    m.unpack((Mach::u64x2x4::from_lanes([d0, d0, d0, d0]) + incr).into())
+}
+
+#[allow(clippy::many_single_char_names)]
 #[inline(always)]
 fn refill_wide_impl<Mach: Machine>(
     m: Mach,
@@ -139,55 +245,30 @@ fn refill_wide_impl<Mach: Machine>(
     out: &mut [u8; BUFSZ],
 ) {
     let k = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
-    let mut pos = state.pos64(m);
-    let d0: Mach::u32x4 = m.unpack(state.d);
-    pos += 1;
-    let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
-    pos += 1;
-    let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
-    pos += 1;
-    let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
-
     let b = m.unpack(state.b);
     let c = m.unpack(state.c);
     let mut x = State {
         a: Mach::u32x4x4::from_lanes([k, k, k, k]),
         b: Mach::u32x4x4::from_lanes([b, b, b, b]),
         c: Mach::u32x4x4::from_lanes([c, c, c, c]),
-        d: m.unpack(Mach::u32x4x4::from_lanes([d0, d1, d2, d3]).into()),
+        d: d0123(m, state.d),
     };
     for _ in 0..drounds {
         x = round(x);
         x = undiagonalize(round(diagonalize(x)));
     }
-    let mut pos = state.pos64(m);
-    let d0: Mach::u32x4 = m.unpack(state.d);
-    pos += 1;
-    let d1 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
-    pos += 1;
-    let d2 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
-    pos += 1;
-    let d3 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
-    pos += 1;
-    let d4 = d0.insert((pos >> 32) as u32, 1).insert(pos as u32, 0);
-
-    let (a, b, c, d) = (
-        x.a.to_lanes(),
-        x.b.to_lanes(),
-        x.c.to_lanes(),
-        x.d.to_lanes(),
-    );
+    let kk = Mach::u32x4x4::from_lanes([k, k, k, k]);
     let sb = m.unpack(state.b);
+    let sb = Mach::u32x4x4::from_lanes([sb, sb, sb, sb]);
     let sc = m.unpack(state.c);
-    let sd = [m.unpack(state.d), d1, d2, d3];
-    state.d = d4.into();
-    let mut words = out.chunks_exact_mut(16);
-    for ((((&a, &b), &c), &d), &sd) in a.iter().zip(&b).zip(&c).zip(&d).zip(&sd) {
-        (a + k).write_le(words.next().unwrap());
-        (b + sb).write_le(words.next().unwrap());
-        (c + sc).write_le(words.next().unwrap());
-        (d + sd).write_le(words.next().unwrap());
-    }
+    let sc = Mach::u32x4x4::from_lanes([sc, sc, sc, sc]);
+    let sd = d0123(m, state.d);
+    let results = Mach::u32x4x4::transpose4(x.a + kk, x.b + sb, x.c + sc, x.d + sd);
+    results.0.write_le(&mut out[0..64]);
+    results.1.write_le(&mut out[64..128]);
+    results.2.write_le(&mut out[128..192]);
+    results.3.write_le(&mut out[192..256]);
+    state.d = add_pos(m, sd.to_lanes()[0], 4).into();
 }
 
 dispatch!(m, Mach, {
@@ -196,7 +277,7 @@ dispatch!(m, Mach, {
     }
 });
 
-/// Refill the buffer from a single-block round, updating the block count.
+// Refill the buffer from a single-block round, updating the block count.
 dispatch_light128!(m, Mach, {
     fn refill_narrow(state: &mut ChaCha, drounds: u32, out: &mut [u8; BLOCK]) {
         let x = refill_narrow_rounds(state, drounds);
@@ -211,8 +292,8 @@ dispatch_light128!(m, Mach, {
     }
 });
 
-/// Single-block, rounds-only; shared by try_apply_keystream for tails shorter than BUFSZ
-/// and XChaCha's setup step.
+// Single-block, rounds-only; shared by try_apply_keystream for tails shorter than BUFSZ
+// and XChaCha's setup step.
 dispatch!(m, Mach, {
     fn refill_narrow_rounds(state: &mut ChaCha, drounds: u32) -> State<vec128_storage> {
         let k: Mach::u32x4 = m.vec([0x6170_7865, 0x3320_646e, 0x7962_2d32, 0x6b20_6574]);
@@ -235,50 +316,11 @@ dispatch!(m, Mach, {
     }
 });
 
-dispatch_light128!(m, Mach, {
-    fn set_stream_param(state: &mut ChaCha, param: u32, value: u64) {
-        let d: Mach::u32x4 = m.unpack(state.d);
-        state.d = d
-            .insert((value >> 32) as u32, (param << 1) | 1)
-            .insert(value as u32, param << 1)
-            .into();
-    }
-});
-
-dispatch_light128!(m, Mach, {
-    fn get_stream_param(state: &ChaCha, param: u32) -> u64 {
-        let d: Mach::u32x4 = m.unpack(state.d);
-        ((d.extract((param << 1) | 1) as u64) << 32) | d.extract(param << 1) as u64
-    }
-});
-
 fn read_u32le(xs: &[u8]) -> u32 {
     assert_eq!(xs.len(), 4);
     u32::from(xs[0]) | (u32::from(xs[1]) << 8) | (u32::from(xs[2]) << 16) | (u32::from(xs[3]) << 24)
 }
 
-dispatch_light128!(m, Mach, {
-    fn init_chacha(key: &[u8; 32], nonce: &[u8]) -> ChaCha {
-        let ctr_nonce = [
-            0,
-            if nonce.len() == 12 {
-                read_u32le(&nonce[0..4])
-            } else {
-                0
-            },
-            read_u32le(&nonce[nonce.len() - 8..nonce.len() - 4]),
-            read_u32le(&nonce[nonce.len() - 4..]),
-        ];
-        let key0: Mach::u32x4 = m.read_le(&key[..16]);
-        let key1: Mach::u32x4 = m.read_le(&key[16..]);
-        ChaCha {
-            b: key0.into(),
-            c: key1.into(),
-            d: ctr_nonce.into(),
-        }
-    }
-});
-
 dispatch_light128!(m, Mach, {
     fn init_chacha_x(key: &[u8; 32], nonce: &[u8; 24], rounds: u32) -> ChaCha {
         let key0: Mach::u32x4 = m.read_le(&key[..16]);
@@ -297,3 +339,25 @@ dispatch_light128!(m, Mach, {
         state
     }
 });
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Basic check that streamXX_eq is block-count invariant
+    #[test]
+    fn test_stream_eq() {
+        let key = hex!("fa44478c59ca70538e3549096ce8b523232c50d9e8e8d10c203ef6c8d07098a5");
+        let nonce = hex!("8d3a0d6d7827c00701020304");
+        let mut a = ChaCha::new(&key, &nonce);
+        let b = a.clone();
+        let mut out = [0u8; BLOCK];
+        assert!(a == b);
+        assert!(a.stream32_eq(&b));
+        assert!(a.stream64_eq(&b));
+        a.refill(0, &mut out);
+        assert!(a != b);
+        assert!(a.stream32_eq(&b));
+        assert!(a.stream64_eq(&b));
+    }
+}
diff --git a/stream-ciphers/chacha/src/lib.rs b/stream-ciphers/chacha/src/lib.rs
index 363a1a7b..f064a271 100644
--- a/stream-ciphers/chacha/src/lib.rs
+++ b/stream-ciphers/chacha/src/lib.rs
@@ -4,6 +4,8 @@
 //!
 //! Stream-cipher usage:
 //! ```
+//! #[cfg(features = "std")]
+//! fn demo() {
 //! extern crate c2_chacha;
 //!
 //! use c2_chacha::stream_cipher::{NewStreamCipher, SyncStreamCipher, SyncStreamCipherSeek};
@@ -26,6 +28,7 @@
 //! for chunk in buffer.chunks_mut(3) {
 //!     cipher.apply_keystream(chunk);
 //! }
+//! }
 //! ```
 
 #![cfg_attr(not(feature = "std"), no_std)]
@@ -42,4 +45,6 @@ pub mod guts;
 #[cfg(feature = "rustcrypto_api")]
 mod rustcrypto_impl;
 #[cfg(feature = "rustcrypto_api")]
-pub use self::rustcrypto_impl::{stream_cipher, ChaCha12, ChaCha20, ChaCha8, Ietf, XChaCha20};
+pub use self::rustcrypto_impl::{
+    ChaCha12, ChaCha20, ChaCha8, Ietf, XChaCha12, XChaCha20, XChaCha8,
+};
diff --git a/stream-ciphers/chacha/src/rustcrypto_impl.rs b/stream-ciphers/chacha/src/rustcrypto_impl.rs
index ce74a63f..2e4e2188 100644
--- a/stream-ciphers/chacha/src/rustcrypto_impl.rs
+++ b/stream-ciphers/chacha/src/rustcrypto_impl.rs
@@ -1,10 +1,10 @@
-use byteorder::{ByteOrder, LE};
-use core::cmp;
 use crate::guts::generic_array::typenum::{Unsigned, U10, U12, U24, U32, U4, U6, U8};
 use crate::guts::generic_array::{ArrayLength, GenericArray};
 use crate::guts::{ChaCha, Machine, BLOCK, BLOCK64, BUFSZ};
-pub use stream_cipher;
-use stream_cipher::{LoopError, NewStreamCipher, SyncStreamCipher, SyncStreamCipherSeek};
+use cipher::errors::{LoopError, OverflowError};
+use cipher::{NewCipher, SeekNum, StreamCipher, StreamCipherSeek};
+use core::cmp;
+use core::convert::TryInto;
 
 const BIG_LEN: u64 = 0;
 const SMALL_LEN: u64 = 1 << 32;
@@ -176,7 +176,7 @@ impl<NonceSize, Rounds: Unsigned, IsX> ChaChaAny<NonceSize, Rounds, IsX> {
     }
 }
 
-impl<NonceSize, Rounds> NewStreamCipher for ChaChaAny<NonceSize, Rounds, O>
+impl<NonceSize, Rounds> NewCipher for ChaChaAny<NonceSize, Rounds, O>
 where
     NonceSize: Unsigned + ArrayLength<u8> + Default,
     Rounds: Default,
@@ -192,7 +192,7 @@ where
     }
 }
 
-impl<Rounds: Unsigned + Default> NewStreamCipher for ChaChaAny<U24, Rounds, X> {
+impl<Rounds: Unsigned + Default> NewCipher for ChaChaAny<U24, Rounds, X> {
     type KeySize = U32;
     type NonceSize = U24;
     #[inline]
@@ -204,18 +204,20 @@ impl<Rounds: Unsigned + Default> NewStreamCipher for ChaChaAny<U24, Rounds, X> {
     }
 }
 
-impl<NonceSize: Unsigned, Rounds, IsX> SyncStreamCipherSeek for ChaChaAny<NonceSize, Rounds, IsX> {
+impl<NonceSize: Unsigned, Rounds, IsX> StreamCipherSeek for ChaChaAny<NonceSize, Rounds, IsX> {
     #[inline]
-    fn current_pos(&self) -> u64 {
+    fn try_current_pos<T: SeekNum>(&self) -> Result<T, OverflowError> {
         unimplemented!()
     }
     #[inline(always)]
-    fn seek(&mut self, ct: u64) {
-        Self::seek(self, ct)
+    fn try_seek<T: SeekNum>(&mut self, pos: T) -> Result<(), LoopError> {
+        pos.try_into()
+            .map_err(|_| LoopError)
+            .map(|ct| Self::seek(self, ct))
     }
 }
 
-impl<NonceSize, Rounds: Unsigned, IsX> SyncStreamCipher for ChaChaAny<NonceSize, Rounds, IsX> {
+impl<NonceSize, Rounds: Unsigned, IsX> StreamCipher for ChaChaAny<NonceSize, Rounds, IsX> {
     #[inline]
     fn try_apply_keystream(&mut self, data: &mut [u8]) -> Result<(), LoopError> {
         Self::try_apply_keystream(self, data).map_err(|_| LoopError)
@@ -241,12 +243,12 @@ dispatch_light128!(m, Mach, {
         let ctr_nonce = [
             0,
             if nonce.len() == 12 {
-                LE::read_u32(&nonce[0..4])
+                u32::from_le_bytes(nonce[0..4].try_into().unwrap())
             } else {
                 0
             },
-            LE::read_u32(&nonce[nonce.len() - 8..nonce.len() - 4]),
-            LE::read_u32(&nonce[nonce.len() - 4..]),
+            u32::from_le_bytes(nonce[nonce.len() - 8..nonce.len() - 4].try_into().unwrap()),
+            u32::from_le_bytes(nonce[nonce.len() - 4..].try_into().unwrap()),
         ];
         let key0: Mach::u32x4 = m.read_le(&key[..16]);
         let key1: Mach::u32x4 = m.read_le(&key[16..]);
@@ -276,8 +278,8 @@ dispatch_light128!(m, Mach, {
         let ctr_nonce1 = [
             0,
             0,
-            LE::read_u32(&nonce[16..20]),
-            LE::read_u32(&nonce[20..24]),
+            u32::from_le_bytes(nonce[16..20].try_into().unwrap()),
+            u32::from_le_bytes(nonce[20..24].try_into().unwrap()),
         ];
         state.b = x.a;
         state.c = x.d;
@@ -288,12 +290,18 @@ dispatch_light128!(m, Mach, {
 
 /// IETF RFC 7539 ChaCha. Unsuitable for messages longer than 256 GiB.
 pub type Ietf = ChaChaAny<U12, U10, O>;
-/// ChaCha20, as used in several standards; from Bernstein's original publication.
-pub type ChaCha20 = ChaChaAny<U8, U10, O>;
-/// Similar to ChaCha20, but with fewer rounds for higher performance.
-pub type ChaCha12 = ChaChaAny<U8, U6, O>;
 /// Similar to ChaCha20, but with fewer rounds for higher performance.
 pub type ChaCha8 = ChaChaAny<U8, U4, O>;
+/// Similar to ChaCha20, but with fewer rounds for higher performance.
+pub type ChaCha12 = ChaChaAny<U8, U6, O>;
+/// ChaCha20, as used in several standards; from Bernstein's original publication.
+pub type ChaCha20 = ChaChaAny<U8, U10, O>;
+/// Constructed analogously to XChaCha20, but with fewer rounds for higher performance;
+/// mixes during initialization to support both a long nonce and a full-length (64-bit) block counter.
+pub type XChaCha8 = ChaChaAny<U24, U4, X>;
+/// Constructed analogously to XChaCha20, but with fewer rounds for higher performance;
+/// mixes during initialization to support both a long nonce and a full-length (64-bit) block counter.
+pub type XChaCha12 = ChaChaAny<U24, U6, X>;
 /// Constructed analogously to XSalsa20; mixes during initialization to support both a long nonce
 /// and a full-length (64-bit) block counter.
 pub type XChaCha20 = ChaChaAny<U24, U10, X>;
diff --git a/test_alt_simd.sh b/test_alt_simd.sh
deleted file mode 100755
index 52c05d81..00000000
--- a/test_alt_simd.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/sh
-
-if [ -n "$FAILFAST" ]; then set -e; fi
-
-# no SIMD yet:
-# - hashes/threefish
-# - block-ciphers/skein
-
-# not ported to crypto-simd API yet:
-# - hashes/groestl
-
-echo BACKEND ppv-null
-cd hashes/blake; cargo test --no-default-features; cd ../..
-cd hashes/jh; cargo test --no-default-features; cd ../..
-cd stream-ciphers/chacha; cargo test --no-default-features; cd ../..
-
-echo BACKEND packed_simd
-cd hashes/blake; cargo test --no-default-features --features packed_simd,std; cd ../..
-cd hashes/jh; cargo test -p jh-x86_64 --no-default-features --features packed_simd,std; cd ../..
-cd stream-ciphers/chacha; cargo test --no-default-features --features packed_simd,std; cd ../..
diff --git a/test_stable.sh b/test_stable.sh
index b0048dc9..fda2e9d7 100755
--- a/test_stable.sh
+++ b/test_stable.sh
@@ -1,4 +1,4 @@
 #!/bin/sh
 
+cargo +1.32.0-x86_64-unknown-linux-gnu test
 cargo +stable test
-cargo +1.31.1-x86_64-unknown-linux-gnu test
diff --git a/unreleased-changes.sh b/unreleased-changes.sh
new file mode 100755
index 00000000..e7d82105
--- /dev/null
+++ b/unreleased-changes.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+
+CRATE_TAGS=$(git tag | sed 's/-[^-]*$//' | uniq)
+CARGOS=$(echo */*/Cargo.toml)
+
+for TAG in $CRATE_TAGS; do
+	LATEST_TAG=$(git tag | grep $TAG | sort -V | tail -n1)
+	FOUND_CARGO=
+	for CARGO in $CARGOS; do
+		# fixups for tag names that don't match crate names
+		CRATE=$(echo $TAG | sed -e 's/^jh$/jh-x86_64/' -e 's/^blake$/blake-hash/' -e 's/^groestl$/groestl-aesni/')
+		if grep -q "name = \"$CRATE\"" $CARGO; then
+			FOUND_CARGO=1
+			DIR=$(dirname $CARGO)
+			git log --color=always --stat $LATEST_TAG..HEAD -- $DIR | less -R
+			break
+		fi
+	done
+	if [ -z "$FOUND_CARGO" ]; then
+		echo "Couldn't find a Cargo.toml for $TAG!"
+	fi
+done
+
diff --git a/utils-simd/crypto-simd/Cargo.toml b/utils-simd/crypto-simd/Cargo.toml
index 9b3858ef..b07c3e7c 100644
--- a/utils-simd/crypto-simd/Cargo.toml
+++ b/utils-simd/crypto-simd/Cargo.toml
@@ -2,12 +2,13 @@
 name = "crypto-simd"
 version = "0.2.0"
 authors = ["The CryptoCorrosion Contributors"]
-edition = "2018"
+edition = "2021"
 license = "MIT/Apache-2.0"
 description = "Crypto-oriented SIMD wrapper abstracting over multiple backends"
 repository = "https://github.com/cryptocorrosion/cryptocorrosion"
 keywords = ["crypto", "simd"]
 categories = ["cryptography", "no-std"]
+rust-version = "1.61"
 
 [dependencies]
 packed_simd_crate = { package = "packed_simd", version = "0.3", optional = true }
diff --git a/utils-simd/ppv-lite86/CHANGELOG.md b/utils-simd/ppv-lite86/CHANGELOG.md
new file mode 100644
index 00000000..6e34be39
--- /dev/null
+++ b/utils-simd/ppv-lite86/CHANGELOG.md
@@ -0,0 +1,10 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+
+## [0.2.16]
+### Added
+- add [u64; 4] conversion for generic vec256, to support BLAKE on non-x86.
+- impl `From` (rather than just `Into`) for conversions between `*_storage` types and arrays.
diff --git a/utils-simd/ppv-lite86/Cargo.toml b/utils-simd/ppv-lite86/Cargo.toml
index b1c0c0f5..feb8cc2c 100644
--- a/utils-simd/ppv-lite86/Cargo.toml
+++ b/utils-simd/ppv-lite86/Cargo.toml
@@ -1,20 +1,23 @@
 [package]
 name = "ppv-lite86"
-version = "0.2.6"
+version = "0.2.20"
 authors = ["The CryptoCorrosion Contributors"]
-edition = "2018"
+edition = "2021"
 license = "MIT/Apache-2.0"
 description = "Implementation of the crypto-simd API for x86"
 repository = "https://github.com/cryptocorrosion/cryptocorrosion"
 keywords = ["crypto", "simd", "x86"]
 categories = ["cryptography", "no-std"]
+rust-version = "1.61"
 
 [dependencies]
+zerocopy = { version = "0.7", features = ["simd", "derive"] }
 
 [badges]
 travis-ci = { repository = "cryptocorrosion/cryptocorrosion" }
 
 [features]
-default = ["std", "simd"]
+default = ["std"]
 std = []
-simd = []
+simd = [] # deprecated
+no_simd = []
diff --git a/utils-simd/ppv-lite86/src/generic.rs b/utils-simd/ppv-lite86/src/generic.rs
index 2d0a74cf..8989482a 100644
--- a/utils-simd/ppv-lite86/src/generic.rs
+++ b/utils-simd/ppv-lite86/src/generic.rs
@@ -1,22 +1,54 @@
 #![allow(non_camel_case_types)]
 
-use core::ops::*;
 use crate::soft::{x2, x4};
 use crate::types::*;
+use core::ops::*;
+use zerocopy::{AsBytes, FromBytes, FromZeroes};
 
-#[derive(Clone, Copy)]
+#[repr(C)]
+#[derive(Clone, Copy, FromBytes, AsBytes, FromZeroes)]
 pub union vec128_storage {
     d: [u32; 4],
     q: [u64; 2],
-    o: [u128; 1],
 }
 impl From<[u32; 4]> for vec128_storage {
-    #[inline]
+    #[inline(always)]
     fn from(d: [u32; 4]) -> Self {
         Self { d }
     }
 }
-#[derive(Clone, Copy)]
+impl From<vec128_storage> for [u32; 4] {
+    #[inline(always)]
+    fn from(d: vec128_storage) -> Self {
+        unsafe { d.d }
+    }
+}
+impl From<[u64; 2]> for vec128_storage {
+    #[inline(always)]
+    fn from(q: [u64; 2]) -> Self {
+        Self { q }
+    }
+}
+impl From<vec128_storage> for [u64; 2] {
+    #[inline(always)]
+    fn from(q: vec128_storage) -> Self {
+        unsafe { q.q }
+    }
+}
+impl Default for vec128_storage {
+    #[inline(always)]
+    fn default() -> Self {
+        Self { q: [0, 0] }
+    }
+}
+impl Eq for vec128_storage {}
+impl PartialEq<vec128_storage> for vec128_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.q == rhs.q }
+    }
+}
+#[derive(Clone, Copy, PartialEq, Eq, Default)]
 pub struct vec256_storage {
     v128: [vec128_storage; 2],
 }
@@ -30,7 +62,23 @@ impl vec256_storage {
         self.v128
     }
 }
-#[derive(Clone, Copy)]
+impl From<vec256_storage> for [u64; 4] {
+    #[inline(always)]
+    fn from(q: vec256_storage) -> Self {
+        let [a, b]: [u64; 2] = q.v128[0].into();
+        let [c, d]: [u64; 2] = q.v128[1].into();
+        [a, b, c, d]
+    }
+}
+impl From<[u64; 4]> for vec256_storage {
+    #[inline(always)]
+    fn from([a, b, c, d]: [u64; 4]) -> Self {
+        Self {
+            v128: [[a, b].into(), [c, d].into()],
+        }
+    }
+}
+#[derive(Clone, Copy, PartialEq, Eq, Default)]
 pub struct vec512_storage {
     v128: [vec128_storage; 4],
 }
@@ -45,6 +93,7 @@ impl vec512_storage {
     }
 }
 
+#[inline(always)]
 fn dmap<T, F>(t: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -78,6 +127,7 @@ where
     unsafe { T::unpack(d) }
 }
 
+#[inline(always)]
 fn qmap<T, F>(t: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -91,6 +141,7 @@ where
     unsafe { T::unpack(q) }
 }
 
+#[inline(always)]
 fn qmap2<T, F>(a: T, b: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -106,17 +157,29 @@ where
     unsafe { T::unpack(q) }
 }
 
+#[inline(always)]
+fn o_of_q(q: [u64; 2]) -> u128 {
+    u128::from(q[0]) | (u128::from(q[1]) << 64)
+}
+
+#[inline(always)]
+fn q_of_o(o: u128) -> [u64; 2] {
+    [o as u64, (o >> 64) as u64]
+}
+
+#[inline(always)]
 fn omap<T, F>(a: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
     F: Fn(u128) -> u128,
 {
     let a: vec128_storage = a.into();
-    let ao = unsafe { a.o };
-    let o = vec128_storage { o: [f(ao[0])] };
+    let ao = o_of_q(unsafe { a.q });
+    let o = vec128_storage { q: q_of_o(f(ao)) };
     unsafe { T::unpack(o) }
 }
 
+#[inline(always)]
 fn omap2<T, F>(a: T, b: T, f: F) -> T
 where
     T: Store<vec128_storage> + Into<vec128_storage>,
@@ -124,10 +187,10 @@ where
 {
     let a: vec128_storage = a.into();
     let b: vec128_storage = b.into();
-    let ao = unsafe { a.o };
-    let bo = unsafe { b.o };
+    let ao = o_of_q(unsafe { a.q });
+    let bo = o_of_q(unsafe { b.q });
     let o = vec128_storage {
-        o: [f(ao[0], bo[0])],
+        q: q_of_o(f(ao, bo)),
     };
     unsafe { T::unpack(o) }
 }
@@ -200,39 +263,39 @@ macro_rules! impl_bitops {
         }
 
         impl Swap64 for $vec {
-            #[inline]
+            #[inline(always)]
             fn swap1(self) -> Self {
                 qmap(self, |x| {
                     ((x & 0x5555555555555555) << 1) | ((x & 0xaaaaaaaaaaaaaaaa) >> 1)
                 })
             }
-            #[inline]
+            #[inline(always)]
             fn swap2(self) -> Self {
                 qmap(self, |x| {
                     ((x & 0x3333333333333333) << 2) | ((x & 0xcccccccccccccccc) >> 2)
                 })
             }
-            #[inline]
+            #[inline(always)]
             fn swap4(self) -> Self {
                 qmap(self, |x| {
                     ((x & 0x0f0f0f0f0f0f0f0f) << 4) | ((x & 0xf0f0f0f0f0f0f0f0) >> 4)
                 })
             }
-            #[inline]
+            #[inline(always)]
             fn swap8(self) -> Self {
                 qmap(self, |x| {
                     ((x & 0x00ff00ff00ff00ff) << 8) | ((x & 0xff00ff00ff00ff00) >> 8)
                 })
             }
-            #[inline]
+            #[inline(always)]
             fn swap16(self) -> Self {
                 dmap(self, |x| x.rotate_left(16))
             }
-            #[inline]
+            #[inline(always)]
             fn swap32(self) -> Self {
                 qmap(self, |x| x.rotate_left(32))
             }
-            #[inline]
+            #[inline(always)]
             fn swap64(self) -> Self {
                 omap(self, |x| (x << 64) | (x >> 64))
             }
@@ -244,82 +307,83 @@ impl_bitops!(u64x2_generic);
 impl_bitops!(u128x1_generic);
 
 impl RotateEachWord32 for u32x4_generic {
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right7(self) -> Self {
         dmap(self, |x| x.rotate_right(7))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right8(self) -> Self {
         dmap(self, |x| x.rotate_right(8))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right11(self) -> Self {
         dmap(self, |x| x.rotate_right(11))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right12(self) -> Self {
         dmap(self, |x| x.rotate_right(12))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right16(self) -> Self {
         dmap(self, |x| x.rotate_right(16))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right20(self) -> Self {
         dmap(self, |x| x.rotate_right(20))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right24(self) -> Self {
         dmap(self, |x| x.rotate_right(24))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right25(self) -> Self {
         dmap(self, |x| x.rotate_right(25))
     }
 }
 
 impl RotateEachWord32 for u64x2_generic {
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right7(self) -> Self {
         qmap(self, |x| x.rotate_right(7))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right8(self) -> Self {
         qmap(self, |x| x.rotate_right(8))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right11(self) -> Self {
         qmap(self, |x| x.rotate_right(11))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right12(self) -> Self {
         qmap(self, |x| x.rotate_right(12))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right16(self) -> Self {
         qmap(self, |x| x.rotate_right(16))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right20(self) -> Self {
         qmap(self, |x| x.rotate_right(20))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right24(self) -> Self {
         qmap(self, |x| x.rotate_right(24))
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right25(self) -> Self {
         qmap(self, |x| x.rotate_right(25))
     }
 }
 impl RotateEachWord64 for u64x2_generic {
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right32(self) -> Self {
         qmap(self, |x| x.rotate_right(32))
     }
 }
 
 // workaround for koute/cargo-web#52 (u128::rotate_* broken with cargo web)
+#[inline(always)]
 fn rotate_u128_right(x: u128, i: u32) -> u128 {
     (x >> i) | (x << (128 - i))
 }
@@ -330,41 +394,41 @@ fn test_rotate_u128() {
 }
 
 impl RotateEachWord32 for u128x1_generic {
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right7(self) -> Self {
         Self([rotate_u128_right(self.0[0], 7)])
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right8(self) -> Self {
         Self([rotate_u128_right(self.0[0], 8)])
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right11(self) -> Self {
         Self([rotate_u128_right(self.0[0], 11)])
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right12(self) -> Self {
         Self([rotate_u128_right(self.0[0], 12)])
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right16(self) -> Self {
         Self([rotate_u128_right(self.0[0], 16)])
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right20(self) -> Self {
         Self([rotate_u128_right(self.0[0], 20)])
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right24(self) -> Self {
         Self([rotate_u128_right(self.0[0], 24)])
     }
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right25(self) -> Self {
         Self([rotate_u128_right(self.0[0], 25)])
     }
 }
 impl RotateEachWord64 for u128x1_generic {
-    #[inline]
+    #[inline(always)]
     fn rotate_each_word_right32(self) -> Self {
         Self([rotate_u128_right(self.0[0], 32)])
     }
@@ -383,17 +447,20 @@ impl Machine for GenericMachine {
     type u32x4x4 = u32x4x4_generic;
     type u64x2x4 = u64x2x4_generic;
     type u128x4 = u128x4_generic;
-    #[inline]
+    #[inline(always)]
     unsafe fn instance() -> Self {
         Self
     }
 }
 
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)]
+#[repr(transparent)]
 pub struct u32x4_generic([u32; 4]);
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)]
+#[repr(transparent)]
 pub struct u64x2_generic([u64; 2]);
-#[derive(Copy, Clone, Debug, PartialEq)]
+#[derive(Copy, Clone, Debug, PartialEq, FromBytes, AsBytes, FromZeroes)]
+#[repr(transparent)]
 pub struct u128x1_generic([u128; 1]);
 
 impl From<u32x4_generic> for vec128_storage {
@@ -411,7 +478,7 @@ impl From<u64x2_generic> for vec128_storage {
 impl From<u128x1_generic> for vec128_storage {
     #[inline(always)]
     fn from(o: u128x1_generic) -> Self {
-        Self { o: o.0 }
+        Self { q: q_of_o(o.0[0]) }
     }
 }
 
@@ -430,7 +497,7 @@ impl Store<vec128_storage> for u64x2_generic {
 impl Store<vec128_storage> for u128x1_generic {
     #[inline(always)]
     unsafe fn unpack(s: vec128_storage) -> Self {
-        Self(s.o)
+        Self([o_of_q(s.q); 1])
     }
 }
 
@@ -498,53 +565,45 @@ impl BSwap for u128x1_generic {
 impl StoreBytes for u32x4_generic {
     #[inline(always)]
     unsafe fn unsafe_read_le(input: &[u8]) -> Self {
-        assert_eq!(input.len(), 16);
-        let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
+        let x = u32x4_generic::read_from(input).unwrap();
         dmap(x, |x| x.to_le())
     }
     #[inline(always)]
     unsafe fn unsafe_read_be(input: &[u8]) -> Self {
-        assert_eq!(input.len(), 16);
-        let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
+        let x = u32x4_generic::read_from(input).unwrap();
         dmap(x, |x| x.to_be())
     }
     #[inline(always)]
     fn write_le(self, out: &mut [u8]) {
-        assert_eq!(out.len(), 16);
         let x = dmap(self, |x| x.to_le());
-        unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
+        x.write_to(out).unwrap();
     }
     #[inline(always)]
     fn write_be(self, out: &mut [u8]) {
-        assert_eq!(out.len(), 16);
         let x = dmap(self, |x| x.to_be());
-        unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
+        x.write_to(out).unwrap();
     }
 }
 impl StoreBytes for u64x2_generic {
     #[inline(always)]
     unsafe fn unsafe_read_le(input: &[u8]) -> Self {
-        assert_eq!(input.len(), 16);
-        let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
+        let x = u64x2_generic::read_from(input).unwrap();
         qmap(x, |x| x.to_le())
     }
     #[inline(always)]
     unsafe fn unsafe_read_be(input: &[u8]) -> Self {
-        assert_eq!(input.len(), 16);
-        let x = core::mem::transmute(core::ptr::read(input as *const _ as *const [u8; 16]));
+        let x = u64x2_generic::read_from(input).unwrap();
         qmap(x, |x| x.to_be())
     }
     #[inline(always)]
     fn write_le(self, out: &mut [u8]) {
-        assert_eq!(out.len(), 16);
         let x = qmap(self, |x| x.to_le());
-        unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
+        x.write_to(out).unwrap();
     }
     #[inline(always)]
     fn write_be(self, out: &mut [u8]) {
-        assert_eq!(out.len(), 16);
         let x = qmap(self, |x| x.to_be());
-        unsafe { core::ptr::write(out as *mut _ as *mut [u8; 16], core::mem::transmute(x)) }
+        x.write_to(out).unwrap();
     }
 }
 
@@ -560,6 +619,22 @@ pub type u32x4x4_generic = x4<u32x4_generic>;
 pub type u64x2x4_generic = x4<u64x2_generic>;
 pub type u128x4_generic = x4<u128x1_generic>;
 
+impl Vector<[u32; 16]> for u32x4x4_generic {
+    fn to_scalars(self) -> [u32; 16] {
+        let [a, b, c, d] = self.0;
+        let a = a.0;
+        let b = b.0;
+        let c = c.0;
+        let d = d.0;
+        [
+            a[0], a[1], a[2], a[3], //
+            b[0], b[1], b[2], b[3], //
+            c[0], c[1], c[2], c[3], //
+            d[0], d[1], d[2], d[3], //
+        ]
+    }
+}
+
 impl MultiLane<[u32; 4]> for u32x4_generic {
     #[inline(always)]
     fn to_lanes(self) -> [u32; 4] {
@@ -700,7 +775,7 @@ impl u128x4<GenericMachine> for u128x4_generic {}
 #[macro_export]
 macro_rules! dispatch {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
-        #[inline]
+        #[inline(always)]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             let $mach = unsafe { $crate::generic::GenericMachine::instance() };
             #[inline(always)]
@@ -717,7 +792,7 @@ macro_rules! dispatch {
 #[macro_export]
 macro_rules! dispatch_light128 {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
-        #[inline]
+        #[inline(always)]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             let $mach = unsafe { $crate::generic::GenericMachine::instance() };
             #[inline(always)]
@@ -734,7 +809,7 @@ macro_rules! dispatch_light128 {
 #[macro_export]
 macro_rules! dispatch_light256 {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
-        #[inline]
+        #[inline(always)]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             let $mach = unsafe { $crate::generic::GenericMachine::instance() };
             #[inline(always)]
@@ -751,7 +826,7 @@ macro_rules! dispatch_light256 {
 #[macro_export]
 macro_rules! dispatch_light512 {
     ($mach:ident, $MTy:ident, { $([$pub:tt$(($krate:tt))*])* fn $name:ident($($arg:ident: $argty:ty),* $(,)*) -> $ret:ty $body:block }) => {
-        #[inline]
+        #[inline(always)]
         $($pub$(($krate))*)* fn $name($($arg: $argty),*) -> $ret {
             let $mach = unsafe { $crate::generic::GenericMachine::instance() };
             #[inline(always)]
diff --git a/utils-simd/ppv-lite86/src/lib.rs b/utils-simd/ppv-lite86/src/lib.rs
index 43dc5d86..311df97b 100644
--- a/utils-simd/ppv-lite86/src/lib.rs
+++ b/utils-simd/ppv-lite86/src/lib.rs
@@ -9,14 +9,34 @@ mod soft;
 mod types;
 pub use self::types::*;
 
-#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))]
+#[cfg(all(
+    target_arch = "x86_64",
+    target_feature = "sse2",
+    not(feature = "no_simd"),
+    not(miri)
+))]
 pub mod x86_64;
-#[cfg(all(feature = "simd", target_arch = "x86_64", not(miri)))]
+#[cfg(all(
+    target_arch = "x86_64",
+    target_feature = "sse2",
+    not(feature = "no_simd"),
+    not(miri)
+))]
 use self::x86_64 as arch;
 
-#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))]
+#[cfg(any(
+    feature = "no_simd",
+    miri,
+    not(target_arch = "x86_64"),
+    all(target_arch = "x86_64", not(target_feature = "sse2"))
+))]
 pub mod generic;
-#[cfg(any(miri, not(all(feature = "simd", any(target_arch = "x86_64")))))]
+#[cfg(any(
+    feature = "no_simd",
+    miri,
+    not(target_arch = "x86_64"),
+    all(target_arch = "x86_64", not(target_feature = "sse2"))
+))]
 use self::generic as arch;
 
 pub use self::arch::{vec128_storage, vec256_storage, vec512_storage};
diff --git a/utils-simd/ppv-lite86/src/soft.rs b/utils-simd/ppv-lite86/src/soft.rs
index d12dac52..b2cf0e19 100644
--- a/utils-simd/ppv-lite86/src/soft.rs
+++ b/utils-simd/ppv-lite86/src/soft.rs
@@ -1,11 +1,13 @@
 //! Implement 256- and 512- bit in terms of 128-bit, for machines without native wide SIMD.
 
-use core::marker::PhantomData;
-use core::ops::*;
 use crate::types::*;
 use crate::{vec128_storage, vec256_storage, vec512_storage};
+use core::marker::PhantomData;
+use core::ops::*;
+use zerocopy::{AsBytes, FromBytes, FromZeroes};
 
-#[derive(Copy, Clone, Default)]
+#[derive(Copy, Clone, Default, FromBytes, AsBytes, FromZeroes)]
+#[repr(transparent)]
 #[allow(non_camel_case_types)]
 pub struct x2<W, G>(pub [W; 2], PhantomData<G>);
 impl<W, G> x2<W, G> {
@@ -175,28 +177,53 @@ impl<W: BSwap + Copy, G> BSwap for x2<W, G> {
 impl<W: StoreBytes + BSwap + Copy, G> StoreBytes for x2<W, G> {
     #[inline(always)]
     unsafe fn unsafe_read_le(input: &[u8]) -> Self {
-        let input = input.split_at(16);
+        let input = input.split_at(input.len() / 2);
         x2::new([W::unsafe_read_le(input.0), W::unsafe_read_le(input.1)])
     }
     #[inline(always)]
     unsafe fn unsafe_read_be(input: &[u8]) -> Self {
-        x2::unsafe_read_le(input).bswap()
+        let input = input.split_at(input.len() / 2);
+        x2::new([W::unsafe_read_be(input.0), W::unsafe_read_be(input.1)])
     }
     #[inline(always)]
     fn write_le(self, out: &mut [u8]) {
-        let out = out.split_at_mut(16);
+        let out = out.split_at_mut(out.len() / 2);
         self.0[0].write_le(out.0);
         self.0[1].write_le(out.1);
     }
     #[inline(always)]
     fn write_be(self, out: &mut [u8]) {
-        let out = out.split_at_mut(16);
+        let out = out.split_at_mut(out.len() / 2);
         self.0[0].write_be(out.0);
         self.0[1].write_be(out.1);
     }
 }
+impl<W: Copy + LaneWords4, G: Copy> LaneWords4 for x2<W, G> {
+    #[inline(always)]
+    fn shuffle_lane_words2301(self) -> Self {
+        Self::new([
+            self.0[0].shuffle_lane_words2301(),
+            self.0[1].shuffle_lane_words2301(),
+        ])
+    }
+    #[inline(always)]
+    fn shuffle_lane_words1230(self) -> Self {
+        Self::new([
+            self.0[0].shuffle_lane_words1230(),
+            self.0[1].shuffle_lane_words1230(),
+        ])
+    }
+    #[inline(always)]
+    fn shuffle_lane_words3012(self) -> Self {
+        Self::new([
+            self.0[0].shuffle_lane_words3012(),
+            self.0[1].shuffle_lane_words3012(),
+        ])
+    }
+}
 
-#[derive(Copy, Clone, Default)]
+#[derive(Copy, Clone, Default, FromBytes, AsBytes, FromZeroes)]
+#[repr(transparent)]
 #[allow(non_camel_case_types)]
 pub struct x4<W>(pub [W; 4]);
 impl<W> x4<W> {
@@ -238,7 +265,12 @@ macro_rules! fwd_unop_x4 {
     ($fn:ident) => {
         #[inline(always)]
         fn $fn(self) -> Self {
-            x4([self.0[0].$fn(), self.0[1].$fn(), self.0[2].$fn(), self.0[3].$fn()])
+            x4([
+                self.0[0].$fn(),
+                self.0[1].$fn(),
+                self.0[2].$fn(),
+                self.0[3].$fn(),
+            ])
         }
     };
 }
@@ -305,6 +337,20 @@ impl<W: Copy> Vec4<W> for x4<W> {
         self
     }
 }
+impl<W: Copy> Vec4Ext<W> for x4<W> {
+    #[inline(always)]
+    fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
+    where
+        Self: Sized,
+    {
+        (
+            x4([a.0[0], b.0[0], c.0[0], d.0[0]]),
+            x4([a.0[1], b.0[1], c.0[1], d.0[1]]),
+            x4([a.0[2], b.0[2], c.0[2], d.0[2]]),
+            x4([a.0[3], b.0[3], c.0[3], d.0[3]]),
+        )
+    }
+}
 impl<W: Copy + Store<vec128_storage>> Store<vec512_storage> for x4<W> {
     #[inline(always)]
     unsafe fn unpack(p: vec512_storage) -> Self {
@@ -363,30 +409,39 @@ impl<W: BSwap + Copy> BSwap for x4<W> {
 impl<W: StoreBytes + BSwap + Copy> StoreBytes for x4<W> {
     #[inline(always)]
     unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+        let n = input.len() / 4;
         x4([
-            W::unsafe_read_le(&input[0..16]),
-            W::unsafe_read_le(&input[16..32]),
-            W::unsafe_read_le(&input[32..48]),
-            W::unsafe_read_le(&input[48..64]),
+            W::unsafe_read_le(&input[..n]),
+            W::unsafe_read_le(&input[n..n * 2]),
+            W::unsafe_read_le(&input[n * 2..n * 3]),
+            W::unsafe_read_le(&input[n * 3..]),
         ])
     }
     #[inline(always)]
     unsafe fn unsafe_read_be(input: &[u8]) -> Self {
-        x4::unsafe_read_le(input).bswap()
+        let n = input.len() / 4;
+        x4([
+            W::unsafe_read_be(&input[..n]),
+            W::unsafe_read_be(&input[n..n * 2]),
+            W::unsafe_read_be(&input[n * 2..n * 3]),
+            W::unsafe_read_be(&input[n * 3..]),
+        ])
     }
     #[inline(always)]
     fn write_le(self, out: &mut [u8]) {
-        self.0[0].write_le(&mut out[0..16]);
-        self.0[1].write_le(&mut out[16..32]);
-        self.0[2].write_le(&mut out[32..48]);
-        self.0[3].write_le(&mut out[48..64]);
+        let n = out.len() / 4;
+        self.0[0].write_le(&mut out[..n]);
+        self.0[1].write_le(&mut out[n..n * 2]);
+        self.0[2].write_le(&mut out[n * 2..n * 3]);
+        self.0[3].write_le(&mut out[n * 3..]);
     }
     #[inline(always)]
     fn write_be(self, out: &mut [u8]) {
-        self.0[0].write_be(&mut out[0..16]);
-        self.0[1].write_be(&mut out[16..32]);
-        self.0[2].write_be(&mut out[32..48]);
-        self.0[3].write_be(&mut out[48..64]);
+        let n = out.len() / 4;
+        self.0[0].write_be(&mut out[..n]);
+        self.0[1].write_be(&mut out[n..n * 2]);
+        self.0[2].write_be(&mut out[n * 2..n * 3]);
+        self.0[3].write_be(&mut out[n * 3..]);
     }
 }
 impl<W: Copy + LaneWords4> LaneWords4 for x4<W> {
diff --git a/utils-simd/ppv-lite86/src/types.rs b/utils-simd/ppv-lite86/src/types.rs
index 119b6bb8..f9f3bf1c 100644
--- a/utils-simd/ppv-lite86/src/types.rs
+++ b/utils-simd/ppv-lite86/src/types.rs
@@ -1,3 +1,4 @@
+#![allow(non_camel_case_types)]
 use core::ops::{Add, AddAssign, BitAnd, BitOr, BitXor, BitXorAssign, Not};
 
 pub trait AndNot {
@@ -44,182 +45,188 @@ pub trait RotateEachWord64 {
 
 pub trait RotateEachWord128 {}
 
-#[allow(non_camel_case_types)]
-mod types {
-    //! Vector type naming scheme:
-    //! uN[xP]xL
-    //! Unsigned; N-bit words * P bits per lane * L lanes
-    //!
-    //! A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of
-    //! wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and
-    //! slow inter-lane operations.
+// Vector type naming scheme:
+// uN[xP]xL
+// Unsigned; N-bit words * P bits per lane * L lanes
+//
+// A lane is always 128-bits, chosen because common SIMD architectures treat 128-bit units of
+// wide vectors specially (supporting e.g. intra-lane shuffles), and tend to have limited and
+// slow inter-lane operations.
 
-    use crate::arch::{vec128_storage, vec256_storage, vec512_storage};
-    use crate::{ArithOps, BitOps128, BitOps32, BitOps64, Machine, Store, StoreBytes};
+use crate::arch::{vec128_storage, vec256_storage, vec512_storage};
 
-    pub trait UnsafeFrom<T> {
-        unsafe fn unsafe_from(t: T) -> Self;
-    }
+#[allow(clippy::missing_safety_doc)]
+pub trait UnsafeFrom<T> {
+    unsafe fn unsafe_from(t: T) -> Self;
+}
 
-    /// A vector composed of two elements, which may be words or themselves vectors.
-    pub trait Vec2<W> {
-        fn extract(self, i: u32) -> W;
-        fn insert(self, w: W, i: u32) -> Self;
-    }
+/// A vector composed of two elements, which may be words or themselves vectors.
+pub trait Vec2<W> {
+    fn extract(self, i: u32) -> W;
+    fn insert(self, w: W, i: u32) -> Self;
+}
 
-    /// A vector composed of four elements, which may be words or themselves vectors.
-    pub trait Vec4<W> {
-        fn extract(self, i: u32) -> W;
-        fn insert(self, w: W, i: u32) -> Self;
-    }
+/// A vector composed of four elements, which may be words or themselves vectors.
+pub trait Vec4<W> {
+    fn extract(self, i: u32) -> W;
+    fn insert(self, w: W, i: u32) -> Self;
+}
+/// Vec4 functions which may not be implemented yet for all Vec4 types.
+/// NOTE: functions in this trait may be moved to Vec4 in any patch release. To avoid breakage,
+/// import Vec4Ext only together with Vec4, and don't qualify its methods.
+pub trait Vec4Ext<W> {
+    fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self)
+    where
+        Self: Sized;
+}
+pub trait Vector<T> {
+    fn to_scalars(self) -> T;
+}
 
-    // TODO: multiples of 4 should inherit this
-    /// A vector composed of four words; depending on their size, operations may cross lanes.
-    pub trait Words4 {
-        fn shuffle1230(self) -> Self;
-        fn shuffle2301(self) -> Self;
-        fn shuffle3012(self) -> Self;
-    }
+// TODO: multiples of 4 should inherit this
+/// A vector composed of four words; depending on their size, operations may cross lanes.
+pub trait Words4 {
+    fn shuffle1230(self) -> Self;
+    fn shuffle2301(self) -> Self;
+    fn shuffle3012(self) -> Self;
+}
 
-    /// A vector composed one or more lanes each composed of four words.
-    pub trait LaneWords4 {
-        fn shuffle_lane_words1230(self) -> Self;
-        fn shuffle_lane_words2301(self) -> Self;
-        fn shuffle_lane_words3012(self) -> Self;
-    }
+/// A vector composed one or more lanes each composed of four words.
+pub trait LaneWords4 {
+    fn shuffle_lane_words1230(self) -> Self;
+    fn shuffle_lane_words2301(self) -> Self;
+    fn shuffle_lane_words3012(self) -> Self;
+}
 
-    // TODO: make this a part of BitOps
-    /// Exchange neigboring ranges of bits of the specified size
-    pub trait Swap64 {
-        fn swap1(self) -> Self;
-        fn swap2(self) -> Self;
-        fn swap4(self) -> Self;
-        fn swap8(self) -> Self;
-        fn swap16(self) -> Self;
-        fn swap32(self) -> Self;
-        fn swap64(self) -> Self;
-    }
+// TODO: make this a part of BitOps
+/// Exchange neigboring ranges of bits of the specified size
+pub trait Swap64 {
+    fn swap1(self) -> Self;
+    fn swap2(self) -> Self;
+    fn swap4(self) -> Self;
+    fn swap8(self) -> Self;
+    fn swap16(self) -> Self;
+    fn swap32(self) -> Self;
+    fn swap64(self) -> Self;
+}
 
-    pub trait u32x4<M: Machine>:
-        BitOps32
-        + Store<vec128_storage>
-        + ArithOps
-        + Vec4<u32>
-        + Words4
-        + LaneWords4
-        + StoreBytes
-        + MultiLane<[u32; 4]>
-        + Into<vec128_storage>
-    {
+pub trait u32x4<M: Machine>:
+    BitOps32
+    + Store<vec128_storage>
+    + ArithOps
+    + Vec4<u32>
+    + Words4
+    + LaneWords4
+    + StoreBytes
+    + MultiLane<[u32; 4]>
+    + Into<vec128_storage>
+{
 }
-    pub trait u64x2<M: Machine>:
-        BitOps64
-        + Store<vec128_storage>
-        + ArithOps
-        + Vec2<u64>
-        + MultiLane<[u64; 2]>
-        + Into<vec128_storage>
-    {
+pub trait u64x2<M: Machine>:
+    BitOps64 + Store<vec128_storage> + ArithOps + Vec2<u64> + MultiLane<[u64; 2]> + Into<vec128_storage>
+{
 }
-    pub trait u128x1<M: Machine>:
-        BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage>
-    {
+pub trait u128x1<M: Machine>:
+    BitOps128 + Store<vec128_storage> + Swap64 + MultiLane<[u128; 1]> + Into<vec128_storage>
+{
 }
 
-    pub trait u32x4x2<M: Machine>:
-        BitOps32
-        + Store<vec256_storage>
-        + Vec2<M::u32x4>
-        + MultiLane<[M::u32x4; 2]>
-        + ArithOps
-        + Into<vec256_storage>
-    {
+pub trait u32x4x2<M: Machine>:
+    BitOps32
+    + Store<vec256_storage>
+    + Vec2<M::u32x4>
+    + MultiLane<[M::u32x4; 2]>
+    + ArithOps
+    + Into<vec256_storage>
+    + StoreBytes
+{
 }
-    pub trait u64x2x2<M: Machine>:
-        BitOps64
-        + Store<vec256_storage>
-        + Vec2<M::u64x2>
-        + MultiLane<[M::u64x2; 2]>
-        + ArithOps
-        + StoreBytes
-        + Into<vec256_storage>
-    {
+pub trait u64x2x2<M: Machine>:
+    BitOps64
+    + Store<vec256_storage>
+    + Vec2<M::u64x2>
+    + MultiLane<[M::u64x2; 2]>
+    + ArithOps
+    + StoreBytes
+    + Into<vec256_storage>
+{
 }
-    pub trait u64x4<M: Machine>:
-        BitOps64
-        + Store<vec256_storage>
-        + Vec4<u64>
-        + MultiLane<[u64; 4]>
-        + ArithOps
-        + Words4
-        + StoreBytes
-        + Into<vec256_storage>
-    {
+pub trait u64x4<M: Machine>:
+    BitOps64
+    + Store<vec256_storage>
+    + Vec4<u64>
+    + MultiLane<[u64; 4]>
+    + ArithOps
+    + Words4
+    + StoreBytes
+    + Into<vec256_storage>
+{
 }
-    pub trait u128x2<M: Machine>:
-        BitOps128
-        + Store<vec256_storage>
-        + Vec2<M::u128x1>
-        + MultiLane<[M::u128x1; 2]>
-        + Swap64
-        + Into<vec256_storage>
-    {
+pub trait u128x2<M: Machine>:
+    BitOps128
+    + Store<vec256_storage>
+    + Vec2<M::u128x1>
+    + MultiLane<[M::u128x1; 2]>
+    + Swap64
+    + Into<vec256_storage>
+{
 }
 
-    pub trait u32x4x4<M: Machine>:
-        BitOps32
-        + Store<vec512_storage>
-        + Vec4<M::u32x4>
-        + MultiLane<[M::u32x4; 4]>
-        + ArithOps
-        + LaneWords4
-        + Into<vec512_storage>
-    {
+pub trait u32x4x4<M: Machine>:
+    BitOps32
+    + Store<vec512_storage>
+    + Vec4<M::u32x4>
+    + Vec4Ext<M::u32x4>
+    + Vector<[u32; 16]>
+    + MultiLane<[M::u32x4; 4]>
+    + ArithOps
+    + LaneWords4
+    + Into<vec512_storage>
+    + StoreBytes
+{
 }
-    pub trait u64x2x4<M: Machine>:
-        BitOps64
-        + Store<vec512_storage>
-        + Vec4<M::u64x2>
-        + MultiLane<[M::u64x2; 4]>
-        + ArithOps
-        + Into<vec512_storage>
-    {
+pub trait u64x2x4<M: Machine>:
+    BitOps64
+    + Store<vec512_storage>
+    + Vec4<M::u64x2>
+    + MultiLane<[M::u64x2; 4]>
+    + ArithOps
+    + Into<vec512_storage>
+{
 }
-    // TODO: Words4
-    pub trait u128x4<M: Machine>:
-        BitOps128
-        + Store<vec512_storage>
-        + Vec4<M::u128x1>
-        + MultiLane<[M::u128x1; 4]>
-        + Swap64
-        + Into<vec512_storage>
-    {
+// TODO: Words4
+pub trait u128x4<M: Machine>:
+    BitOps128
+    + Store<vec512_storage>
+    + Vec4<M::u128x1>
+    + MultiLane<[M::u128x1; 4]>
+    + Swap64
+    + Into<vec512_storage>
+{
 }
 
-    /// A vector composed of multiple 128-bit lanes.
-    pub trait MultiLane<Lanes> {
-        /// Split a multi-lane vector into single-lane vectors.
-        fn to_lanes(self) -> Lanes;
-        /// Build a multi-lane vector from individual lanes.
-        fn from_lanes(lanes: Lanes) -> Self;
-    }
+/// A vector composed of multiple 128-bit lanes.
+pub trait MultiLane<Lanes> {
+    /// Split a multi-lane vector into single-lane vectors.
+    fn to_lanes(self) -> Lanes;
+    /// Build a multi-lane vector from individual lanes.
+    fn from_lanes(lanes: Lanes) -> Self;
+}
 
-    /// Combine single vectors into a multi-lane vector.
-    pub trait VZip<V> {
-        fn vzip(self) -> V;
-    }
+/// Combine single vectors into a multi-lane vector.
+pub trait VZip<V> {
+    fn vzip(self) -> V;
+}
 
-    impl<V, T> VZip<V> for T
-    where
-        V: MultiLane<T>,
-    {
-        #[inline(always)]
-        fn vzip(self) -> V {
-            V::from_lanes(self)
-        }
+impl<V, T> VZip<V> for T
+where
+    V: MultiLane<T>,
+{
+    #[inline(always)]
+    fn vzip(self) -> V {
+        V::from_lanes(self)
     }
 }
-pub use self::types::*;
 
 pub trait Machine: Sized + Copy {
     type u32x4: u32x4<Self>;
@@ -264,15 +271,27 @@ pub trait Machine: Sized + Copy {
         unsafe { V::unsafe_read_be(input) }
     }
 
+    /// # Safety
+    /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+    /// environment.
     unsafe fn instance() -> Self;
 }
 
 pub trait Store<S> {
+    /// # Safety
+    /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+    /// environment.
     unsafe fn unpack(p: S) -> Self;
 }
 
 pub trait StoreBytes {
+    /// # Safety
+    /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+    /// environment.
     unsafe fn unsafe_read_le(input: &[u8]) -> Self;
+    /// # Safety
+    /// Caller must ensure the type of Self is appropriate for the hardware of the execution
+    /// environment.
     unsafe fn unsafe_read_be(input: &[u8]) -> Self;
     fn write_le(self, out: &mut [u8]);
     fn write_be(self, out: &mut [u8]);
diff --git a/utils-simd/ppv-lite86/src/x86_64/mod.rs b/utils-simd/ppv-lite86/src/x86_64/mod.rs
index 39d3b900..9d22c0d6 100644
--- a/utils-simd/ppv-lite86/src/x86_64/mod.rs
+++ b/utils-simd/ppv-lite86/src/x86_64/mod.rs
@@ -1,7 +1,8 @@
 // crate minimums: sse2, x86_64
 
-use core::arch::x86_64::{__m128i, __m256i};
 use crate::types::*;
+use core::arch::x86_64::{__m128i, __m256i};
+use zerocopy::{AsBytes, FromBytes, FromZeroes};
 
 mod sse2;
 
@@ -79,7 +80,7 @@ where
     type u64x2 = sse2::u64x2_sse2<YesS3, YesS4, NI>;
     type u128x1 = sse2::u128x1_sse2<YesS3, YesS4, NI>;
 
-    type u32x4x2 = sse2::u32x4x2_sse2<YesS3, YesS4, NI>;
+    type u32x4x2 = sse2::avx2::u32x4x2_avx2<NI>;
     type u64x2x2 = sse2::u64x2x2_sse2<YesS3, YesS4, NI>;
     type u64x4 = sse2::u64x4_sse2<YesS3, YesS4, NI>;
     type u128x2 = sse2::u128x2_sse2<YesS3, YesS4, NI>;
@@ -106,7 +107,8 @@ pub type AVX2 = Avx2Machine<NoNI>;
 /// Converting into and out of this type should be essentially free, although it may be more
 /// aligned than a particular impl requires.
 #[allow(non_camel_case_types)]
-#[derive(Copy, Clone)]
+#[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
+#[repr(C)]
 pub union vec128_storage {
     u32x4: [u32; 4],
     u64x2: [u64; 2],
@@ -119,16 +121,16 @@ impl Store<vec128_storage> for vec128_storage {
         p
     }
 }
-impl<'a> Into<&'a [u32; 4]> for &'a vec128_storage {
+impl<'a> From<&'a vec128_storage> for &'a [u32; 4] {
     #[inline(always)]
-    fn into(self) -> &'a [u32; 4] {
-        unsafe { &self.u32x4 }
+    fn from(x: &'a vec128_storage) -> Self {
+        unsafe { &x.u32x4 }
     }
 }
-impl Into<vec128_storage> for [u32; 4] {
+impl From<[u32; 4]> for vec128_storage {
     #[inline(always)]
-    fn into(self) -> vec128_storage {
-        vec128_storage { u32x4: self }
+    fn from(u32x4: [u32; 4]) -> Self {
+        vec128_storage { u32x4 }
     }
 }
 impl Default for vec128_storage {
@@ -137,6 +139,13 @@ impl Default for vec128_storage {
         vec128_storage { u128x1: [0] }
     }
 }
+impl Eq for vec128_storage {}
+impl PartialEq for vec128_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.u128x1 == rhs.u128x1 }
+    }
+}
 
 #[allow(non_camel_case_types)]
 #[derive(Copy, Clone)]
@@ -147,10 +156,10 @@ pub union vec256_storage {
     sse2: [vec128_storage; 2],
     avx: __m256i,
 }
-impl Into<vec256_storage> for [u64; 4] {
+impl From<[u64; 4]> for vec256_storage {
     #[inline(always)]
-    fn into(self) -> vec256_storage {
-        vec256_storage { u64x4: self }
+    fn from(u64x4: [u64; 4]) -> Self {
+        vec256_storage { u64x4 }
     }
 }
 impl Default for vec256_storage {
@@ -160,13 +169,22 @@ impl Default for vec256_storage {
     }
 }
 impl vec256_storage {
+    #[inline(always)]
     pub fn new128(xs: [vec128_storage; 2]) -> Self {
         Self { sse2: xs }
     }
+    #[inline(always)]
     pub fn split128(self) -> [vec128_storage; 2] {
         unsafe { self.sse2 }
     }
 }
+impl Eq for vec256_storage {}
+impl PartialEq for vec256_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.sse2 == rhs.sse2 }
+    }
+}
 
 #[allow(non_camel_case_types)]
 #[derive(Copy, Clone)]
@@ -186,20 +204,29 @@ impl Default for vec512_storage {
     }
 }
 impl vec512_storage {
+    #[inline(always)]
     pub fn new128(xs: [vec128_storage; 4]) -> Self {
         Self { sse2: xs }
     }
+    #[inline(always)]
     pub fn split128(self) -> [vec128_storage; 4] {
         unsafe { self.sse2 }
     }
 }
+impl Eq for vec512_storage {}
+impl PartialEq for vec512_storage {
+    #[inline(always)]
+    fn eq(&self, rhs: &Self) -> bool {
+        unsafe { self.avx == rhs.avx }
+    }
+}
 
 macro_rules! impl_into {
     ($storage:ident, $array:ty, $name:ident) => {
-        impl Into<$array> for $storage {
+        impl From<$storage> for $array {
             #[inline(always)]
-            fn into(self) -> $array {
-                unsafe { self.$name }
+            fn from(vec: $storage) -> Self {
+                unsafe { vec.$name }
             }
         }
     };
diff --git a/utils-simd/ppv-lite86/src/x86_64/sse2.rs b/utils-simd/ppv-lite86/src/x86_64/sse2.rs
index 81021a99..4b95911d 100644
--- a/utils-simd/ppv-lite86/src/x86_64/sse2.rs
+++ b/utils-simd/ppv-lite86/src/x86_64/sse2.rs
@@ -9,6 +9,7 @@ use core::marker::PhantomData;
 use core::ops::{
     Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Not,
 };
+use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes};
 
 macro_rules! impl_binop {
     ($vec:ident, $trait:ident, $fn:ident, $impl_fn:ident) => {
@@ -39,7 +40,8 @@ macro_rules! impl_binop_assign {
 macro_rules! def_vec {
     ($vec:ident, $word:ident) => {
         #[allow(non_camel_case_types)]
-        #[derive(Copy, Clone)]
+        #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
+        #[repr(transparent)]
         pub struct $vec<S3, S4, NI> {
             x: __m128i,
             s3: PhantomData<S3>,
@@ -166,49 +168,44 @@ macro_rules! impl_bitops128 {
 
 macro_rules! rotr_32_s3 {
     ($name:ident, $k0:expr, $k1:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-                _mm_shuffle_epi8(
-                    self.x,
-                    _mm_set_epi64x($k0, $k1),
-                )
-            })
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
         }
     };
 }
 macro_rules! rotr_32 {
     ($name:ident, $i:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-            _mm_or_si128(
-                _mm_srli_epi32(self.x, $i as i32),
-                _mm_slli_epi32(self.x, 32 - $i as i32),
-            )
-        })
-    }
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_epi32(self.x, $i as i32),
+                    _mm_slli_epi32(self.x, 32 - $i as i32),
+                )
+            })
+        }
     };
 }
 impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<YesS3, S4, NI> {
     rotr_32!(rotate_each_word_right7, 7);
     rotr_32_s3!(
         rotate_each_word_right8,
-        0x0c0f0e0d_080b0a09,
-        0x04070605_00030201
+        0x0c0f_0e0d_080b_0a09,
+        0x0407_0605_0003_0201
     );
     rotr_32!(rotate_each_word_right11, 11);
     rotr_32!(rotate_each_word_right12, 12);
     rotr_32_s3!(
         rotate_each_word_right16,
-        0x0d0c0f0e_09080b0a,
-        0x05040706_01000302
+        0x0d0c_0f0e_0908_0b0a,
+        0x0504_0706_0100_0302
     );
     rotr_32!(rotate_each_word_right20, 20);
     rotr_32_s3!(
         rotate_each_word_right24,
-        0x0e0d0c0f_0a09080b,
-        0x06050407_02010003
+        0x0e0d_0c0f_0a09_080b,
+        0x0605_0407_0201_0003
     );
     rotr_32!(rotate_each_word_right25, 25);
 }
@@ -228,28 +225,23 @@ impl<S4: Copy, NI: Copy> RotateEachWord32 for u32x4_sse2<NoS3, S4, NI> {
 
 macro_rules! rotr_64_s3 {
     ($name:ident, $k0:expr, $k1:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-                _mm_shuffle_epi8(
-                    self.x,
-                    _mm_set_epi64x($k0, $k1),
-                )
-            })
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe { _mm_shuffle_epi8(self.x, _mm_set_epi64x($k0, $k1)) })
         }
     };
 }
 macro_rules! rotr_64 {
     ($name:ident, $i:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-            _mm_or_si128(
-                _mm_srli_epi64(self.x, $i as i32),
-                _mm_slli_epi64(self.x, 64 - $i as i32),
-            )
-        })
-    }
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_epi64(self.x, $i as i32),
+                    _mm_slli_epi64(self.x, 64 - $i as i32),
+                )
+            })
+        }
     };
 }
 impl<S4: Copy, NI: Copy> RotateEachWord32 for u64x2_sse2<YesS3, S4, NI> {
@@ -296,15 +288,15 @@ impl<S3: Copy, S4: Copy, NI: Copy> RotateEachWord64 for u64x2_sse2<S3, S4, NI> {
 
 macro_rules! rotr_128 {
     ($name:ident, $i:expr) => {
-    #[inline(always)]
-    fn $name(self) -> Self {
-        Self::new(unsafe {
-            _mm_or_si128(
-                _mm_srli_si128(self.x, $i as i32),
-                _mm_slli_si128(self.x, 128 - $i as i32),
-            )
-        })
-    }
+        #[inline(always)]
+        fn $name(self) -> Self {
+            Self::new(unsafe {
+                _mm_or_si128(
+                    _mm_srli_si128(self.x, $i as i32),
+                    _mm_slli_si128(self.x, 128 - $i as i32),
+                )
+            })
+        }
     };
 }
 // TODO: completely unoptimized
@@ -411,7 +403,7 @@ impl<S3, S4, NI> MultiLane<[u128; 1]> for u128x1_sse2<S3, S4, NI> {
     }
     #[inline(always)]
     fn from_lanes(xs: [u128; 1]) -> Self {
-        unimplemented!()
+        unimplemented!("{:?}", xs)
     }
 }
 
@@ -780,7 +772,7 @@ impl<S4, NI> BSwap for u128x1_sse2<YesS3, S4, NI> {
 impl<S4, NI> BSwap for u128x1_sse2<NoS3, S4, NI> {
     #[inline(always)]
     fn bswap(self) -> Self {
-        Self::new(unsafe { unimplemented!() })
+        unimplemented!()
     }
 }
 
@@ -890,6 +882,13 @@ pub type u64x2x4_sse2<S3, S4, NI> = x4<u64x2_sse2<S3, S4, NI>>;
 #[allow(non_camel_case_types)]
 pub type u128x4_sse2<S3, S4, NI> = x4<u128x1_sse2<S3, S4, NI>>;
 
+impl<S3, S4, NI> Vector<[u32; 16]> for u32x4x4_sse2<S3, S4, NI> {
+    #[inline(always)]
+    fn to_scalars(self) -> [u32; 16] {
+        transmute!(self)
+    }
+}
+
 impl<S3: Copy, S4: Copy, NI: Copy> u32x4x2<Machine86<S3, S4, NI>> for u32x4x2_sse2<S3, S4, NI>
 where
     u32x4_sse2<S3, S4, NI>: RotateEachWord32 + BSwap,
@@ -993,6 +992,8 @@ where
     Machine86<S3, S4, NI>: Machine,
     u32x4x4_sse2<S3, S4, NI>: MultiLane<[<Machine86<S3, S4, NI> as Machine>::u32x4; 4]>,
     u32x4x4_sse2<S3, S4, NI>: Vec4<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+    u32x4x4_sse2<S3, S4, NI>: Vec4Ext<<Machine86<S3, S4, NI> as Machine>::u32x4>,
+    u32x4x4_sse2<S3, S4, NI>: Vector<[u32; 16]>,
 {
 }
 impl<S3: Copy, S4: Copy, NI: Copy> u64x2x4<Machine86<S3, S4, NI>> for u64x2x4_sse2<S3, S4, NI>
@@ -1014,14 +1015,6 @@ where
 {
 }
 
-impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_sse2<YesS3, YesS4, NI>
-where
-    u32x4_sse2<YesS3, YesS4, NI>: RotateEachWord32 + BSwap,
-    Avx2Machine<NI>: Machine,
-    u32x4x4_sse2<YesS3, YesS4, NI>: MultiLane<[<Avx2Machine<NI> as Machine>::u32x4; 4]>,
-    u32x4x4_sse2<YesS3, YesS4, NI>: Vec4<<Avx2Machine<NI> as Machine>::u32x4>,
-{
-}
 impl<NI: Copy> u64x2x4<Avx2Machine<NI>> for u64x2x4_sse2<YesS3, YesS4, NI>
 where
     u64x2_sse2<YesS3, YesS4, NI>: RotateEachWord64 + RotateEachWord32 + BSwap,
@@ -1078,6 +1071,7 @@ impl<W: PartialEq, G> PartialEq for x2<W, G> {
     }
 }
 
+#[allow(unused)]
 #[inline(always)]
 unsafe fn eq128_s4(x: __m128i, y: __m128i) -> bool {
     let q = _mm_shuffle_epi32(_mm_cmpeq_epi64(x, y), 0b1100_0110);
@@ -1136,13 +1130,14 @@ where
 }
 
 #[cfg(test)]
+#[cfg(target_arch = "x86_64")]
 mod test {
     use super::*;
     use crate::x86_64::{SSE2, SSE41, SSSE3};
     use crate::Machine;
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
     fn test_bswap32_s2_vs_s3() {
         let xs = [0x0f0e_0d0c, 0x0b0a_0908, 0x0706_0504, 0x0302_0100];
         let ys = [0x0c0d_0e0f, 0x0809_0a0b, 0x0405_0607, 0x0001_0203];
@@ -1160,12 +1155,12 @@ mod test {
             x_s3.bswap()
         };
 
-        assert_eq!(x_s2, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s2, transmute!(x_s3));
         assert_eq!(x_s2, s2.vec(ys));
     }
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
     fn test_bswap64_s2_vs_s3() {
         let xs = [0x0f0e_0d0c_0b0a_0908, 0x0706_0504_0302_0100];
         let ys = [0x0809_0a0b_0c0d_0e0f, 0x0001_0203_0405_0607];
@@ -1184,11 +1179,11 @@ mod test {
         };
 
         assert_eq!(x_s2, s2.vec(ys));
-        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s3, transmute!(x_s3));
     }
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
     fn test_shuffle32_s2_vs_s3() {
         let xs = [0x0, 0x1, 0x2, 0x3];
         let ys = [0x2, 0x3, 0x0, 0x1];
@@ -1206,7 +1201,7 @@ mod test {
             x_s3.shuffle2301()
         };
         assert_eq!(x_s2, s2.vec(ys));
-        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s3, transmute!(x_s3));
 
         let x_s2 = {
             let x_s2: <SSE2 as Machine>::u32x4 = s2.vec(xs);
@@ -1217,16 +1212,16 @@ mod test {
             x_s3.shuffle3012()
         };
         assert_eq!(x_s2, s2.vec(zs));
-        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s3, transmute!(x_s3));
 
         let x_s2 = x_s2.shuffle1230();
         let x_s3 = x_s3.shuffle1230();
         assert_eq!(x_s2, s2.vec(xs));
-        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s3, transmute!(x_s3));
     }
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
+    #[cfg_attr(not(target_feature = "ssse3"), ignore)]
     fn test_shuffle64_s2_vs_s3() {
         let xs = [0x0, 0x1, 0x2, 0x3];
         let ys = [0x2, 0x3, 0x0, 0x1];
@@ -1244,7 +1239,7 @@ mod test {
             x_s3.shuffle2301()
         };
         assert_eq!(x_s2, s2.vec(ys));
-        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s3, transmute!(x_s3));
 
         let x_s2 = {
             let x_s2: <SSE2 as Machine>::u64x4 = s2.vec(xs);
@@ -1255,16 +1250,16 @@ mod test {
             x_s3.shuffle3012()
         };
         assert_eq!(x_s2, s2.vec(zs));
-        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s3, transmute!(x_s3));
 
         let x_s2 = x_s2.shuffle1230();
         let x_s3 = x_s3.shuffle1230();
         assert_eq!(x_s2, s2.vec(xs));
-        assert_eq!(x_s3, unsafe { core::mem::transmute(x_s3) });
+        assert_eq!(x_s3, transmute!(x_s3));
     }
 
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
     #[test]
-    #[cfg(target_arch = "x86_64")]
     fn test_lanes_u32x4() {
         let xs = [0x1, 0x2, 0x3, 0x4];
 
@@ -1295,7 +1290,7 @@ mod test {
     }
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
     fn test_lanes_u64x2() {
         let xs = [0x1, 0x2];
 
@@ -1326,7 +1321,6 @@ mod test {
     }
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
     fn test_vec4_u32x4_s2() {
         let xs = [1, 2, 3, 4];
         let s2 = unsafe { SSE2::instance() };
@@ -1342,7 +1336,7 @@ mod test {
     }
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
     fn test_vec4_u32x4_s4() {
         let xs = [1, 2, 3, 4];
         let s4 = unsafe { SSE41::instance() };
@@ -1358,7 +1352,6 @@ mod test {
     }
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
     fn test_vec2_u64x2_s2() {
         let xs = [0x1, 0x2];
         let s2 = unsafe { SSE2::instance() };
@@ -1370,7 +1363,7 @@ mod test {
     }
 
     #[test]
-    #[cfg(target_arch = "x86_64")]
+    #[cfg_attr(not(all(target_feature = "ssse3", target_feature = "sse4.1")), ignore)]
     fn test_vec4_u64x2_s4() {
         let xs = [0x1, 0x2];
         let s4 = unsafe { SSE41::instance() };
@@ -1384,65 +1377,80 @@ mod test {
 
 pub mod avx2 {
     #![allow(non_camel_case_types)]
-    use crate::soft::x4;
+    use crate::soft::{x2, x4};
     use crate::types::*;
-    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2};
+    use crate::x86_64::sse2::{u128x1_sse2, u32x4_sse2, G0};
     use crate::x86_64::{vec256_storage, vec512_storage, Avx2Machine, YesS3, YesS4};
     use core::arch::x86_64::*;
     use core::marker::PhantomData;
     use core::ops::*;
+    use zerocopy::{transmute, AsBytes, FromBytes, FromZeroes};
 
-    #[derive(Copy, Clone)]
-    pub struct u32x4x4_avx2<NI> {
-        x: [__m256i; 2],
+    #[derive(Copy, Clone, FromBytes, AsBytes, FromZeroes)]
+    #[repr(transparent)]
+    pub struct u32x4x2_avx2<NI> {
+        x: __m256i,
         ni: PhantomData<NI>,
     }
 
-    impl<NI> u32x4x4_avx2<NI> {
+    impl<NI> u32x4x2_avx2<NI> {
         #[inline(always)]
-        fn new(x: [__m256i; 2]) -> Self {
+        fn new(x: __m256i) -> Self {
             Self { x, ni: PhantomData }
         }
     }
 
-    impl<NI> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> where NI: Copy {}
-    impl<NI> Store<vec512_storage> for u32x4x4_avx2<NI> {
+    impl<NI> u32x4x2<Avx2Machine<NI>> for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> Store<vec256_storage> for u32x4x2_avx2<NI> {
         #[inline(always)]
-        unsafe fn unpack(p: vec512_storage) -> Self {
-            Self::new([p.avx[0].avx, p.avx[1].avx])
+        unsafe fn unpack(p: vec256_storage) -> Self {
+            Self::new(p.avx)
         }
     }
-    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
+    impl<NI> StoreBytes for u32x4x2_avx2<NI> {
         #[inline(always)]
-        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
+        unsafe fn unsafe_read_le(input: &[u8]) -> Self {
+            assert_eq!(input.len(), 32);
+            Self::new(_mm256_loadu_si256(input.as_ptr() as *const _))
+        }
+        #[inline(always)]
+        unsafe fn unsafe_read_be(input: &[u8]) -> Self {
+            Self::unsafe_read_le(input).bswap()
+        }
+        #[inline(always)]
+        fn write_le(self, out: &mut [u8]) {
             unsafe {
-                [
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
-                    u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
-                ]
+                assert_eq!(out.len(), 32);
+                _mm256_storeu_si256(out.as_mut_ptr() as *mut _, self.x)
             }
         }
         #[inline(always)]
-        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
-            Self::new(unsafe {
+        fn write_be(self, out: &mut [u8]) {
+            self.bswap().write_le(out)
+        }
+    }
+    impl<NI> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 2]> for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 2] {
+            unsafe {
                 [
-                    _mm256_setr_m128i(x[0].x, x[1].x),
-                    _mm256_setr_m128i(x[2].x, x[3].x),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
+                    u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                 ]
-            })
+            }
+        }
+        #[inline(always)]
+        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 2]) -> Self {
+            Self::new(unsafe { _mm256_setr_m128i(x[0].x, x[1].x) })
         }
     }
-    impl<NI> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+    impl<NI> Vec2<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x2_avx2<NI> {
         #[inline(always)]
         fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
             unsafe {
                 match i {
-                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 0)),
-                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[0], 1)),
-                    2 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 0)),
-                    3 => u32x4_sse2::new(_mm256_extracti128_si256(self.x[1], 1)),
+                    0 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 0)),
+                    1 => u32x4_sse2::new(_mm256_extracti128_si256(self.x, 1)),
                     _ => panic!(),
                 }
             }
@@ -1451,61 +1459,21 @@ pub mod avx2 {
         fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
             Self::new(unsafe {
                 match i {
-                    0 => [_mm256_inserti128_si256(self.x[0], w.x, 0), self.x[1]],
-                    1 => [_mm256_inserti128_si256(self.x[0], w.x, 1), self.x[1]],
-                    2 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 0)],
-                    3 => [self.x[0], _mm256_inserti128_si256(self.x[1], w.x, 1)],
+                    0 => _mm256_inserti128_si256(self.x, w.x, 0),
+                    1 => _mm256_inserti128_si256(self.x, w.x, 1),
                     _ => panic!(),
                 }
             })
         }
     }
-    impl<NI> LaneWords4 for u32x4x4_avx2<NI> {
-        #[inline(always)]
-        fn shuffle_lane_words1230(self) -> Self {
-            Self::new(unsafe {
-                [
-                    _mm256_shuffle_epi32(self.x[0], 0b1001_0011),
-                    _mm256_shuffle_epi32(self.x[1], 0b1001_0011),
-                ]
-            })
-        }
-        #[inline(always)]
-        fn shuffle_lane_words2301(self) -> Self {
-            Self::new(unsafe {
-                [
-                    _mm256_shuffle_epi32(self.x[0], 0b0100_1110),
-                    _mm256_shuffle_epi32(self.x[1], 0b0100_1110),
-                ]
-            })
-        }
-        #[inline(always)]
-        fn shuffle_lane_words3012(self) -> Self {
-            Self::new(unsafe {
-                [
-                    _mm256_shuffle_epi32(self.x[0], 0b0011_1001),
-                    _mm256_shuffle_epi32(self.x[1], 0b0011_1001),
-                ]
-            })
-        }
-    }
-    impl<NI> BitOps32 for u32x4x4_avx2<NI> where NI: Copy {}
-    impl<NI> ArithOps for u32x4x4_avx2<NI> where NI: Copy {}
+    impl<NI> BitOps32 for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> ArithOps for u32x4x2_avx2<NI> where NI: Copy {}
     macro_rules! shuf_lane_bytes {
         ($name:ident, $k0:expr, $k1:expr) => {
-        #[inline(always)]
-        fn $name(self) -> Self {
-            Self::new(unsafe {
-                [
-                    _mm256_shuffle_epi8(
-                        self.x[0],
-                        _mm256_set_epi64x($k0, $k1, $k0, $k1),
-                    ),
-                    _mm256_shuffle_epi8(
-                        self.x[1],
-                        _mm256_set_epi64x($k0, $k1, $k0, $k1),
-                    )
-                ]
+            #[inline(always)]
+            fn $name(self) -> Self {
+                Self::new(unsafe {
+                    _mm256_shuffle_epi8(self.x, _mm256_set_epi64x($k0, $k1, $k0, $k1))
                 })
             }
         };
@@ -1515,52 +1483,41 @@ pub mod avx2 {
             #[inline(always)]
             fn $name(self) -> Self {
                 Self::new(unsafe {
-                    [
-                        _mm256_or_si256(
-                            _mm256_srli_epi32(self.x[0], $i as i32),
-                            _mm256_slli_epi32(self.x[0], 32 - $i as i32),
-                        ),
-                        _mm256_or_si256(
-                            _mm256_srli_epi32(self.x[1], $i as i32),
-                            _mm256_slli_epi32(self.x[1], 32 - $i as i32),
-                        )
-                    ]
+                    _mm256_or_si256(
+                        _mm256_srli_epi32(self.x, $i as i32),
+                        _mm256_slli_epi32(self.x, 32 - $i as i32),
+                    )
                 })
             }
         };
     }
-    impl<NI: Copy> RotateEachWord32 for u32x4x4_avx2<NI> {
+    impl<NI: Copy> RotateEachWord32 for u32x4x2_avx2<NI> {
         rotr_32!(rotate_each_word_right7, 7);
         shuf_lane_bytes!(
             rotate_each_word_right8,
-            0x0c0f0e0d_080b0a09,
-            0x04070605_00030201
+            0x0c0f_0e0d_080b_0a09,
+            0x0407_0605_0003_0201
         );
         rotr_32!(rotate_each_word_right11, 11);
         rotr_32!(rotate_each_word_right12, 12);
         shuf_lane_bytes!(
             rotate_each_word_right16,
-            0x0d0c0f0e_09080b0a,
-            0x05040706_01000302
+            0x0d0c_0f0e_0908_0b0a,
+            0x0504_0706_0100_0302
         );
         rotr_32!(rotate_each_word_right20, 20);
         shuf_lane_bytes!(
             rotate_each_word_right24,
-            0x0e0d0c0f_0a09080b,
-            0x06050407_02010003
+            0x0e0d_0c0f_0a09_080b,
+            0x0605_0407_0201_0003
         );
         rotr_32!(rotate_each_word_right25, 25);
     }
-    impl<NI> BitOps0 for u32x4x4_avx2<NI> where NI: Copy {}
-    impl<NI> From<u32x4x4_avx2<NI>> for vec512_storage {
+    impl<NI> BitOps0 for u32x4x2_avx2<NI> where NI: Copy {}
+    impl<NI> From<u32x4x2_avx2<NI>> for vec256_storage {
         #[inline(always)]
-        fn from(x: u32x4x4_avx2<NI>) -> Self {
-            Self {
-                avx: [
-                    vec256_storage { avx: x.x[0] },
-                    vec256_storage { avx: x.x[1] },
-                ],
-            }
+        fn from(x: u32x4x2_avx2<NI>) -> Self {
+            Self { avx: x.x }
         }
     }
 
@@ -1577,55 +1534,172 @@ pub mod avx2 {
             }
         };
     }
-    impl_assign!(u32x4x4_avx2, BitXorAssign, bitxor_assign, bitxor);
-    impl_assign!(u32x4x4_avx2, BitOrAssign, bitor_assign, bitor);
-    impl_assign!(u32x4x4_avx2, BitAndAssign, bitand_assign, bitand);
-    impl_assign!(u32x4x4_avx2, AddAssign, add_assign, add);
+    impl_assign!(u32x4x2_avx2, BitXorAssign, bitxor_assign, bitxor);
+    impl_assign!(u32x4x2_avx2, BitOrAssign, bitor_assign, bitor);
+    impl_assign!(u32x4x2_avx2, BitAndAssign, bitand_assign, bitand);
+    impl_assign!(u32x4x2_avx2, AddAssign, add_assign, add);
 
-    macro_rules! impl_bitop_x2 {
+    macro_rules! impl_bitop {
         ($vec:ident, $Op:ident, $op_fn:ident, $impl_fn:ident) => {
             impl<NI> $Op for $vec<NI> {
                 type Output = Self;
                 #[inline(always)]
                 fn $op_fn(self, rhs: Self) -> Self::Output {
-                    Self::new(unsafe {
-                        [$impl_fn(self.x[0], rhs.x[0]), $impl_fn(self.x[1], rhs.x[1])]
-                    })
+                    Self::new(unsafe { $impl_fn(self.x, rhs.x) })
                 }
             }
         };
     }
-    impl_bitop_x2!(u32x4x4_avx2, BitXor, bitxor, _mm256_xor_si256);
-    impl_bitop_x2!(u32x4x4_avx2, BitOr, bitor, _mm256_or_si256);
-    impl_bitop_x2!(u32x4x4_avx2, BitAnd, bitand, _mm256_and_si256);
-    impl_bitop_x2!(u32x4x4_avx2, AndNot, andnot, _mm256_andnot_si256);
-    impl_bitop_x2!(u32x4x4_avx2, Add, add, _mm256_add_epi32);
+    impl_bitop!(u32x4x2_avx2, BitXor, bitxor, _mm256_xor_si256);
+    impl_bitop!(u32x4x2_avx2, BitOr, bitor, _mm256_or_si256);
+    impl_bitop!(u32x4x2_avx2, BitAnd, bitand, _mm256_and_si256);
+    impl_bitop!(u32x4x2_avx2, AndNot, andnot, _mm256_andnot_si256);
+    impl_bitop!(u32x4x2_avx2, Add, add, _mm256_add_epi32);
 
-    impl<NI> Not for u32x4x4_avx2<NI> {
+    impl<NI> Not for u32x4x2_avx2<NI> {
         type Output = Self;
         #[inline(always)]
         fn not(self) -> Self::Output {
             unsafe {
                 let f = _mm256_set1_epi8(-0x7f);
-                Self::new([f, f]) ^ self
+                Self::new(f) ^ self
             }
         }
     }
 
-    impl<NI> BSwap for u32x4x4_avx2<NI> {
+    impl<NI> BSwap for u32x4x2_avx2<NI> {
         shuf_lane_bytes!(bswap, 0x0c0d_0e0f_0809_0a0b, 0x0405_0607_0001_0203);
     }
 
-    impl<NI> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI>
+    impl<NI> From<x2<u128x1_sse2<YesS3, YesS4, NI>, G0>> for u32x4x2_avx2<NI>
     where
         NI: Copy,
     {
+        #[inline(always)]
+        fn from(x: x2<u128x1_sse2<YesS3, YesS4, NI>, G0>) -> Self {
+            Self::new(unsafe { _mm256_setr_m128i(x.0[0].x, x.0[1].x) })
+        }
+    }
+
+    impl<NI> LaneWords4 for u32x4x2_avx2<NI> {
+        #[inline(always)]
+        fn shuffle_lane_words1230(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b1001_0011) })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words2301(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0100_1110) })
+        }
+        #[inline(always)]
+        fn shuffle_lane_words3012(self) -> Self {
+            Self::new(unsafe { _mm256_shuffle_epi32(self.x, 0b0011_1001) })
+        }
+    }
+
+    ///////////////////////////////////////////////////////////////////////////////////////////
+
+    pub type u32x4x4_avx2<NI> = x2<u32x4x2_avx2<NI>, G0>;
+    impl<NI: Copy> u32x4x4<Avx2Machine<NI>> for u32x4x4_avx2<NI> {}
+
+    impl<NI: Copy> Store<vec512_storage> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        unsafe fn unpack(p: vec512_storage) -> Self {
+            Self::new([
+                u32x4x2_avx2::unpack(p.avx[0]),
+                u32x4x2_avx2::unpack(p.avx[1]),
+            ])
+        }
+    }
+    impl<NI: Copy> MultiLane<[u32x4_sse2<YesS3, YesS4, NI>; 4]> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn to_lanes(self) -> [u32x4_sse2<YesS3, YesS4, NI>; 4] {
+            let [a, b] = self.0[0].to_lanes();
+            let [c, d] = self.0[1].to_lanes();
+            [a, b, c, d]
+        }
+        #[inline(always)]
+        fn from_lanes(x: [u32x4_sse2<YesS3, YesS4, NI>; 4]) -> Self {
+            let ab = u32x4x2_avx2::from_lanes([x[0], x[1]]);
+            let cd = u32x4x2_avx2::from_lanes([x[2], x[3]]);
+            Self::new([ab, cd])
+        }
+    }
+    impl<NI: Copy> Vec4<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn extract(self, i: u32) -> u32x4_sse2<YesS3, YesS4, NI> {
+            match i {
+                0 => self.0[0].extract(0),
+                1 => self.0[0].extract(1),
+                2 => self.0[1].extract(0),
+                3 => self.0[1].extract(1),
+                _ => panic!(),
+            }
+        }
+        #[inline(always)]
+        fn insert(self, w: u32x4_sse2<YesS3, YesS4, NI>, i: u32) -> Self {
+            Self::new(match i {
+                0 | 1 => [self.0[0].insert(w, i), self.0[1]],
+                2 | 3 => [self.0[0], self.0[1].insert(w, i - 2)],
+                _ => panic!(),
+            })
+        }
+    }
+    impl<NI: Copy> Vec4Ext<u32x4_sse2<YesS3, YesS4, NI>> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn transpose4(a: Self, b: Self, c: Self, d: Self) -> (Self, Self, Self, Self) {
+            /*
+             * a00:a01 a10:a11
+             * b00:b01 b10:b11
+             * c00:c01 c10:c11
+             * d00:d01 d10:d11
+             *       =>
+             * a00:b00 c00:d00
+             * a01:b01 c01:d01
+             * a10:b10 c10:d10
+             * a11:b11 c11:d11
+             */
+            unsafe {
+                let ab00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x20));
+                let ab01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[0].x, b.0[0].x, 0x31));
+                let ab10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x20));
+                let ab11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(a.0[1].x, b.0[1].x, 0x31));
+                let cd00 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x20));
+                let cd01 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[0].x, d.0[0].x, 0x31));
+                let cd10 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x20));
+                let cd11 = u32x4x2_avx2::new(_mm256_permute2x128_si256(c.0[1].x, d.0[1].x, 0x31));
+                (
+                    Self::new([ab00, cd00]),
+                    Self::new([ab01, cd01]),
+                    Self::new([ab10, cd10]),
+                    Self::new([ab11, cd11]),
+                )
+            }
+        }
+    }
+    impl<NI: Copy> Vector<[u32; 16]> for u32x4x4_avx2<NI> {
+        #[inline(always)]
+        fn to_scalars(self) -> [u32; 16] {
+            transmute!(self)
+        }
+    }
+    impl<NI: Copy> From<u32x4x4_avx2<NI>> for vec512_storage {
+        #[inline(always)]
+        fn from(x: u32x4x4_avx2<NI>) -> Self {
+            Self {
+                avx: [
+                    vec256_storage { avx: x.0[0].x },
+                    vec256_storage { avx: x.0[1].x },
+                ],
+            }
+        }
+    }
+    impl<NI: Copy> From<x4<u128x1_sse2<YesS3, YesS4, NI>>> for u32x4x4_avx2<NI> {
         #[inline(always)]
         fn from(x: x4<u128x1_sse2<YesS3, YesS4, NI>>) -> Self {
             Self::new(unsafe {
                 [
-                    _mm256_setr_m128i(x.0[0].x, x.0[1].x),
-                    _mm256_setr_m128i(x.0[2].x, x.0[3].x),
+                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[0].x, x.0[1].x)),
+                    u32x4x2_avx2::new(_mm256_setr_m128i(x.0[2].x, x.0[3].x)),
                 ]
             })
         }
diff --git a/utils-simd/ppv-null/Cargo.toml b/utils-simd/ppv-null/Cargo.toml
index ce778a74..d18b096e 100644
--- a/utils-simd/ppv-null/Cargo.toml
+++ b/utils-simd/ppv-null/Cargo.toml
@@ -2,12 +2,13 @@
 name = "ppv-null"
 version = "0.2.0"
 authors = ["The CryptoCorrosion Contributors"]
-edition = "2018"
+edition = "2021"
 license = "MIT/Apache-2.0"
 description = "Safe, portable, non-SIMD implementation of the crypto-simd API"
 repository = "https://github.com/cryptocorrosion/cryptocorrosion"
 keywords = ["crypto", "simd"]
 categories = ["cryptography", "no-std"]
+rust-version = "1.61"
 
 [dependencies]
 crypto-simd = "0.1"