From 7e4631ba6143b2495d6e66ba1703c7c72b419452 Mon Sep 17 00:00:00 2001
From: Elabajaba <Elabajaba@users.noreply.github.com>
Date: Sat, 14 Dec 2024 01:25:47 -0500
Subject: [PATCH] WIP port to wide from std-simd

---
 Cargo.toml             |   7 +-
 rust-toolchain         |   2 +-
 src/animation.rs       |  41 +++++-----
 src/blending_job.rs    |  19 ++---
 src/ik_aim_job.rs      |  13 ++--
 src/ik_two_bone_job.rs |  21 +++---
 src/lib.rs             |   2 -
 src/math.rs            | 166 ++++++++++++++++++++---------------------
 src/sampling_job.rs    |  13 ++--
 src/skeleton.rs        |  42 +++++------
 10 files changed, 163 insertions(+), 163 deletions(-)

diff --git a/Cargo.toml b/Cargo.toml
index 659d7a4..95be788 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,17 +19,22 @@ serde = ["dep:serde", "glam/serde", "bimap/serde" ]
 rkyv = ["dep:rkyv", "dep:bytecheck", "glam/rkyv", "glam/bytecheck"]
 wasm = []
 nodejs = ["wasm", "dep:js-sys", "dep:wasm-bindgen"]
+archive_le = []
+archive_be = []
 
 [dependencies]
 bimap = { version = "0.6" }
 bytecheck = { version = "0.6", optional = true, default-features = false }
-glam = { version = "0.27", features = [ "core-simd", "libm" ] }
+glam = { version = "0.27", features = [ "libm" ] }
+# glam's "core-simd" feature needs nightly std::simd; to re-enable: features = [ "core-simd", "libm" ]
 js-sys = { version = "0.3", optional = true }
 rkyv = { version = "0.7", optional = true, features = [ "validation" ] }
 serde = { version= "1.0", optional = true, features = [ "serde_derive" ] }
 static_assertions = "1.1"
 thiserror = "1.0"
 wasm-bindgen = { version = "0.2", optional = true }
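+# TODO(review): pinned to a git revision for now; return to a crates.io release (was: wide = "0.7.30") when possible.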
+wide = { git = "https://github.com/Lokathor/wide.git", rev = "0f15f92bff68e36bf60a7305d9201873dd26d6b9" }
 
 [dev-dependencies]
 getrandom = { version = "0.2", features = ["js"] }
diff --git a/rust-toolchain b/rust-toolchain
index 07ade69..5e49e6b 100644
--- a/rust-toolchain
+++ b/rust-toolchain
@@ -1 +1 @@
-nightly
\ No newline at end of file
+1.83
\ No newline at end of file
diff --git a/src/animation.rs b/src/animation.rs
index 26b00dd..16a539e 100644
--- a/src/animation.rs
+++ b/src/animation.rs
@@ -5,9 +5,8 @@
 use glam::{Quat, Vec3, Vec4};
 use std::alloc::{self, Layout};
 use std::io::Read;
-use std::simd::prelude::*;
-use std::simd::*;
 use std::{mem, slice};
+use wide::{f32x4, i32x4};
 
 use crate::archive::{Archive, ArchiveRead};
 use crate::base::{align_ptr, align_usize, OzzError};
@@ -102,15 +101,15 @@ impl QuaternionKey {
         k3: &QuaternionKey,
         soa: &mut SoaQuat,
     ) {
-        const MASK_F000:i32x4 = i32x4::from_array([-1i32, 0, 0, 0]);
-        const MASK_0F00:i32x4 = i32x4::from_array([0, -1i32, 0, 0]);
-        const MASK_00F0:i32x4 = i32x4::from_array([0, 0, -1i32, 0]);
-        const MASK_000F:i32x4 = i32x4::from_array([0, 0, 0, -1i32]);
+        const MASK_F000:i32x4 = i32x4::new([-1i32, 0, 0, 0]);
+        const MASK_0F00:i32x4 = i32x4::new([0, -1i32, 0, 0]);
+        const MASK_00F0:i32x4 = i32x4::new([0, 0, -1i32, 0]);
+        const MASK_000F:i32x4 = i32x4::new([0, 0, 0, -1i32]);
 
         const MAPPING: [[usize; 4]; 4] = [[0, 0, 1, 2], [0, 0, 1, 2], [0, 1, 0, 2], [0, 1, 2, 0]];
 
-        const SCALE: f32x4 = f32x4::from_array([core::f32::consts::SQRT_2 / 32767.0; 4]);
-        const OFFSET: f32x4 = f32x4::from_array([-core::f32::consts::SQRT_2 / 2.0; 4]);
+        const SCALE: f32x4 = f32x4::new([core::f32::consts::SQRT_2 / 32767.0; 4]);
+        const OFFSET: f32x4 = f32x4::new([-core::f32::consts::SQRT_2 / 2.0; 4]);
 
         let (largest0, sign0, value0) = k0.unpack();
         let (largest1, sign1, value1) = k1.unpack();
@@ -123,10 +122,10 @@ impl QuaternionKey {
         let m3 = &MAPPING[largest3 as usize];
 
         let cmp_keys: [f32x4; 4] = [
-            f32x4::from_array([ value0[m0[0]] as f32, value1[m1[0]] as f32, value2[m2[0]] as f32, value3[m3[0]] as f32 ]),
-            f32x4::from_array([ value0[m0[1]] as f32, value1[m1[1]] as f32, value2[m2[1]] as f32, value3[m3[1]] as f32 ]),
-            f32x4::from_array([ value0[m0[2]] as f32, value1[m1[2]] as f32, value2[m2[2]] as f32, value3[m3[2]] as f32 ]),
-            f32x4::from_array([ value0[m0[3]] as f32, value1[m1[3]] as f32, value2[m2[3]] as f32, value3[m3[3]] as f32 ]),
+            f32x4::new([ value0[m0[0]] as f32, value1[m1[0]] as f32, value2[m2[0]] as f32, value3[m3[0]] as f32 ]),
+            f32x4::new([ value0[m0[1]] as f32, value1[m1[1]] as f32, value2[m2[1]] as f32, value3[m3[1]] as f32 ]),
+            f32x4::new([ value0[m0[2]] as f32, value1[m1[2]] as f32, value2[m2[2]] as f32, value3[m3[2]] as f32 ]),
+            f32x4::new([ value0[m0[3]] as f32, value1[m1[3]] as f32, value2[m2[3]] as f32, value3[m3[3]] as f32 ]),
         ]; // TODO: simd int to float
 
         let mut cpnt = [
@@ -141,9 +140,9 @@ impl QuaternionKey {
         cpnt[largest3 as usize] = fx4(ix4(cpnt[largest3 as usize]) & !MASK_000F);
 
         let dot = cpnt[0] * cpnt[0] + cpnt[1] * cpnt[1] + cpnt[2] * cpnt[2] + cpnt[3] * cpnt[3];
-        let ww0 =  f32x4::simd_max(ZERO, ONE - dot); // prevent NaN, different from C++ code
+        let ww0 =  f32x4::fast_max(ZERO, ONE - dot); // prevent NaN, different from C++ code
         let w0 = ww0.sqrt();
-        let sign = i32x4::from_array([sign0 as i32, sign1 as i32, sign2 as i32, sign3 as i32]) << 31;
+        let sign = i32x4::new([sign0 as i32, sign1 as i32, sign2 as i32, sign3 as i32]) << 31;
         let restored = ix4(w0) | sign;
 
         cpnt[largest0 as usize] = fx4(ix4(cpnt[largest0 as usize]) | (restored & MASK_F000));
@@ -1116,14 +1115,14 @@ mod tests {
         assert_eq!(
             soa,
             SoaVec3 {
-                x: f32x4::from_array([0.0711059570, 0.0251312255859375, 0.0711059570, 0.0251312255859375]),
-                y: f32x4::from_array([
+                x: f32x4::new([0.0711059570, 0.0251312255859375, 0.0711059570, 0.0251312255859375]),
+                y: f32x4::new([
                     -8.77380371e-05,
                     5.960464477539063e-8,
                     -8.77380371e-05,
                     5.960464477539063e-8
                 ]),
-                z: f32x4::from_array([1.84774399e-06, 0.0, 1.84774399e-06, 0.0]),
+                z: f32x4::new([1.84774399e-06, 0.0, 1.84774399e-06, 0.0]),
             }
         );
     }
@@ -1172,10 +1171,10 @@ mod tests {
         assert_eq!(
             soa,
             SoaQuat {
-                x: f32x4::from_array([-0.491480947, -0.498861253, -0.00912827253, 0.00852406025]),
-                y: f32x4::from_array([-0.508615375, -0.501123607, 0.0251405239, 0.00882613659]),
-                z: f32x4::from_array([-0.538519204, -0.498861253, -0.0326502919, 0.00610709190]),
-                w: f32x4::from_array([0.457989037, 0.501148760, 0.999108911, 0.999906063]),
+                x: f32x4::new([-0.491480947, -0.498861253, -0.00912827253, 0.00852406025]),
+                y: f32x4::new([-0.508615375, -0.501123607, 0.0251405239, 0.00882613659]),
+                z: f32x4::new([-0.538519204, -0.498861253, -0.0326502919, 0.00610709190]),
+                w: f32x4::new([0.457989037, 0.501148760, 0.999108911, 0.999906063]),
             }
         );
     }
diff --git a/src/blending_job.rs b/src/blending_job.rs
index a4b22cb..19d74b0 100644
--- a/src/blending_job.rs
+++ b/src/blending_job.rs
@@ -3,18 +3,15 @@
 //!
 
 use glam::Vec4;
 use std::cell::RefCell;
 use std::rc::Rc;
-use std::simd::prelude::*;
 use std::sync::{Arc, RwLock};
+use wide::f32x4;
 
 use crate::base::{OzzBuf, OzzError, OzzMutBuf, OzzObj};
-use crate::math::{fx4_from_vec4, fx4_sign, SoaQuat, SoaTransform, SoaVec3};
+use crate::math::{fx4_from_vec4, fx4_sign, SoaQuat, SoaTransform, SoaVec3, ONE, ZERO};
 use crate::skeleton::Skeleton;
 
-const ZERO: f32x4 = f32x4::from_array([0.0; 4]);
-const ONE: f32x4 = f32x4::from_array([1.0; 4]);
-
 /// Defines a layer of blending input data (local space transforms) and parameters (weights).
 #[derive(Debug, Clone)]
 pub struct BlendingLayer<I: OzzBuf<SoaTransform>> {
@@ -352,13 +349,13 @@ where
 
                 if ctx.num_passes == 0 {
                     for idx in 0..num_soa_joints {
-                        let weight = layer_weight * layer.joint_weight(idx).simd_max(ZERO);
+                        let weight = layer_weight * layer.joint_weight(idx).fast_max(ZERO);
                         ctx.accumulated_weights[idx] = weight;
                         Self::blend_1st_pass(&transform[idx], weight, &mut output[idx]);
                     }
                 } else {
                     for idx in 0..num_soa_joints {
-                        let weight = layer_weight * layer.joint_weight(idx).simd_max(ZERO);
+                        let weight = layer_weight * layer.joint_weight(idx).fast_max(ZERO);
                         ctx.accumulated_weights[idx] += weight;
                         Self::blend_n_pass(&transform[idx], weight, &mut output[idx]);
                     }
@@ -402,8 +399,8 @@ where
         } else {
             let simd_threshold = f32x4::splat(threshold);
             for idx in 0..joint_rest_poses.len() {
-                let bp_weight = (simd_threshold - ctx.accumulated_weights[idx]).simd_max(ZERO);
-                ctx.accumulated_weights[idx] = simd_threshold.simd_max(ctx.accumulated_weights[idx]);
+                let bp_weight = (simd_threshold - ctx.accumulated_weights[idx]).fast_max(ZERO);
+                ctx.accumulated_weights[idx] = simd_threshold.fast_max(ctx.accumulated_weights[idx]);
                 Self::blend_n_pass(&joint_rest_poses[idx], bp_weight, &mut output[idx]);
             }
         }
@@ -450,7 +447,7 @@ where
 
                 if !layer.joint_weights.is_empty() {
                     for idx in 0..joint_rest_poses.len() {
-                        let weight = layer_weight * layer.joint_weight(idx).simd_max(ZERO);
+                        let weight = layer_weight * layer.joint_weight(idx).fast_max(ZERO);
                         let one_minus_weight = ONE - weight;
                         Self::blend_add_pass(&transform[idx], weight, one_minus_weight, &mut output[idx]);
                     }
@@ -465,7 +462,7 @@ where
 
                 if !layer.joint_weights.is_empty() {
                     for idx in 0..joint_rest_poses.len() {
-                        let weight = layer_weight * layer.joint_weight(idx).simd_max(ZERO);
+                        let weight = layer_weight * layer.joint_weight(idx).fast_max(ZERO);
                         let one_minus_weight = ONE - weight;
                         Self::blend_sub_pass(&transform[idx], weight, one_minus_weight, &mut output[idx]);
                     }
diff --git a/src/ik_aim_job.rs b/src/ik_aim_job.rs
index a17b0c0..aeb1f79 100644
--- a/src/ik_aim_job.rs
+++ b/src/ik_aim_job.rs
@@ -3,8 +3,7 @@
 //!
 
 use glam::{Mat4, Quat, Vec3A};
-use std::simd::prelude::*;
-use std::simd::StdFloat;
+use wide::{f32x4, CmpEq, CmpGt, CmpNe};
 
 use crate::base::OzzError;
 use crate::math::*;
@@ -234,7 +233,7 @@ impl IKAimJob {
 
         let offsetted_forward = Self::compute_offsetted_forward(self.forward, self.offset, joint_to_target_js);
         self.reached = offsetted_forward.is_some();
-        if !self.reached || (joint_to_target_js_len2.simd_eq(ZERO).to_bitmask() & 0x1 == 0x1) {
+        if !self.reached || (joint_to_target_js_len2.cmp_eq(ZERO).to_bitmask() & 0x1 == 0x1) {
             self.joint_correction = QUAT_UNIT;
             return Ok(());
         }
@@ -256,7 +255,7 @@ impl IKAimJob {
 
         let rotate_plane_axis_js;
         let rotate_plane_js;
-        if denoms.simd_ne(ZERO).to_bitmask() & 0x7 == 0x7 {
+        if denoms.cmp_ne(ZERO).to_bitmask() & 0x7 == 0x7 {
             let rsqrts = denoms.sqrt().recip();
             rotate_plane_axis_js = joint_to_target_js * fx4_splat_x(rsqrts);
 
@@ -268,7 +267,7 @@ impl IKAimJob {
             let rotate_plane_axis_flipped_js = fx4_xor(rotate_plane_axis_js, axis_flip);
             rotate_plane_js = quat_from_cos_angle(
                 rotate_plane_axis_flipped_js,
-                rotate_plane_cos_angle.simd_clamp(NEG_ONE, ONE),
+                rotate_plane_cos_angle.fast_max(NEG_ONE).fast_min(ONE), // clamp elements between -1.0 and 1.0
             );
         } else {
             rotate_plane_axis_js = joint_to_target_js * fx4_splat_x(denoms.sqrt().recip());
@@ -284,7 +283,7 @@ impl IKAimJob {
 
         let twisted_fu = quat_positive_w(twisted);
         if self.weight < 1.0 {
-            let simd_weight = f32x4::splat(self.weight).simd_max(ZERO);
+            let simd_weight = f32x4::splat(self.weight).fast_max(ZERO);
             self.joint_correction = quat_normalize(fx4_lerp(QUAT_UNIT, twisted_fu, simd_weight));
         } else {
             self.joint_correction = twisted_fu;
@@ -296,7 +295,7 @@ impl IKAimJob {
         let ao_l = vec3_dot_s(forward, offset);
         let ac_l2 = vec3_length2_s(offset) - ao_l * ao_l;
         let r2 = vec3_length2_s(target);
-        if ac_l2.simd_gt(r2).to_bitmask() & 0x1 == 0x1 {
+        if ac_l2.cmp_gt(r2).to_bitmask() & 0x1 == 0x1 {
             return None;
         }
         let ai_l = (r2 - ac_l2).sqrt();
diff --git a/src/ik_two_bone_job.rs b/src/ik_two_bone_job.rs
index ebf3065..64aa4dc 100644
--- a/src/ik_two_bone_job.rs
+++ b/src/ik_two_bone_job.rs
@@ -3,8 +3,9 @@
 //!
 
 use glam::{Mat4, Quat, Vec3A};
-use std::simd::prelude::*;
-use std::simd::StdFloat;
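+// SIMD types and comparisons come from the `wide` crate, which builds on stable Rust;
+// the previous `std::simd` imports needed the nightly-only `portable_simd` feature.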
+use wide::{f32x4, CmpGt};
 
 use crate::base::OzzError;
 use crate::math::*;
@@ -339,12 +340,12 @@ impl IKTwoBoneJob {
         let start_target_original_ss_len = fx4_splat_z(lengths); // [x y z w]
         let bone_len_diff_abs = (start_mid_ss_len - mid_end_ss_len).abs(); // [x]
         let bones_chain_len = start_mid_ss_len + mid_end_ss_len; // [x]
-        let da = bones_chain_len * fx4_clamp_or_min(f32x4::from_array([self.soften, 0.0, 0.0, 0.0]), ZERO, ONE); // [x 0 0 0] da.yzw needs to be 0
+        let da = bones_chain_len * fx4_clamp_or_min(f32x4::new([self.soften, 0.0, 0.0, 0.0]), ZERO, ONE); // [x 0 0 0] da.yzw needs to be 0
         let ds = bones_chain_len - da; // [x]
 
         let left = fx4_set_w(start_target_original_ss_len, ds); // [x y z w]
         let right = fx4_set_z(da, bone_len_diff_abs); // [x y z w]
-        let comp_mask = left.simd_gt(right).to_bitmask();
+        let comp_mask = left.cmp_gt(right).to_bitmask();
 
         let start_target_ss;
         let start_target_ss_len2;
@@ -410,7 +411,7 @@ impl IKTwoBoneJob {
 
         let mut start_rot_ss = end_to_target_rot_ss;
 
-        if start_target_ss_len2.simd_gt(ZERO).to_bitmask() & 0x1 == 0x1 {
+        if start_target_ss_len2.cmp_gt(ZERO).to_bitmask() & 0x1 == 0x1 {
             // [x]
             let ref_plane_normal_ss = vec3_cross(start_target_ss, pole_ss); // [x y z]
             let ref_plane_normal_ss_len2 = vec3_length2_s(ref_plane_normal_ss); // [x]
@@ -439,7 +440,7 @@ impl IKTwoBoneJob {
 
             let rotate_plane_ss = quat_from_cos_angle(
                 rotate_plane_axis_flipped_ss,
-                rotate_plane_cos_angle.simd_clamp(NEG_ONE, ONE),
+                rotate_plane_cos_angle.fast_max(NEG_ONE).fast_min(ONE), // clamp elements between -1.0 and 1.0
             );
 
             if self.twist_angle != 0.0 {
@@ -457,14 +458,14 @@ impl IKTwoBoneJob {
         let mid_rot_fu = quat_positive_w(mid_rot);
 
         if self.weight < 1.0 {
-            let simd_weight = f32x4::splat(self.weight).simd_max(ZERO);
+            let simd_weight = f32x4::splat(self.weight).fast_max(ZERO);
 
             let start_lerp = fx4_lerp(QUAT_UNIT, start_rot_fu, simd_weight);
             let mid_lerp = fx4_lerp(QUAT_UNIT, mid_rot_fu, simd_weight);
 
-            let rsqrts = f32x4::from_array([
-                (start_lerp * start_lerp).reduce_sum(),
-                (mid_lerp * mid_lerp).reduce_sum(),
+            let rsqrts = f32x4::new([
+                (start_lerp * start_lerp).reduce_add(),
+                (mid_lerp * mid_lerp).reduce_add(),
                 0.0,
                 0.0,
             ])
diff --git a/src/lib.rs b/src/lib.rs
index b41b6cc..0d6b708 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -45,8 +45,6 @@
 //! ```
 //!
 
-#![feature(portable_simd)]
-
 pub mod animation;
 pub mod archive;
 pub mod base;
diff --git a/src/math.rs b/src/math.rs
index fda6f54..a0d0fe6 100644
--- a/src/math.rs
+++ b/src/math.rs
@@ -11,30 +11,30 @@ use static_assertions::const_assert_eq;
 use std::fmt::Debug;
 use std::io::Read;
 use std::mem;
-use std::simd::prelude::*;
-use std::simd::*;
 
+use wide::{f32x4, i32x4};
+
 use crate::archive::{Archive, ArchiveRead};
 use crate::base::OzzError;
 use crate::math;
 
-pub(crate) const ZERO: f32x4 = f32x4::from_array([0.0; 4]);
-pub(crate) const ONE: f32x4 = f32x4::from_array([1.0; 4]);
-pub(crate) const TWO: f32x4 = f32x4::from_array([2.0; 4]);
-pub(crate) const THREE: f32x4 = f32x4::from_array([3.0; 4]);
-pub(crate) const NEG_ONE: f32x4 = f32x4::from_array([-1.0; 4]);
-pub(crate) const FRAC_1_2: f32x4 = f32x4::from_array([0.5; 4]);
-pub(crate) const FRAC_2_PI: f32x4 = f32x4::from_array([core::f32::consts::FRAC_2_PI; 4]);
-pub(crate) const FRAC_PI_2: f32x4 = f32x4::from_array([core::f32::consts::FRAC_PI_2; 4]);
+pub(crate) const ZERO: f32x4 = f32x4::ZERO;
+pub(crate) const ONE: f32x4 = f32x4::ONE;
+pub(crate) const TWO: f32x4 = f32x4::new([2.0; 4]);
+pub(crate) const THREE: f32x4 = f32x4::new([3.0; 4]);
+pub(crate) const NEG_ONE: f32x4 = f32x4::new([-1.0; 4]);
+pub(crate) const FRAC_1_2: f32x4 = f32x4::HALF;
+pub(crate) const FRAC_2_PI: f32x4 = f32x4::FRAC_2_PI;
+pub(crate) const FRAC_PI_2: f32x4 = f32x4::FRAC_PI_2;
 
-pub(crate) const X_AXIS: f32x4 = f32x4::from_array([1.0, 0.0, 0.0, 0.0]);
-pub(crate) const Y_AXIS: f32x4 = f32x4::from_array([0.0, 1.0, 0.0, 0.0]);
-pub(crate) const Z_AXIS: f32x4 = f32x4::from_array([0.0, 0.0, 1.0, 0.0]);
+pub(crate) const X_AXIS: f32x4 = f32x4::new([1.0, 0.0, 0.0, 0.0]);
+pub(crate) const Y_AXIS: f32x4 = f32x4::new([0.0, 1.0, 0.0, 0.0]);
+pub(crate) const Z_AXIS: f32x4 = f32x4::new([0.0, 0.0, 1.0, 0.0]);
 
-pub(crate) const QUAT_UNIT: f32x4 = f32x4::from_array([0.0, 0.0, 0.0, 1.0]);
+pub(crate) const QUAT_UNIT: f32x4 = f32x4::new([0.0, 0.0, 0.0, 1.0]);
 
-const SIGN: i32x4 = i32x4::from_array([i32::MIN; 4]);
-const SIGN_W: i32x4 = i32x4::from_array([0, 0, 0, i32::MIN]);
+pub(crate) const SIGN: i32x4 = i32x4::new([i32::MIN; 4]);
+pub(crate) const SIGN_W: i32x4 = i32x4::new([0, 0, 0, i32::MIN]);
 
 //
 // SoaVec3
@@ -53,9 +53,9 @@ impl SoaVec3 {
     #[inline]
     pub const fn new(x: [f32; 4], y: [f32; 4], z: [f32; 4]) -> SoaVec3 {
         SoaVec3 {
-            x: f32x4::from_array(x),
-            y: f32x4::from_array(y),
-            z: f32x4::from_array(z),
+            x: f32x4::new(x),
+            y: f32x4::new(y),
+            z: f32x4::new(z),
         }
     }
 
@@ -67,22 +67,22 @@ impl SoaVec3 {
     #[inline]
     pub const fn splat_col(v: [f32; 3]) -> SoaVec3 {
         SoaVec3 {
-            x: f32x4::from_array([v[0]; 4]),
-            y: f32x4::from_array([v[1]; 4]),
-            z: f32x4::from_array([v[2]; 4]),
+            x: f32x4::new([v[0]; 4]),
+            y: f32x4::new([v[1]; 4]),
+            z: f32x4::new([v[2]; 4]),
         }
     }
 
     #[inline]
     pub fn col(&self, idx: usize) -> Vec3 {
-        Vec3::new(self.x[idx], self.y[idx], self.z[idx])
+        Vec3::new(self.x.as_array_ref()[idx], self.y.as_array_ref()[idx], self.z.as_array_ref()[idx])
     }
 
     #[inline]
     pub fn set_col(&mut self, idx: usize, v: Vec3) {
-        self.x[idx] = v.x;
-        self.y[idx] = v.y;
-        self.z[idx] = v.z;
+        self.x.as_array_mut()[idx] = v.x;
+        self.y.as_array_mut()[idx] = v.y;
+        self.z.as_array_mut()[idx] = v.z;
     }
 
     #[inline]
@@ -209,9 +209,9 @@ const _: () = {
     impl Serialize for SoaVec3 {
         fn serialize<S: Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
             let mut seq = serializer.serialize_seq(Some(3))?;
-            seq.serialize_element(&self.x.as_array())?;
-            seq.serialize_element(&self.y.as_array())?;
-            seq.serialize_element(&self.z.as_array())?;
+            seq.serialize_element(&self.x.as_array_ref())?;
+            seq.serialize_element(&self.y.as_array_ref())?;
+            seq.serialize_element(&self.z.as_array_ref())?;
             seq.end()
         }
     }
@@ -242,10 +242,10 @@ impl SoaQuat {
     #[inline]
     pub const fn new(x: [f32; 4], y: [f32; 4], z: [f32; 4], w: [f32; 4]) -> SoaQuat {
         SoaQuat {
-            x: f32x4::from_array(x),
-            y: f32x4::from_array(y),
-            z: f32x4::from_array(z),
-            w: f32x4::from_array(w),
+            x: f32x4::new(x),
+            y: f32x4::new(y),
+            z: f32x4::new(z),
+            w: f32x4::new(w),
         }
     }
 
@@ -257,10 +257,10 @@ impl SoaQuat {
     #[inline]
     pub const fn splat_col(v: [f32; 4]) -> SoaQuat {
         SoaQuat {
-            x: f32x4::from_array([v[0]; 4]),
-            y: f32x4::from_array([v[1]; 4]),
-            z: f32x4::from_array([v[2]; 4]),
-            w: f32x4::from_array([v[3]; 4]),
+            x: f32x4::new([v[0]; 4]),
+            y: f32x4::new([v[1]; 4]),
+            z: f32x4::new([v[2]; 4]),
+            w: f32x4::new([v[3]; 4]),
         }
     }
 
@@ -574,10 +574,10 @@ impl AosMat4 {
     ) -> AosMat4 {
         AosMat4 {
             cols: [
-                f32x4::from_array([n00, n01, n02, n03]),
-                f32x4::from_array([n10, n11, n12, n13]),
-                f32x4::from_array([n20, n21, n22, n23]),
-                f32x4::from_array([n30, n31, n32, n33]),
+                f32x4::new([n00, n01, n02, n03]),
+                f32x4::new([n10, n11, n12, n13]),
+                f32x4::new([n20, n21, n22, n23]),
+                f32x4::new([n30, n31, n32, n33]),
             ],
         }
     }
@@ -586,10 +586,10 @@ impl AosMat4 {
     pub(crate) fn new_translation(t: Vec3) -> AosMat4 {
         AosMat4 {
             cols: [
-                f32x4::from_array([1.0, 0.0, 0.0, 0.0]),
-                f32x4::from_array([0.0, 1.0, 0.0, 0.0]),
-                f32x4::from_array([0.0, 0.0, 1.0, 0.0]),
-                f32x4::from_array([t.x, t.y, t.z, 1.0]),
+                f32x4::new([1.0, 0.0, 0.0, 0.0]),
+                f32x4::new([0.0, 1.0, 0.0, 0.0]),
+                f32x4::new([0.0, 0.0, 1.0, 0.0]),
+                f32x4::new([t.x, t.y, t.z, 1.0]),
             ],
         }
     }
@@ -598,10 +598,10 @@ impl AosMat4 {
     pub(crate) fn new_scaling(s: Vec3) -> AosMat4 {
         AosMat4 {
             cols: [
-                f32x4::from_array([s.x, 0.0, 0.0, 0.0]),
-                f32x4::from_array([0.0, s.y, 0.0, 0.0]),
-                f32x4::from_array([0.0, 0.0, s.z, 0.0]),
-                f32x4::from_array([0.0, 0.0, 0.0, 1.0]),
+                f32x4::new([s.x, 0.0, 0.0, 0.0]),
+                f32x4::new([0.0, s.y, 0.0, 0.0]),
+                f32x4::new([0.0, 0.0, s.z, 0.0]),
+                f32x4::new([0.0, 0.0, 0.0, 1.0]),
             ],
         }
     }
@@ -610,10 +610,10 @@ impl AosMat4 {
     pub(crate) fn identity() -> AosMat4 {
         AosMat4 {
             cols: [
-                f32x4::from_array([1.0, 0.0, 0.0, 0.0]),
-                f32x4::from_array([0.0, 1.0, 0.0, 0.0]),
-                f32x4::from_array([0.0, 0.0, 1.0, 0.0]),
-                f32x4::from_array([0.0, 0.0, 0.0, 1.0]),
+                f32x4::new([1.0, 0.0, 0.0, 0.0]),
+                f32x4::new([0.0, 1.0, 0.0, 0.0]),
+                f32x4::new([0.0, 0.0, 1.0, 0.0]),
+                f32x4::new([0.0, 0.0, 0.0, 1.0]),
             ],
         }
     }
@@ -865,10 +865,10 @@ pub(crate) fn f16_to_f32(n: u16) -> f32 {
 
 #[inline]
 pub(crate) fn simd_f16_to_f32(half4: [u16; 4]) -> f32x4 {
-    const MASK_NO_SIGN: i32x4 = i32x4::from_array([0x7FFF; 4]);
-    const MAGIC: f32x4 = fx4(i32x4::from_array([(254 - 15) << 23; 4]));
-    const WAS_INFNAN: i32x4 = i32x4::from_array([0x7BFF; 4]);
-    const EXP_INFNAN: i32x4 = i32x4::from_array([255 << 23; 4]);
+    const MASK_NO_SIGN: i32x4 = i32x4::new([0x7FFF; 4]);
+    const MAGIC: f32x4 = fx4(i32x4::new([(254 - 15) << 23; 4]));
+    const WAS_INFNAN: i32x4 = i32x4::new([0x7BFF; 4]);
+    const EXP_INFNAN: i32x4 = i32x4::new([255 << 23; 4]);
 
     let int4 = i32x4::from([half4[0] as i32, half4[1] as i32, half4[2] as i32, half4[3] as i32]);
     let expmant = MASK_NO_SIGN & int4;
@@ -1035,17 +1035,17 @@ pub(crate) fn fx4_sin_cos(v: f32x4) -> (f32x4, f32x4) {
     // Implementation based on Vec4.inl from the JoltPhysics
     // https://github.com/jrouwe/JoltPhysics/blob/master/Jolt/Math/Vec4.inl
 
-    const N1: f32x4 = f32x4::from_array([1.5703125; 4]);
-    const N2: f32x4 = f32x4::from_array([0.0004837512969970703125; 4]);
-    const N3: f32x4 = f32x4::from_array([7.549789948768648e-8; 4]);
+    const N1: f32x4 = f32x4::new([1.5703125; 4]);
+    const N2: f32x4 = f32x4::new([0.0004837512969970703125; 4]);
+    const N3: f32x4 = f32x4::new([7.549789948768648e-8; 4]);
 
-    const C1: f32x4 = f32x4::from_array([2.443315711809948e-5; 4]);
-    const C2: f32x4 = f32x4::from_array([1.388731625493765e-3; 4]);
-    const C3: f32x4 = f32x4::from_array([4.166664568298827e-2; 4]);
+    const C1: f32x4 = f32x4::new([2.443315711809948e-5; 4]);
+    const C2: f32x4 = f32x4::new([1.388731625493765e-3; 4]);
+    const C3: f32x4 = f32x4::new([4.166664568298827e-2; 4]);
 
-    const S1: f32x4 = f32x4::from_array([-1.9515295891e-4; 4]);
-    const S2: f32x4 = f32x4::from_array([8.3321608736e-3; 4]);
-    const S3: f32x4 = f32x4::from_array([1.6666654611e-1; 4]);
+    const S1: f32x4 = f32x4::new([-1.9515295891e-4; 4]);
+    const S2: f32x4 = f32x4::new([8.3321608736e-3; 4]);
+    const S3: f32x4 = f32x4::new([1.6666654611e-1; 4]);
 
     // Make argument positive and remember sign for sin only since cos is symmetric around x (highest bit of a float is the sign bit)
     let mut sin_sign = fx4_sign(v);
@@ -1129,11 +1129,11 @@ pub(crate) fn fx4_asin(v: f32x4) -> f32x4 {
     // Implementation based on Vec4.inl from the JoltPhysics
     // https://github.com/jrouwe/JoltPhysics/blob/master/Jolt/Math/Vec4.inl
 
-    const N1: f32x4 = f32x4::from_array([4.2163199048e-2; 4]);
-    const N2: f32x4 = f32x4::from_array([2.4181311049e-2; 4]);
-    const N3: f32x4 = f32x4::from_array([4.5470025998e-2; 4]);
-    const N4: f32x4 = f32x4::from_array([7.4953002686e-2; 4]);
-    const N5: f32x4 = f32x4::from_array([1.6666752422e-1; 4]);
+    const N1: f32x4 = f32x4::new([4.2163199048e-2; 4]);
+    const N2: f32x4 = f32x4::new([2.4181311049e-2; 4]);
+    const N3: f32x4 = f32x4::new([4.5470025998e-2; 4]);
+    const N4: f32x4 = f32x4::new([7.4953002686e-2; 4]);
+    const N5: f32x4 = f32x4::new([1.6666752422e-1; 4]);
 
     // Make argument positive
     let asin_sign = fx4_sign(v);
@@ -1167,7 +1167,7 @@ pub(crate) fn fx4_asin(v: f32x4) -> f32x4 {
 
 #[inline]
 pub(crate) fn fx4_acos(v: f32x4) -> f32x4 {
-    const FRAC_PI_2: f32x4 = f32x4::from_array([core::f32::consts::FRAC_PI_2; 4]);
+    const FRAC_PI_2: f32x4 = f32x4::FRAC_PI_2;
     FRAC_PI_2 - fx4_asin(v)
 }
 
@@ -1240,9 +1240,9 @@ pub(crate) fn quat_from_vectors(from: f32x4, to: f32x4) -> f32x4 {
     let quat;
     if real_part_x < 1.0e-6 * norm_from_norm_to_x {
-        if from[0].abs() > from[2].abs() {
-            quat = f32x4::from_array([-from[1], from[0], 0.0, 0.0])
+        if from.as_array_ref()[0].abs() > from.as_array_ref()[2].abs() {
+            quat = f32x4::new([-from.as_array_ref()[1], from.as_array_ref()[0], 0.0, 0.0])
         } else {
-            quat = f32x4::from_array([0.0, -from[2], from[1], 0.0])
+            quat = f32x4::new([0.0, -from.as_array_ref()[2], from.as_array_ref()[1], 0.0])
         }
     } else {
         quat = fx4_set_w(vec3_cross(from, to), real_part)
@@ -1318,13 +1318,13 @@ mod tests {
             0b01111100_00000000,
         ];
         let float4 = simd_f16_to_f32(half4);
-        assert_eq!(float4, f32x4::from_array([1.0f32, -1.0f32, 3.5f32, f32::INFINITY]));
+        assert_eq!(float4, f32x4::new([1.0f32, -1.0f32, 3.5f32, f32::INFINITY]));
 
         let half4 = [0b11111100_00000000, 0, 0x8000, 32791];
         let float4 = simd_f16_to_f32(half4);
         assert_eq!(
             float4,
-            f32x4::from_array([f32::NEG_INFINITY, 0.0f32, 0.0f32, -1.37090683e-06])
+            f32x4::new([f32::NEG_INFINITY, 0.0f32, 0.0f32, -1.37090683e-06])
         );
 
         let half4 = [0xFFFF, 0, 0, 0];
@@ -1382,19 +1382,19 @@ mod tests {
     #[test]
     #[wasm_bindgen_test]
     fn test_sin_cos() {
-        const EPSILON: f32x4 = f32x4::from_array([2.0e-7; 4]);
+        const EPSILON: f32x4 = f32x4::new([2.0e-7; 4]);
 
-        let (sin, cos) = fx4_sin_cos(f32x4::from_array([
+        let (sin, cos) = fx4_sin_cos(f32x4::new([
             0.0,
             core::f32::consts::FRAC_PI_2,
             core::f32::consts::PI,
             -core::f32::consts::FRAC_PI_2,
         ]));
-        assert!((sin - f32x4::from_array([0.0, 1.0, 0.0, -1.0]))
+        assert!((sin - f32x4::new([0.0, 1.0, 0.0, -1.0]))
             .abs()
             .simd_lt(EPSILON)
             .all());
-        assert!((cos - f32x4::from_array([1.0, 0.0, -1.0, 0.0]))
+        assert!((cos - f32x4::new([1.0, 0.0, -1.0, 0.0]))
             .abs()
             .simd_lt(EPSILON)
             .all());
@@ -1404,7 +1404,7 @@ mod tests {
 
         let mut i = -100.0 * core::f32::consts::PI;
         while i < 100.0 * core::f32::consts::PI {
-            let iv = f32x4::splat(i) + f32x4::from_array([0.0e-4, 2.5e-4, 5.0e-4, 7.5e-4]);
+            let iv = f32x4::splat(i) + f32x4::new([0.0e-4, 2.5e-4, 5.0e-4, 7.5e-4]);
             let (sin, cos) = fx4_sin_cos(iv);
 
             for i in 0..4 {
@@ -1437,7 +1437,7 @@ mod tests {
 
         let mut i = -1.0;
         while i < 1.0 {
-            let iv = f32x4::splat(i) + f32x4::from_array([0.0e-4, 2.5e-4, 5.0e-4, 7.5e-4]).simd_min(f32x4::splat(1.0));
+            let iv = f32x4::splat(i) + f32x4::new([0.0e-4, 2.5e-4, 5.0e-4, 7.5e-4]).fast_min(f32x4::splat(1.0));
             let asin = fx4_asin(iv);
 
             for i in 0..4 {
@@ -1465,7 +1465,7 @@ mod tests {
 
         let mut i = -1.0;
         while i < 1.0 {
-            let iv = f32x4::splat(i) + f32x4::from_array([0.0e-4, 2.5e-4, 5.0e-4, 7.5e-4]).simd_min(f32x4::splat(1.0));
+            let iv = f32x4::splat(i) + f32x4::new([0.0e-4, 2.5e-4, 5.0e-4, 7.5e-4]).fast_min(f32x4::splat(1.0));
             let acos = fx4_acos(iv);
 
             for i in 0..4 {
diff --git a/src/sampling_job.rs b/src/sampling_job.rs
index ac882fa..6119325 100644
--- a/src/sampling_job.rs
+++ b/src/sampling_job.rs
@@ -6,10 +6,11 @@ use std::alloc::{self, Layout};
 use std::cell::RefCell;
 use std::fmt::{Debug, Formatter};
 use std::rc::Rc;
-use std::simd::prelude::*;
 use std::sync::{Arc, RwLock};
 use std::{mem, ptr, slice};
 
+use wide::f32x4;
+
 use crate::animation::{Animation, Float3Key, KeyframesCtrl, QuaternionKey};
 use crate::base::{align_ptr, align_usize, OzzError, OzzMutBuf, OzzObj};
 use crate::math::{f32_clamp_or_max, SoaQuat, SoaTransform, SoaVec3};
@@ -124,18 +125,18 @@ const _: () = {
 mod serde_interp {
     use serde::ser::SerializeSeq;
     use serde::{Deserialize, Deserializer, Serializer};
-    use std::simd::prelude::*;
+    use wide::f32x4;
 
     pub(crate) fn serialize<S: Serializer>(value: &[f32x4; 2], serializer: S) -> Result<S::Ok, S::Error> {
         let mut seq = serializer.serialize_seq(Some(2))?;
-        seq.serialize_element(value[0].as_array())?;
-        seq.serialize_element(value[1].as_array())?;
+        seq.serialize_element(value[0].as_array_ref())?;
+        seq.serialize_element(value[1].as_array_ref())?;
         seq.end()
     }
 
     pub(crate) fn deserialize<'de, D: Deserializer<'de>>(deserializer: D) -> Result<[f32x4; 2], D::Error> {
         let tmp: [[f32; 4]; 2] = Deserialize::deserialize(deserializer)?;
-        Ok([f32x4::from_array(tmp[0]), f32x4::from_array(tmp[1])])
+        Ok([f32x4::new(tmp[0]), f32x4::new(tmp[1])])
     }
 }
 
@@ -1283,7 +1284,7 @@ where
 
     #[inline(always)]
     fn key_ratio_simd(ctrl: &KeyframesCtrl<'_>, timepoints: &[f32], ats: &[u32]) -> f32x4 {
-        f32x4::from_array([
+        f32x4::new([
             timepoints[ctrl.ratios[ats[0] as usize] as usize],
             timepoints[ctrl.ratios[ats[1] as usize] as usize],
             timepoints[ctrl.ratios[ats[2] as usize] as usize],
diff --git a/src/skeleton.rs b/src/skeleton.rs
index 3fc947e..13a0664 100644
--- a/src/skeleton.rs
+++ b/src/skeleton.rs
@@ -409,8 +409,8 @@ const _: () = {
 
 #[cfg(test)]
 mod tests {
-    use std::simd::prelude::*;
     use wasm_bindgen_test::*;
+    use wide::f32x4;
 
     use super::*;
     use crate::math::{SoaQuat, SoaVec3};
@@ -425,53 +425,53 @@ mod tests {
         assert_eq!(
             skeleton.joint_rest_poses()[0].translation,
             SoaVec3 {
-                x: f32x4::from_array([-4.01047945e-10, 0.00000000, 0.0710870326, 0.110522307]),
-                y: f32x4::from_array([1.04666960, 0.00000000, -8.79573781e-05, -7.82728166e-05]),
-                z: f32x4::from_array([-0.0151103791, 0.00000000, 9.85883801e-08, -2.17094467e-10]),
+                x: f32x4::new([-4.01047945e-10, 0.00000000, 0.0710870326, 0.110522307]),
+                y: f32x4::new([1.04666960, 0.00000000, -8.79573781e-05, -7.82728166e-05]),
+                z: f32x4::new([-0.0151103791, 0.00000000, 9.85883801e-08, -2.17094467e-10]),
             },
         );
         assert_eq!(
             skeleton.joint_rest_poses()[16].translation,
             SoaVec3 {
-                x: f32x4::from_array([0.458143145, 0.117970668, 0.0849116519, 0.00000000]),
-                y: f32x4::from_array([2.64545919e-09, 0.148304969, 0.00000000, 0.00000000]),
-                z: f32x4::from_array([-4.97557555e-14, -7.47846236e-15, -1.77635680e-17, 0.00000000]),
+                x: f32x4::new([0.458143145, 0.117970668, 0.0849116519, 0.00000000]),
+                y: f32x4::new([2.64545919e-09, 0.148304969, 0.00000000, 0.00000000]),
+                z: f32x4::new([-4.97557555e-14, -7.47846236e-15, -1.77635680e-17, 0.00000000]),
             }
         );
 
         assert_eq!(
             skeleton.joint_rest_poses()[0].rotation,
             SoaQuat {
-                x: f32x4::from_array([-0.500000000, -0.499999702, -1.41468570e-06, -3.05311332e-14]),
-                y: f32x4::from_array([-0.500000000, -0.500000358, -6.93941161e-07, 1.70812796e-22]),
-                z: f32x4::from_array([-0.500000000, -0.499999702, 0.000398159056, 1.08420217e-19]),
-                w: f32x4::from_array([0.500000000, 0.500000358, 1.00000000, 1.00000000]),
+                x: f32x4::new([-0.500000000, -0.499999702, -1.41468570e-06, -3.05311332e-14]),
+                y: f32x4::new([-0.500000000, -0.500000358, -6.93941161e-07, 1.70812796e-22]),
+                z: f32x4::new([-0.500000000, -0.499999702, 0.000398159056, 1.08420217e-19]),
+                w: f32x4::new([0.500000000, 0.500000358, 1.00000000, 1.00000000]),
             },
         );
         assert_eq!(
             skeleton.joint_rest_poses()[16].rotation,
             SoaQuat {
-                x: f32x4::from_array([-2.20410801e-09, 4.11812209e-07, -6.55128745e-32, 0.00000000]),
-                y: f32x4::from_array([4.60687737e-08, -4.11812152e-07, -1.30968591e-21, 0.00000000]),
-                z: f32x4::from_array([0.0498105064, 0.707106829, -2.46519033e-32, 0.00000000]),
-                w: f32x4::from_array([0.998758733, 0.707106769, 1.00000000, 1.00000000]),
+                x: f32x4::new([-2.20410801e-09, 4.11812209e-07, -6.55128745e-32, 0.00000000]),
+                y: f32x4::new([4.60687737e-08, -4.11812152e-07, -1.30968591e-21, 0.00000000]),
+                z: f32x4::new([0.0498105064, 0.707106829, -2.46519033e-32, 0.00000000]),
+                w: f32x4::new([0.998758733, 0.707106769, 1.00000000, 1.00000000]),
             }
         );
 
         assert_eq!(
             skeleton.joint_rest_poses()[0].scale,
             SoaVec3 {
-                x: f32x4::from_array([1.0, 1.0, 1.0, 1.0]),
-                y: f32x4::from_array([1.0, 1.0, 1.0, 1.0]),
-                z: f32x4::from_array([1.0, 1.0, 1.0, 1.0]),
+                x: f32x4::new([1.0, 1.0, 1.0, 1.0]),
+                y: f32x4::new([1.0, 1.0, 1.0, 1.0]),
+                z: f32x4::new([1.0, 1.0, 1.0, 1.0]),
             },
         );
         assert_eq!(
             skeleton.joint_rest_poses()[16].scale,
             SoaVec3 {
-                x: f32x4::from_array([0.999999940, 1.0, 1.0, 1.0]),
-                y: f32x4::from_array([0.999999940, 1.0, 1.0, 1.0]),
-                z: f32x4::from_array([1.0, 1.0, 1.0, 1.0]),
+                x: f32x4::new([0.999999940, 1.0, 1.0, 1.0]),
+                y: f32x4::new([0.999999940, 1.0, 1.0, 1.0]),
+                z: f32x4::new([1.0, 1.0, 1.0, 1.0]),
             }
         );