Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP port to wide from std-simd #101

Draft
wants to merge 1 commit into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,22 @@ serde = ["dep:serde", "glam/serde", "bimap/serde" ]
rkyv = ["dep:rkyv", "dep:bytecheck", "glam/rkyv", "glam/bytecheck"]
wasm = []
nodejs = ["wasm", "dep:js-sys", "dep:wasm-bindgen"]
archive_le = []
archive_be = []

[dependencies]
bimap = { version = "0.6" }
bytecheck = { version = "0.6", optional = true, default-features = false }
-glam = { version = "0.27", features = [ "core-simd", "libm" ] }
+glam = { version = "0.27", features = [ "libm" ] }
+# glam = { version = "0.27", features = [ "core-simd", "libm" ] }
js-sys = { version = "0.3", optional = true }
rkyv = { version = "0.7", optional = true, features = [ "validation" ] }
serde = { version= "1.0", optional = true, features = [ "serde_derive" ] }
static_assertions = "1.1"
thiserror = "1.0"
wasm-bindgen = { version = "0.2", optional = true }
+# wide = "0.7.30"
+wide = { git = "https://github.com/Lokathor/wide.git", rev = "0f15f92bff68e36bf60a7305d9201873dd26d6b9" }

[dev-dependencies]
getrandom = { version = "0.2", features = ["js"] }
Expand Down
2 changes: 1 addition & 1 deletion rust-toolchain
Original file line number Diff line number Diff line change
@@ -1 +1 @@
-nightly
+1.83
41 changes: 20 additions & 21 deletions src/animation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,8 @@
use glam::{Quat, Vec3, Vec4};
use std::alloc::{self, Layout};
use std::io::Read;
-use std::simd::prelude::*;
-use std::simd::*;
 use std::{mem, slice};
+use wide::{f32x4, i32x4};

use crate::archive::{Archive, ArchiveRead};
use crate::base::{align_ptr, align_usize, OzzError};
Expand Down Expand Up @@ -102,15 +101,15 @@ impl QuaternionKey {
k3: &QuaternionKey,
soa: &mut SoaQuat,
) {
-const MASK_F000:i32x4 = i32x4::from_array([-1i32, 0, 0, 0]);
-const MASK_0F00:i32x4 = i32x4::from_array([0, -1i32, 0, 0]);
-const MASK_00F0:i32x4 = i32x4::from_array([0, 0, -1i32, 0]);
-const MASK_000F:i32x4 = i32x4::from_array([0, 0, 0, -1i32]);
+const MASK_F000:i32x4 = i32x4::new([-1i32, 0, 0, 0]);
+const MASK_0F00:i32x4 = i32x4::new([0, -1i32, 0, 0]);
+const MASK_00F0:i32x4 = i32x4::new([0, 0, -1i32, 0]);
+const MASK_000F:i32x4 = i32x4::new([0, 0, 0, -1i32]);

const MAPPING: [[usize; 4]; 4] = [[0, 0, 1, 2], [0, 0, 1, 2], [0, 1, 0, 2], [0, 1, 2, 0]];

-const SCALE: f32x4 = f32x4::from_array([core::f32::consts::SQRT_2 / 32767.0; 4]);
-const OFFSET: f32x4 = f32x4::from_array([-core::f32::consts::SQRT_2 / 2.0; 4]);
+const SCALE: f32x4 = f32x4::new([core::f32::consts::SQRT_2 / 32767.0; 4]);
+const OFFSET: f32x4 = f32x4::new([-core::f32::consts::SQRT_2 / 2.0; 4]);

let (largest0, sign0, value0) = k0.unpack();
let (largest1, sign1, value1) = k1.unpack();
Expand All @@ -123,10 +122,10 @@ impl QuaternionKey {
let m3 = &MAPPING[largest3 as usize];

let cmp_keys: [f32x4; 4] = [
-f32x4::from_array([ value0[m0[0]] as f32, value1[m1[0]] as f32, value2[m2[0]] as f32, value3[m3[0]] as f32 ]),
-f32x4::from_array([ value0[m0[1]] as f32, value1[m1[1]] as f32, value2[m2[1]] as f32, value3[m3[1]] as f32 ]),
-f32x4::from_array([ value0[m0[2]] as f32, value1[m1[2]] as f32, value2[m2[2]] as f32, value3[m3[2]] as f32 ]),
-f32x4::from_array([ value0[m0[3]] as f32, value1[m1[3]] as f32, value2[m2[3]] as f32, value3[m3[3]] as f32 ]),
+f32x4::new([ value0[m0[0]] as f32, value1[m1[0]] as f32, value2[m2[0]] as f32, value3[m3[0]] as f32 ]),
+f32x4::new([ value0[m0[1]] as f32, value1[m1[1]] as f32, value2[m2[1]] as f32, value3[m3[1]] as f32 ]),
+f32x4::new([ value0[m0[2]] as f32, value1[m1[2]] as f32, value2[m2[2]] as f32, value3[m3[2]] as f32 ]),
+f32x4::new([ value0[m0[3]] as f32, value1[m1[3]] as f32, value2[m2[3]] as f32, value3[m3[3]] as f32 ]),
]; // TODO: simd int to float

let mut cpnt = [
Expand All @@ -141,9 +140,9 @@ impl QuaternionKey {
cpnt[largest3 as usize] = fx4(ix4(cpnt[largest3 as usize]) & !MASK_000F);

let dot = cpnt[0] * cpnt[0] + cpnt[1] * cpnt[1] + cpnt[2] * cpnt[2] + cpnt[3] * cpnt[3];
-let ww0 = f32x4::simd_max(ZERO, ONE - dot); // prevent NaN, different from C++ code
+let ww0 = f32x4::fast_max(ZERO, ONE - dot); // prevent NaN, different from C++ code
 let w0 = ww0.sqrt();
-let sign = i32x4::from_array([sign0 as i32, sign1 as i32, sign2 as i32, sign3 as i32]) << 31;
+let sign = i32x4::new([sign0 as i32, sign1 as i32, sign2 as i32, sign3 as i32]) << 31;
let restored = ix4(w0) | sign;

cpnt[largest0 as usize] = fx4(ix4(cpnt[largest0 as usize]) | (restored & MASK_F000));
Expand Down Expand Up @@ -1116,14 +1115,14 @@ mod tests {
assert_eq!(
soa,
SoaVec3 {
-x: f32x4::from_array([0.0711059570, 0.0251312255859375, 0.0711059570, 0.0251312255859375]),
-y: f32x4::from_array([
+x: f32x4::new([0.0711059570, 0.0251312255859375, 0.0711059570, 0.0251312255859375]),
+y: f32x4::new([
 -8.77380371e-05,
 5.960464477539063e-8,
 -8.77380371e-05,
 5.960464477539063e-8
 ]),
-z: f32x4::from_array([1.84774399e-06, 0.0, 1.84774399e-06, 0.0]),
+z: f32x4::new([1.84774399e-06, 0.0, 1.84774399e-06, 0.0]),
}
);
}
Expand Down Expand Up @@ -1172,10 +1171,10 @@ mod tests {
assert_eq!(
soa,
SoaQuat {
-x: f32x4::from_array([-0.491480947, -0.498861253, -0.00912827253, 0.00852406025]),
-y: f32x4::from_array([-0.508615375, -0.501123607, 0.0251405239, 0.00882613659]),
-z: f32x4::from_array([-0.538519204, -0.498861253, -0.0326502919, 0.00610709190]),
-w: f32x4::from_array([0.457989037, 0.501148760, 0.999108911, 0.999906063]),
+x: f32x4::new([-0.491480947, -0.498861253, -0.00912827253, 0.00852406025]),
+y: f32x4::new([-0.508615375, -0.501123607, 0.0251405239, 0.00882613659]),
+z: f32x4::new([-0.538519204, -0.498861253, -0.0326502919, 0.00610709190]),
+w: f32x4::new([0.457989037, 0.501148760, 0.999108911, 0.999906063]),
}
);
}
Expand Down
19 changes: 8 additions & 11 deletions src/blending_job.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,15 @@
//!

use glam::Vec4;
+use wide::f32x4;
 use std::cell::RefCell;
 use std::rc::Rc;
-use std::simd::prelude::*;
use std::sync::{Arc, RwLock};

use crate::base::{OzzBuf, OzzError, OzzMutBuf, OzzObj};
-use crate::math::{fx4_from_vec4, fx4_sign, SoaQuat, SoaTransform, SoaVec3};
+use crate::math::{fx4_from_vec4, fx4_sign, SoaQuat, SoaTransform, SoaVec3, ONE, ZERO};
 use crate::skeleton::Skeleton;

-const ZERO: f32x4 = f32x4::from_array([0.0; 4]);
-const ONE: f32x4 = f32x4::from_array([1.0; 4]);

/// Defines a layer of blending input data (local space transforms) and parameters (weights).
#[derive(Debug, Clone)]
pub struct BlendingLayer<I: OzzBuf<SoaTransform>> {
Expand Down Expand Up @@ -352,13 +349,13 @@ where

if ctx.num_passes == 0 {
for idx in 0..num_soa_joints {
-let weight = layer_weight * layer.joint_weight(idx).simd_max(ZERO);
+let weight = layer_weight * layer.joint_weight(idx).fast_max(ZERO);
ctx.accumulated_weights[idx] = weight;
Self::blend_1st_pass(&transform[idx], weight, &mut output[idx]);
}
} else {
for idx in 0..num_soa_joints {
-let weight = layer_weight * layer.joint_weight(idx).simd_max(ZERO);
+let weight = layer_weight * layer.joint_weight(idx).fast_max(ZERO);
ctx.accumulated_weights[idx] += weight;
Self::blend_n_pass(&transform[idx], weight, &mut output[idx]);
}
Expand Down Expand Up @@ -402,8 +399,8 @@ where
} else {
let simd_threshold = f32x4::splat(threshold);
for idx in 0..joint_rest_poses.len() {
-let bp_weight = (simd_threshold - ctx.accumulated_weights[idx]).simd_max(ZERO);
-ctx.accumulated_weights[idx] = simd_threshold.simd_max(ctx.accumulated_weights[idx]);
+let bp_weight = (simd_threshold - ctx.accumulated_weights[idx]).fast_max(ZERO);
+ctx.accumulated_weights[idx] = simd_threshold.fast_max(ctx.accumulated_weights[idx]);
Self::blend_n_pass(&joint_rest_poses[idx], bp_weight, &mut output[idx]);
}
}
Expand Down Expand Up @@ -450,7 +447,7 @@ where

if !layer.joint_weights.is_empty() {
for idx in 0..joint_rest_poses.len() {
-let weight = layer_weight * layer.joint_weight(idx).simd_max(ZERO);
+let weight = layer_weight * layer.joint_weight(idx).fast_max(ZERO);
let one_minus_weight = ONE - weight;
Self::blend_add_pass(&transform[idx], weight, one_minus_weight, &mut output[idx]);
}
Expand All @@ -465,7 +462,7 @@ where

if !layer.joint_weights.is_empty() {
for idx in 0..joint_rest_poses.len() {
-let weight = layer_weight * layer.joint_weight(idx).simd_max(ZERO);
+let weight = layer_weight * layer.joint_weight(idx).fast_max(ZERO);
let one_minus_weight = ONE - weight;
Self::blend_sub_pass(&transform[idx], weight, one_minus_weight, &mut output[idx]);
}
Expand Down
13 changes: 6 additions & 7 deletions src/ik_aim_job.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,7 @@
//!

use glam::{Mat4, Quat, Vec3A};
-use std::simd::prelude::*;
-use std::simd::StdFloat;
+use wide::{f32x4, CmpEq, CmpGt, CmpNe};

use crate::base::OzzError;
use crate::math::*;
Expand Down Expand Up @@ -234,7 +233,7 @@ impl IKAimJob {

let offsetted_forward = Self::compute_offsetted_forward(self.forward, self.offset, joint_to_target_js);
self.reached = offsetted_forward.is_some();
-if !self.reached || (joint_to_target_js_len2.simd_eq(ZERO).to_bitmask() & 0x1 == 0x1) {
+if !self.reached || (joint_to_target_js_len2.cmp_eq(ZERO).to_bitmask() & 0x1 == 0x1) {
self.joint_correction = QUAT_UNIT;
return Ok(());
}
Expand All @@ -256,7 +255,7 @@ impl IKAimJob {

let rotate_plane_axis_js;
let rotate_plane_js;
-if denoms.simd_ne(ZERO).to_bitmask() & 0x7 == 0x7 {
+if denoms.cmp_ne(ZERO).to_bitmask() & 0x7 == 0x7 {
let rsqrts = denoms.sqrt().recip();
rotate_plane_axis_js = joint_to_target_js * fx4_splat_x(rsqrts);

Expand All @@ -268,7 +267,7 @@ impl IKAimJob {
let rotate_plane_axis_flipped_js = fx4_xor(rotate_plane_axis_js, axis_flip);
rotate_plane_js = quat_from_cos_angle(
rotate_plane_axis_flipped_js,
-rotate_plane_cos_angle.simd_clamp(NEG_ONE, ONE),
+rotate_plane_cos_angle.fast_max(NEG_ONE).fast_min(ONE), // clamp elements between -1.0 and 1.0
);
} else {
rotate_plane_axis_js = joint_to_target_js * fx4_splat_x(denoms.sqrt().recip());
Expand All @@ -284,7 +283,7 @@ impl IKAimJob {

let twisted_fu = quat_positive_w(twisted);
if self.weight < 1.0 {
-let simd_weight = f32x4::splat(self.weight).simd_max(ZERO);
+let simd_weight = f32x4::splat(self.weight).fast_max(ZERO);
self.joint_correction = quat_normalize(fx4_lerp(QUAT_UNIT, twisted_fu, simd_weight));
} else {
self.joint_correction = twisted_fu;
Expand All @@ -296,7 +295,7 @@ impl IKAimJob {
let ao_l = vec3_dot_s(forward, offset);
let ac_l2 = vec3_length2_s(offset) - ao_l * ao_l;
let r2 = vec3_length2_s(target);
-if ac_l2.simd_gt(r2).to_bitmask() & 0x1 == 0x1 {
+if ac_l2.cmp_gt(r2).to_bitmask() & 0x1 == 0x1 {
return None;
}
let ai_l = (r2 - ac_l2).sqrt();
Expand Down
21 changes: 11 additions & 10 deletions src/ik_two_bone_job.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,9 @@
//!

use glam::{Mat4, Quat, Vec3A};
-use std::simd::prelude::*;
-use std::simd::StdFloat;
+// use std::simd::prelude::*;
+// use std::simd::StdFloat;
+use wide::{f32x4, CmpGt};

use crate::base::OzzError;
use crate::math::*;
Expand Down Expand Up @@ -339,12 +340,12 @@ impl IKTwoBoneJob {
let start_target_original_ss_len = fx4_splat_z(lengths); // [x y z w]
let bone_len_diff_abs = (start_mid_ss_len - mid_end_ss_len).abs(); // [x]
let bones_chain_len = start_mid_ss_len + mid_end_ss_len; // [x]
-let da = bones_chain_len * fx4_clamp_or_min(f32x4::from_array([self.soften, 0.0, 0.0, 0.0]), ZERO, ONE); // [x 0 0 0] da.yzw needs to be 0
+let da = bones_chain_len * fx4_clamp_or_min(f32x4::new([self.soften, 0.0, 0.0, 0.0]), ZERO, ONE); // [x 0 0 0] da.yzw needs to be 0
let ds = bones_chain_len - da; // [x]

let left = fx4_set_w(start_target_original_ss_len, ds); // [x y z w]
let right = fx4_set_z(da, bone_len_diff_abs); // [x y z w]
-let comp_mask = left.simd_gt(right).to_bitmask();
+let comp_mask = left.cmp_gt(right).to_bitmask();

let start_target_ss;
let start_target_ss_len2;
Expand Down Expand Up @@ -410,7 +411,7 @@ impl IKTwoBoneJob {

let mut start_rot_ss = end_to_target_rot_ss;

-if start_target_ss_len2.simd_gt(ZERO).to_bitmask() & 0x1 == 0x1 {
+if start_target_ss_len2.cmp_gt(ZERO).to_bitmask() & 0x1 == 0x1 {
// [x]
let ref_plane_normal_ss = vec3_cross(start_target_ss, pole_ss); // [x y z]
let ref_plane_normal_ss_len2 = vec3_length2_s(ref_plane_normal_ss); // [x]
Expand Down Expand Up @@ -439,7 +440,7 @@ impl IKTwoBoneJob {

let rotate_plane_ss = quat_from_cos_angle(
rotate_plane_axis_flipped_ss,
-rotate_plane_cos_angle.simd_clamp(NEG_ONE, ONE),
+rotate_plane_cos_angle.fast_max(NEG_ONE).fast_min(ONE), // clamp elements between -1.0 and 1.0
);

if self.twist_angle != 0.0 {
Expand All @@ -457,14 +458,14 @@ impl IKTwoBoneJob {
let mid_rot_fu = quat_positive_w(mid_rot);

if self.weight < 1.0 {
-let simd_weight = f32x4::splat(self.weight).simd_max(ZERO);
+let simd_weight = f32x4::splat(self.weight).fast_max(ZERO);

let start_lerp = fx4_lerp(QUAT_UNIT, start_rot_fu, simd_weight);
let mid_lerp = fx4_lerp(QUAT_UNIT, mid_rot_fu, simd_weight);

-let rsqrts = f32x4::from_array([
-(start_lerp * start_lerp).reduce_sum(),
-(mid_lerp * mid_lerp).reduce_sum(),
+let rsqrts = f32x4::new([
+(start_lerp * start_lerp).reduce_add(),
+(mid_lerp * mid_lerp).reduce_add(),
0.0,
0.0,
])
Expand Down
2 changes: 0 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,8 +45,6 @@
//! ```
//!

-#![feature(portable_simd)]

pub mod animation;
pub mod archive;
pub mod base;
Expand Down
Loading