diff --git a/sse2neon.h b/sse2neon.h
index 060aad69..4cfb72b6 100644
--- a/sse2neon.h
+++ b/sse2neon.h
@@ -130,6 +130,20 @@
 
 #include <stdint.h>
 #include <stdlib.h>
+#include <string.h>
+
+FORCE_INLINE double recast_f64(uint64_t u64)
+{
+    double f64;
+    memcpy(&f64, &u64, sizeof(uint64_t));
+    return f64;
+}
+FORCE_INLINE int64_t recast_i64(double f64)
+{
+    int64_t i64;
+    memcpy(&i64, &f64, sizeof(uint64_t));
+    return i64;
+}
 
 #if defined(_WIN32)
 /* Definitions for _mm_{malloc,free} are provided by <malloc.h>
@@ -566,17 +580,6 @@ typedef union ALIGN_STRUCT(16) SIMDVec {
 #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
 #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
 
-typedef union bit64_union_t {
-    double f64;
-    int64_t i64;
-    uint64_t u64;
-} bit64_union_t;
-typedef union bit32_union_t {
-    float f32;
-    int32_t i32;
-    uint32_t u32;
-} bit32_union_t;
-
 // Function declaration
 // SSE
 FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
@@ -2992,14 +2995,13 @@ FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = a0.f64 + b0.f64;
-    c[1] = a1.f64 + b1.f64;
+    c[0] = a0 + b0;
+    c[1] = a1 + b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -3013,13 +3015,13 @@ FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_add_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, a1, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     double c[2];
-    c[0] = a0.f64 + b0.f64;
-    c[1] = a1.f64;
+    c[0] = a0 + b0;
+    c[1] = a1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -3273,14 +3275,13 @@ FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.f64 >= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 >= b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3296,13 +3297,13 @@ FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpge_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = a0.f64 >= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = a0 >= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3344,14 +3345,13 @@ FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.f64 > b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 > b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3367,13 +3367,13 @@ FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = a0.f64 > b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = a0 > b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3388,14 +3388,13 @@ FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.f64 <= b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 <= b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3411,13 +3410,13 @@ FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
     return _mm_move_sd(a, _mm_cmple_pd(a, b));
 #else
     // expand "_mm_cmpge_pd()" to reduce unnecessary operations
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = a0.f64 <= b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = a0 <= b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3462,14 +3461,13 @@ FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_u64(
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.f64 < b1.f64 ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1 < b1 ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3484,13 +3482,13 @@ FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmplt_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = a0.f64 < b0.f64 ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = a0 < b0 ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3532,14 +3530,13 @@ FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
         vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = !(a0.f64 >= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = !(a1.f64 >= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 >= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 >= b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3564,14 +3561,13 @@ FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
         vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = !(a0.f64 > b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = !(a1.f64 > b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 > b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 > b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3596,14 +3592,13 @@ FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
         vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = !(a0.f64 <= b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = !(a1.f64 <= b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 <= b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 <= b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3628,14 +3623,13 @@ FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
         vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
         vdupq_n_u64(UINT64_MAX)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = !(a0.f64 < b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = !(a1.f64 < b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = !(a0 < b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = !(a1 < b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3663,14 +3657,13 @@ FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
         vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
     return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = (a1.f64 == a1.f64 && b1.f64 == b1.f64) ? ~UINT64_C(0) : UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = (a1 == a1 && b1 == b1) ? ~UINT64_C(0) : UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3685,13 +3678,13 @@ FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpord_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? ~UINT64_C(0) : UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = (a0 == a0 && b0 == b0) ? ~UINT64_C(0) : UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3711,14 +3704,13 @@ FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_s32(
         vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0);
-    d[1] = (a1.f64 == a1.f64 && b1.f64 == b1.f64) ? UINT64_C(0) : ~UINT64_C(0);
+    d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
+    d[1] = (a1 == a1 && b1 == b1) ? UINT64_C(0) : ~UINT64_C(0);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3733,13 +3725,13 @@ FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    uint64_t a1 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     uint64_t d[2];
-    d[0] = (a0.f64 == a0.f64 && b0.f64 == b0.f64) ? UINT64_C(0) : ~UINT64_C(0);
-    d[1] = a1.u64;
+    d[0] = (a0 == a0 && b0 == b0) ? UINT64_C(0) : ~UINT64_C(0);
+    d[1] = a1;
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -3753,10 +3745,10 @@ FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
 #else
-    bit64_union_t a0, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    return a0.f64 >= b0.f64;
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return a0 >= b0;
 #endif
 }
 
@@ -3768,11 +3760,11 @@ FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
 #else
-    bit64_union_t a0, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (a0.f64 > b0.f64);
+    return (a0 > b0);
 #endif
 }
 
@@ -3784,11 +3776,11 @@ FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
 #else
-    bit64_union_t a0, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (a0.f64 <= b0.f64);
+    return (a0 <= b0);
 #endif
 }
 
@@ -3800,11 +3792,11 @@ FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
 #else
-    bit64_union_t a0, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double a0, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
 
-    return (a0.f64 < b0.f64);
+    return (a0 < b0);
 #endif
 }
 
@@ -3873,10 +3865,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
         vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    bit64_union_t d0, d1;
-    d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
-    d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1);
-    return _mm_set_epi32(0, 0, (int32_t) d1.f64, (int32_t) d0.f64);
+    double d0, d1;
+    d0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
+    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
 #endif
 }
 
@@ -3886,10 +3878,10 @@ FORCE_INLINE_OPTNONE __m128i _mm_cvtpd_epi32(__m128d a)
 FORCE_INLINE_OPTNONE __m64 _mm_cvtpd_pi32(__m128d a)
 {
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    bit64_union_t d0, d1;
-    d0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
-    d1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1);
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0.f64, (int32_t) d1.f64};
+    double d0, d1;
+    d0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    d1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 1));
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
 
@@ -3903,10 +3895,10 @@ FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
     float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
     return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
 #else
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    return _mm_set_ps(0, 0, (float) a1.f64, (float) a0.f64);
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_ps(0, 0, (float) a1, (float) a0);
 #endif
 }
 
@@ -4005,9 +3997,8 @@ FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
 #else
-    bit64_union_t _a;
-    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    return _a.f64;
+    double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return _a;
 #endif
 }
 
@@ -4020,9 +4011,8 @@ FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
     return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    bit64_union_t ret;
-    ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
-    return (int32_t) ret.f64;
+    double ret = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    return (int32_t) ret;
 #endif
 }
 
@@ -4035,9 +4025,8 @@ FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
     return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
     __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
-    bit64_union_t ret;
-    ret.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0);
-    return (int64_t) ret.f64;
+    double ret = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(rnd), 0));
+    return (int64_t) ret;
 #endif
 }
 
@@ -4058,10 +4047,9 @@ FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
         vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
         vreinterpretq_f32_m128(a), 0));
 #else
-    bit64_union_t b0;
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
     return vreinterpretq_m128_f32(
-        vsetq_lane_f32((float) b0.f64, vreinterpretq_f32_m128(a), 0));
+        vsetq_lane_f32((float) b0, vreinterpretq_f32_m128(a), 0));
 #endif
 }
 
@@ -4093,10 +4081,9 @@ FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    bit64_union_t bf;
-    bf.f64 = (double) b;
+    int64_t _b = recast_i64((double) b);
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(_b, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4122,10 +4109,9 @@ FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
     return vreinterpretq_m128d_f64(
         vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
 #else
-    bit64_union_t bf;
-    bf.f64 = (double) b;
+    int64_t bf = recast_i64((double) b);
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(bf.i64, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(bf, vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4155,14 +4141,13 @@ FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
 FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 {
-    bit64_union_t d;
-    d.f64 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
+    double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(
-        vsetq_lane_f64(d.f64, vreinterpretq_f64_m128d(a), 0));
+        vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
 #else
     return vreinterpretq_m128d_s64(
-        vsetq_lane_s64(d.i64, vreinterpretq_s64_m128d(a), 0));
+        vsetq_lane_s64(recast_i64(d), vreinterpretq_s64_m128d(a), 0));
 #endif
 }
 
@@ -4171,10 +4156,10 @@ FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 {
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    return _mm_set_epi32(0, 0, (int32_t) a1.f64, (int32_t) a0.f64);
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
 }
 
 // Convert packed double-precision (64-bit) floating-point elements in a to
@@ -4182,10 +4167,10 @@ FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
 {
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0.f64, (int32_t) a1.f64};
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
     return vreinterpret_m64_s32(vld1_s32(data));
 }
 
@@ -4202,9 +4187,8 @@ FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
 // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
 {
-    bit64_union_t _a;
-    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    return (int32_t) _a.f64;
+    double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return (int32_t) _a;
 }
 
 // Convert the lower double-precision (64-bit) floating-point element in a to a
@@ -4215,9 +4199,8 @@ FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
 #else
-    bit64_union_t _a;
-    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    return (int64_t) _a.f64;
+    double _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    return (int64_t) _a;
 #endif
 }
 
@@ -4235,14 +4218,13 @@ FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = a0.f64 / b0.f64;
-    c[1] = a1.f64 / b1.f64;
+    c[0] = a0 / b0;
+    c[1] = a1 / b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -4487,14 +4469,13 @@ FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
         vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 > b0.f64 ? a0.u64 : b0.u64;
-    d[1] = a1.f64 > b1.f64 ? a1.u64 : b1.u64;
+    d[0] = a0 > b0 ? recast_i64(a0) : recast_i64(b0);
+    d[1] = a1 > b1 ? recast_i64(a1) : recast_i64(b1);
 
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
@@ -4509,11 +4490,11 @@ FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_max_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    double c[2] = {a0.f64 > b0.f64 ? a0.f64 : b0.f64, a1.f64};
+    double a0, a1, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double c[2] = {a0 > b0 ? a0 : b0, a1};
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
@@ -4551,14 +4532,13 @@ FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
         vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #endif
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     uint64_t d[2];
-    d[0] = a0.f64 < b0.f64 ? a0.u64 : b0.u64;
-    d[1] = a1.f64 < b1.f64 ? a1.u64 : b1.u64;
+    d[0] = a0 < b0 ? recast_i64(a0) : recast_i64(b0);
+    d[1] = a1 < b1 ? recast_i64(a1) : recast_i64(b1);
     return vreinterpretq_m128d_u64(vld1q_u64(d));
 #endif
 }
@@ -4572,11 +4552,11 @@ FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_min_pd(a, b));
 #else
-    bit64_union_t a0, a1, b0;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    double c[2] = {a0.f64 < b0.f64 ? a0.f64 : b0.f64, a1.f64};
+    double a0, a1, b0;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double c[2] = {a0 < b0 ? a0 : b0, a1};
     return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
 #endif
 }
@@ -4731,14 +4711,13 @@ FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = a0.f64 * b0.f64;
-    c[1] = a1.f64 * b1.f64;
+    c[0] = a0 * b0;
+    c[1] = a1 * b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -5028,9 +5007,8 @@ FORCE_INLINE __m128d _mm_set1_pd(double d)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vdupq_n_f64(d));
 #else
-    bit64_union_t _d;
-    _d.f64 = d;
-    return vreinterpretq_m128d_s64(vdupq_n_s64(_d.i64));
+    int64_t _d = recast_i64(d);
+    return vreinterpretq_m128d_s64(vdupq_n_s64(_d));
 #endif
 }
 
@@ -5321,11 +5299,11 @@ FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    double _a0 = sqrt(a0.f64);
-    double _a1 = sqrt(a1.f64);
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double _a0 = sqrt(a0);
+    double _a1 = sqrt(a1);
     return _mm_set_pd(_a1, _a0);
 #endif
 }
@@ -5339,10 +5317,10 @@ FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return _mm_move_sd(a, _mm_sqrt_pd(b));
 #else
-    bit64_union_t _a, _b;
-    _a.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    _b.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    return _mm_set_pd(_a.f64, sqrt(_b.f64));
+    double _a, _b;
+    _a = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    _b = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    return _mm_set_pd(_a, sqrt(_b));
 #endif
 }
 
@@ -5697,14 +5675,13 @@ FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
     double c[2];
-    c[0] = a0.f64 - b0.f64;
-    c[1] = a1.f64 - b1.f64;
+    c[0] = a0 - b0;
+    c[1] = a1 - b1;
     return vld1q_f32((float32_t *) c);
 #endif
 }
@@ -6008,12 +5985,11 @@ FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
-    double c[] = {a0.f64 + a1.f64, b0.f64 + b1.f64};
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 + a1, b0 + b1};
     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
 #endif
 }
@@ -6047,12 +6023,11 @@ FORCE_INLINE __m128d _mm_hsub_pd(__m128d a, __m128d b)
     return vreinterpretq_m128d_f64(
         vsubq_f64(vuzp1q_f64(_a, _b), vuzp2q_f64(_a, _b)));
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
-    double c[] = {a0.f64 - a1.f64, b0.f64 - b1.f64};
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double c[] = {a0 - a1, b0 - b1};
     return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
 #endif
 }
@@ -6848,10 +6823,10 @@ FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    return _mm_set_pd(ceil(a1.f64), ceil(a0.f64));
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(ceil(a1), ceil(a0));
 #endif
 }
 
@@ -7059,13 +7034,12 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
                                    vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
                              : 0;
 #else
-    bit64_union_t a0, a1, b0, b1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    b0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0);
-    b1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1);
-    double d0 = (imm & 0x10) ? a0.f64 * b0.f64 : 0;
-    double d1 = (imm & 0x20) ? a1.f64 * b1.f64 : 0;
+    double a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    double a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    double b0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 0));
+    double b1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(b), 1));
+    double d0 = (imm & 0x10) ? a0 * b0 : 0;
+    double d1 = (imm & 0x20) ? a1 * b1 : 0;
 #endif
     __m128d tmp = _mm_set_pd(d1, d0);
 #endif
@@ -7073,10 +7047,9 @@ FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
 #if defined(__aarch64__) || defined(_M_ARM64)
     double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
 #else
-    bit64_union_t _tmp0, _tmp1;
-    _tmp0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0);
-    _tmp1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1);
-    double sum = _tmp0.f64 + _tmp1.f64;
+    double _tmp0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 0));
+    double _tmp1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(tmp), 1));
+    double sum = _tmp0 + _tmp1;
 #endif
     // Conditionally store the sum
     const __m128d sumMask =
@@ -7166,10 +7139,10 @@ FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
 #if defined(__aarch64__) || defined(_M_ARM64)
     return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
 #else
-    bit64_union_t a0, a1;
-    a0.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0);
-    a1.u64 = vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1);
-    return _mm_set_pd(floor(a1.f64), floor(a0.f64));
+    double a0, a1;
+    a0 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0));
+    a1 = recast_f64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 1));
+    return _mm_set_pd(floor(a1), floor(a0));
 #endif
 }