From 998becf772dc54d7ebba9b0224b8fd8c7fc53798 Mon Sep 17 00:00:00 2001
From: Dalton Messmer <messmer.dalton@gmail.com>
Date: Wed, 11 Sep 2024 18:14:26 -0400
Subject: [PATCH 1/7] Add fast fma functions

---
 include/lmms_math.h | 37 +++++++++++++++++++++++++++++++++++++
 1 file changed, 37 insertions(+)
diff --git a/include/lmms_math.h b/include/lmms_math.h
index 13004b0e612..e5a66e6a014 100644
--- a/include/lmms_math.h
+++ b/include/lmms_math.h
@@ -96,6 +96,43 @@ static void roundAt(T& value, const T& where, const T& stepSize)
 	}
 }
 
+//! Takes advantage of fma() function if present in hardware
+template<typename T>
+T fastFma(T x, T y, T z);
+
+//! Takes advantage of fmaf() function if present in hardware
+template<>
+inline float fastFma(float x, float y, float z)
+{
+#ifdef FP_FAST_FMAF
+	return std::fma(x, y, z);
+#else
+	return x * y + z;
+#endif
+}
+
+//! Takes advantage of fma() function if present in hardware
+template<>
+inline double fastFma(double x, double y, double z)
+{
+#ifdef FP_FAST_FMA
+	return std::fma(x, y, z);
+#else
+	return x * y + z;
+#endif
+}
+
+//! Takes advantage of fmal() function if present in hardware
+template<>
+inline long double fastFma(long double x, long double y, long double z)
+{
+#ifdef FP_FAST_FMAL
+	return std::fma(x, y, z);
+#else
+	return x * y + z;
+#endif
+}
+
 
 //! returns 1.0f if val >= 0.0f, -1.0 else
 inline float sign(float val) 

From ad363cfa46906142026f7651e4ad954712229ff5 Mon Sep 17 00:00:00 2001
From: Dalton Messmer <messmer.dalton@gmail.com>
Date: Wed, 11 Sep 2024 20:29:02 -0400
Subject: [PATCH 2/7] Use fast fma functions

---
 include/interpolation.h | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/include/interpolation.h b/include/interpolation.h
index 4b2d3bca9b8..683b1f000e4 100644
--- a/include/interpolation.h
+++ b/include/interpolation.h
@@ -71,11 +71,11 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x )
 {
 	float frsq = x*x;
 	float frcu = frsq*v0;
-	float t1 = std::fma(v1, 3, v3);
+	float t1 = fastFma(v1, 3, v3);
 
-	return (v1 + std::fma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) -
-		std::fma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 *
-		(1.0f / 6.0f) - 0.5f * v2) + frsq * std::fma(0.5f, v2, -v1));
+	return (v1 + fastFma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) -
+		fastFma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 *
+		(1.0f / 6.0f) - 0.5f * v2) + frsq * fastFma(0.5f, v2, -v1));
 }
 
 
@@ -83,13 +83,13 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x )
 inline float cosinusInterpolate( float v0, float v1, float x )
 {
 	const float f = ( 1.0f - cosf( x * F_PI ) ) * 0.5f;
-	return std::fma(f, v1 - v0, v0);
+	return fastFma(f, v1 - v0, v0);
 }
 
 
 inline float linearInterpolate( float v0, float v1, float x )
 {
-	return std::fma(x, v1 - v0, v0);
+	return fastFma(x, v1 - v0, v0);
 }
 
 
@@ -104,7 +104,7 @@ inline float optimalInterpolate( float v0, float v1, float x )
 	const float c2 = even * -0.004541102062639801;
 	const float c3 = odd * -1.57015627178718420;
 	
-	return std::fma(std::fma(std::fma(c3, z, c2), z, c1), z, c0);
+	return fastFma(fastFma(fastFma(c3, z, c2), z, c1), z, c0);
 }
 
 
@@ -121,7 +121,7 @@ inline float optimal4pInterpolate( float v0, float v1, float v2, float v3, float
 	const float c2 = even1 * -0.246185007019907091 + even2 * 0.24614027139700284;
 	const float c3 = odd1 * -0.36030925263849456 + odd2 * 0.10174985775982505;
 
-	return std::fma(std::fma(std::fma(c3, z, c2), z, c1), z, c0);
+	return fastFma(fastFma(fastFma(c3, z, c2), z, c1), z, c0);
 }
 
 
@@ -132,7 +132,7 @@ inline float lagrangeInterpolate( float v0, float v1, float v2, float v3, float
 	const float c1 = v2 - v0 * ( 1.0f / 3.0f ) - v1 * 0.5f - v3 * ( 1.0f / 6.0f );
 	const float c2 = 0.5f * (v0 + v2) - v1;
 	const float c3 = ( 1.0f/6.0f ) * ( v3 - v0 ) + 0.5f * ( v1 - v2 );
-	return std::fma(std::fma(std::fma(c3, x, c2), x, c1), x, c0);
+	return fastFma(fastFma(fastFma(c3, x, c2), x, c1), x, c0);
 }
 
 

From 652b25e5c1d1d6f7e299333b7b9822a470decfdb Mon Sep 17 00:00:00 2001
From: Dalton Messmer <messmer.dalton@gmail.com>
Date: Wed, 11 Sep 2024 20:29:33 -0400
Subject: [PATCH 3/7] Add fast pow function

---
 include/lmms_math.h | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/include/lmms_math.h b/include/lmms_math.h
index e5a66e6a014..fc0bb60fabe 100644
--- a/include/lmms_math.h
+++ b/include/lmms_math.h
@@ -96,11 +96,11 @@ static void roundAt(T& value, const T& where, const T& stepSize)
 	}
 }
 
-//! Takes advantage of fma() function if present in hardware
+//! Uses `x * y + z` or `std::fma` - whichever is faster
 template<typename T>
-T fastFma(T x, T y, T z);
+inline T fastFma(T x, T y, T z);
 
-//! Takes advantage of fmaf() function if present in hardware
+//! Uses `x * y + z` or `std::fma` - whichever is faster
 template<>
 inline float fastFma(float x, float y, float z)
 {
@@ -108,10 +108,10 @@ inline float fastFma(float x, float y, float z)
 	return std::fma(x, y, z);
 #else
 	return x * y + z;
-#endif
+#endif // FP_FAST_FMAF
 }
 
-//! Takes advantage of fma() function if present in hardware
+//! Uses `x * y + z` or `std::fma` - whichever is faster
 template<>
 inline double fastFma(double x, double y, double z)
 {
@@ -119,10 +119,10 @@ inline double fastFma(double x, double y, double z)
 	return std::fma(x, y, z);
 #else
 	return x * y + z;
-#endif
+#endif // FP_FAST_FMA
 }
 
-//! Takes advantage of fmal() function if present in hardware
+//! Uses `x * y + z` or `std::fma` - whichever is faster
 template<>
 inline long double fastFma(long double x, long double y, long double z)
 {
@@ -130,7 +130,21 @@ inline long double fastFma(long double x, long double y, long double z)
 	return std::fma(x, y, z);
 #else
 	return x * y + z;
-#endif
+#endif // FP_FAST_FMAL
+}
+
+
+//! Source: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/
+inline double fastPow(double a, double b)
+{
+	union
+	{
+		double d;
+		std::int32_t x[2];
+	} u = { a };
+	u.x[1] = static_cast<std::int32_t>(b * (u.x[1] - 1072632447) + 1072632447);
+	u.x[0] = 0;
+	return u.d;
 }
 
 

From afb645de7f909378049799c4c690b1d4c0b7c009 Mon Sep 17 00:00:00 2001
From: Dalton Messmer <messmer.dalton@gmail.com>
Date: Wed, 11 Sep 2024 20:30:17 -0400
Subject: [PATCH 4/7] Use fast pow function

---
 plugins/Kicker/KickerOsc.h  | 4 ++--
 plugins/Monstro/Monstro.cpp | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/plugins/Kicker/KickerOsc.h b/plugins/Kicker/KickerOsc.h
index a11b0fc5521..adcceaefe45 100644
--- a/plugins/Kicker/KickerOsc.h
+++ b/plugins/Kicker/KickerOsc.h
@@ -64,7 +64,7 @@ class KickerOsc
 	{
 		for( fpp_t frame = 0; frame < frames; ++frame )
 		{
-			const double gain = 1 - std::pow((m_counter < m_length) ? m_counter / m_length : 1, m_env);
+			const double gain = 1 - fastPow((m_counter < m_length) ? m_counter / m_length : 1, m_env);
 			const sample_t s = ( Oscillator::sinSample( m_phase ) * ( 1 - m_noise ) ) + ( Oscillator::noiseSample( 0 ) * gain * gain * m_noise );
 			buf[frame][0] = s * gain;
 			buf[frame][1] = s * gain;
@@ -80,7 +80,7 @@ class KickerOsc
 			m_FX.nextSample( buf[frame][0], buf[frame][1] );
 			m_phase += m_freq / sampleRate;
 
-			const double change = (m_counter < m_length) ? ((m_startFreq - m_endFreq) * (1 - std::pow(m_counter / m_length, m_slope))) : 0;
+			const double change = (m_counter < m_length) ? ((m_startFreq - m_endFreq) * (1 - fastPow(m_counter / m_length, m_slope))) : 0;
 			m_freq = m_endFreq + change;
 			++m_counter;
 		}
diff --git a/plugins/Monstro/Monstro.cpp b/plugins/Monstro/Monstro.cpp
index d53fcc9137d..ff92abbb647 100644
--- a/plugins/Monstro/Monstro.cpp
+++ b/plugins/Monstro/Monstro.cpp
@@ -858,7 +858,7 @@ inline sample_t MonstroSynth::calcSlope( int slope, sample_t s )
 {
 	if( m_parent->m_slope[slope] == 1.0f ) return s;
 	if( s == 0.0f ) return s;
-	return std::pow(s, m_parent->m_slope[slope]);
+	return fastPow(s, m_parent->m_slope[slope]);
 }
 
 

From 47c829589b2728abfcfd7b8babe019d0a6bdbd7e Mon Sep 17 00:00:00 2001
From: Dalton Messmer <messmer.dalton@gmail.com>
Date: Wed, 11 Sep 2024 21:02:22 -0400
Subject: [PATCH 5/7] Fix build

---
 include/interpolation.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/interpolation.h b/include/interpolation.h
index 683b1f000e4..85b042adaa1 100644
--- a/include/interpolation.h
+++ b/include/interpolation.h
@@ -69,9 +69,9 @@ inline float hermiteInterpolate( float x0, float x1, float x2, float x3,
 
 inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x )
 {
-	float frsq = x*x;
-	float frcu = frsq*v0;
-	float t1 = fastFma(v1, 3, v3);
+	float frsq = x * x;
+	float frcu = frsq * v0;
+	float t1 = fastFma(v1, 3.f, v3);
 
 	return (v1 + fastFma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) -
 		fastFma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 *

From e82932a553c02fb634a2ef5e349e45c36b480050 Mon Sep 17 00:00:00 2001
From: Dalton Messmer <messmer.dalton@gmail.com>
Date: Tue, 24 Sep 2024 22:36:28 -0400
Subject: [PATCH 6/7] Remove fastFma

---
 include/interpolation.h | 18 +++++++++---------
 include/lmms_math.h     | 38 --------------------------------------
 2 files changed, 9 insertions(+), 47 deletions(-)

diff --git a/include/interpolation.h b/include/interpolation.h
index 85b042adaa1..0b972bc279e 100644
--- a/include/interpolation.h
+++ b/include/interpolation.h
@@ -71,11 +71,11 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x )
 {
 	float frsq = x * x;
 	float frcu = frsq * v0;
-	float t1 = fastFma(v1, 3.f, v3);
+	float t1 = v1 * 3.f + v3;
 
-	return (v1 + fastFma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) -
-		fastFma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 *
-		(1.0f / 6.0f) - 0.5f * v2) + frsq * fastFma(0.5f, v2, -v1));
+	return v1 + (0.5f * frcu + x) * (v2 - frcu * (1.0f / 6.0f) -
+		(t1 * (1.0f / 6.0f) - v0) * (1.0f / 3.0f)) + frsq * x * (t1 *
+		(1.0f / 6.0f) - 0.5f * v2) + frsq * (0.5f * v2 - v1);
 }
 
 
@@ -83,13 +83,13 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x )
 inline float cosinusInterpolate( float v0, float v1, float x )
 {
 	const float f = ( 1.0f - cosf( x * F_PI ) ) * 0.5f;
-	return fastFma(f, v1 - v0, v0);
+	return f * (v1 - v0) + v0;
 }
 
 
 inline float linearInterpolate( float v0, float v1, float x )
 {
-	return fastFma(x, v1 - v0, v0);
+	return x * (v1 - v0) + v0;
 }
 
 
@@ -104,7 +104,7 @@ inline float optimalInterpolate( float v0, float v1, float x )
 	const float c2 = even * -0.004541102062639801;
 	const float c3 = odd * -1.57015627178718420;
 	
-	return fastFma(fastFma(fastFma(c3, z, c2), z, c1), z, c0);
+	return ((c3 * z + c2) * z + c1) * z + c0;
 }
 
 
@@ -121,7 +121,7 @@ inline float optimal4pInterpolate( float v0, float v1, float v2, float v3, float
 	const float c2 = even1 * -0.246185007019907091 + even2 * 0.24614027139700284;
 	const float c3 = odd1 * -0.36030925263849456 + odd2 * 0.10174985775982505;
 
-	return fastFma(fastFma(fastFma(c3, z, c2), z, c1), z, c0);
+	return ((c3 * z + c2) * z + c1) * z + c0;
 }
 
 
@@ -132,7 +132,7 @@ inline float lagrangeInterpolate( float v0, float v1, float v2, float v3, float
 	const float c1 = v2 - v0 * ( 1.0f / 3.0f ) - v1 * 0.5f - v3 * ( 1.0f / 6.0f );
 	const float c2 = 0.5f * (v0 + v2) - v1;
 	const float c3 = ( 1.0f/6.0f ) * ( v3 - v0 ) + 0.5f * ( v1 - v2 );
-	return fastFma(fastFma(fastFma(c3, x, c2), x, c1), x, c0);
+	return ((c3 * x + c2) * x + c1) * x + c0;
 }
 
 
diff --git a/include/lmms_math.h b/include/lmms_math.h
index fc0bb60fabe..642336e65eb 100644
--- a/include/lmms_math.h
+++ b/include/lmms_math.h
@@ -96,44 +96,6 @@ static void roundAt(T& value, const T& where, const T& stepSize)
 	}
 }
 
-//! Uses `x * y + z` or `std::fma` - whichever is faster
-template<typename T>
-inline T fastFma(T x, T y, T z);
-
-//! Uses `x * y + z` or `std::fma` - whichever is faster
-template<>
-inline float fastFma(float x, float y, float z)
-{
-#ifdef FP_FAST_FMAF
-	return std::fma(x, y, z);
-#else
-	return x * y + z;
-#endif // FP_FAST_FMAF
-}
-
-//! Uses `x * y + z` or `std::fma` - whichever is faster
-template<>
-inline double fastFma(double x, double y, double z)
-{
-#ifdef FP_FAST_FMA
-	return std::fma(x, y, z);
-#else
-	return x * y + z;
-#endif // FP_FAST_FMA
-}
-
-//! Uses `x * y + z` or `std::fma` - whichever is faster
-template<>
-inline long double fastFma(long double x, long double y, long double z)
-{
-#ifdef FP_FAST_FMAL
-	return std::fma(x, y, z);
-#else
-	return x * y + z;
-#endif // FP_FAST_FMAL
-}
-
-
 //! Source: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/
 inline double fastPow(double a, double b)
 {

From a61a6ffaa93199627a2451d2b13d014cc6188a7b Mon Sep 17 00:00:00 2001
From: Dalton Messmer <messmer.dalton@gmail.com>
Date: Tue, 24 Sep 2024 23:49:57 -0400
Subject: [PATCH 7/7] Avoid UB in fastPow

On GCC with -O1 or -O2 optimizations, this new implementation generates
identical assembly to the old union-based implementation
---
 include/lmms_math.h | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/include/lmms_math.h b/include/lmms_math.h
index 642336e65eb..72800838818 100644
--- a/include/lmms_math.h
+++ b/include/lmms_math.h
@@ -27,12 +27,13 @@
 
 #include <QtGlobal>
 #include <algorithm>
+#include <cassert>
 #include <cmath>
 #include <cstdint>
+#include <cstring>
 
 #include "lmms_constants.h"
 #include "lmmsconfig.h"
-#include <cassert>
 
 namespace lmms
 {
@@ -99,14 +100,15 @@ static void roundAt(T& value, const T& where, const T& stepSize)
 //! Source: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/
 inline double fastPow(double a, double b)
 {
-	union
-	{
-		double d;
-		std::int32_t x[2];
-	} u = { a };
-	u.x[1] = static_cast<std::int32_t>(b * (u.x[1] - 1072632447) + 1072632447);
-	u.x[0] = 0;
-	return u.d;
+	double d;
+	std::int32_t x[2];
+
+	std::memcpy(x, &a, sizeof(x));
+	x[1] = static_cast<std::int32_t>(b * (x[1] - 1072632447) + 1072632447);
+	x[0] = 0;
+
+	std::memcpy(&d, x, sizeof(d));
+	return d;
 }