From 998becf772dc54d7ebba9b0224b8fd8c7fc53798 Mon Sep 17 00:00:00 2001 From: Dalton Messmer Date: Wed, 11 Sep 2024 18:14:26 -0400 Subject: [PATCH 1/7] Add fast fma functions --- include/lmms_math.h | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/include/lmms_math.h b/include/lmms_math.h index 13004b0e612..e5a66e6a014 100644 --- a/include/lmms_math.h +++ b/include/lmms_math.h @@ -96,6 +96,43 @@ static void roundAt(T& value, const T& where, const T& stepSize) } } +//! Takes advantage of fma() function if present in hardware +template +T fastFma(T x, T y, T z); + +//! Takes advantage of fmaf() function if present in hardware +template<> +inline float fastFma(float x, float y, float z) +{ +#ifdef FP_FAST_FMAF + return std::fma(x, y, z); +#else + return x * y + z; +#endif +} + +//! Takes advantage of fma() function if present in hardware +template<> +inline double fastFma(double x, double y, double z) +{ +#ifdef FP_FAST_FMA + return std::fma(x, y, z); +#else + return x * y + z; +#endif +} + +//! Takes advantage of fmal() function if present in hardware +template<> +inline long double fastFma(long double x, long double y, long double z) +{ +#ifdef FP_FAST_FMAL + return std::fma(x, y, z); +#else + return x * y + z; +#endif +} + //! returns 1.0f if val >= 0.0f, -1.0 else inline float sign(float val) From ad363cfa46906142026f7651e4ad954712229ff5 Mon Sep 17 00:00:00 2001 From: Dalton Messmer Date: Wed, 11 Sep 2024 20:29:02 -0400 Subject: [PATCH 2/7] Use fast fma functions --- include/interpolation.h | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/include/interpolation.h b/include/interpolation.h index 4b2d3bca9b8..683b1f000e4 100644 --- a/include/interpolation.h +++ b/include/interpolation.h @@ -71,11 +71,11 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x ) { float frsq = x*x; float frcu = frsq*v0; - float t1 = std::fma(v1, 3, v3); + float t1 = fastFma(v1, 3, v3); - return (v1 + std::fma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) - - std::fma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 * - (1.0f / 6.0f) - 0.5f * v2) + frsq * std::fma(0.5f, v2, -v1)); + return (v1 + fastFma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) - + fastFma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 * + (1.0f / 6.0f) - 0.5f * v2) + frsq * fastFma(0.5f, v2, -v1)); } @@ -83,13 +83,13 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x ) inline float cosinusInterpolate( float v0, float v1, float x ) { const float f = ( 1.0f - cosf( x * F_PI ) ) * 0.5f; - return std::fma(f, v1 - v0, v0); + return fastFma(f, v1 - v0, v0); } inline float linearInterpolate( float v0, float v1, float x ) { - return std::fma(x, v1 - v0, v0); + return fastFma(x, v1 - v0, v0); } @@ -104,7 +104,7 @@ inline float optimalInterpolate( float v0, float v1, float x ) const float c2 = even * -0.004541102062639801; const float c3 = odd * -1.57015627178718420; - return std::fma(std::fma(std::fma(c3, z, c2), z, c1), z, c0); + return fastFma(fastFma(fastFma(c3, z, c2), z, c1), z, c0); } @@ -121,7 +121,7 @@ inline float optimal4pInterpolate( float v0, float v1, float v2, float v3, float const float c2 = even1 * -0.246185007019907091 + even2 * 0.24614027139700284; const float c3 = odd1 * -0.36030925263849456 + odd2 * 0.10174985775982505; - return std::fma(std::fma(std::fma(c3, z, c2), z, c1), z, c0); + return fastFma(fastFma(fastFma(c3, z, c2), z, c1), z, c0); } @@ -132,7 +132,7 @@ inline float lagrangeInterpolate( float v0, float v1, float v2, float v3, float const float c1 = v2 - v0 * ( 1.0f / 3.0f ) - v1 * 0.5f - v3 * ( 1.0f / 6.0f ); const float c2 = 0.5f * (v0 + v2) - v1; const float c3 = ( 1.0f/6.0f ) * ( v3 - v0 ) + 0.5f * ( v1 - v2 ); - return std::fma(std::fma(std::fma(c3, x, c2), x, c1), x, c0); + return fastFma(fastFma(fastFma(c3, x, c2), x, c1), x, c0); } From 652b25e5c1d1d6f7e299333b7b9822a470decfdb Mon Sep 17 00:00:00 2001 From: Dalton Messmer Date: Wed, 11 Sep 2024 20:29:33 -0400 Subject: [PATCH 3/7] Add fast pow function --- include/lmms_math.h | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/include/lmms_math.h b/include/lmms_math.h index e5a66e6a014..fc0bb60fabe 100644 --- a/include/lmms_math.h +++ b/include/lmms_math.h @@ -96,11 +96,11 @@ static void roundAt(T& value, const T& where, const T& stepSize) } } -//! Takes advantage of fma() function if present in hardware +//! Uses `x * y + z` or `std::fma` - whichever is faster template -T fastFma(T x, T y, T z); +inline T fastFma(T x, T y, T z); -//! Takes advantage of fmaf() function if present in hardware +//! Uses `x * y + z` or `std::fma` - whichever is faster template<> inline float fastFma(float x, float y, float z) { @@ -108,10 +108,10 @@ inline float fastFma(float x, float y, float z) return std::fma(x, y, z); #else return x * y + z; -#endif +#endif // FP_FAST_FMAF } -//! Takes advantage of fma() function if present in hardware +//! Uses `x * y + z` or `std::fma` - whichever is faster template<> inline double fastFma(double x, double y, double z) { @@ -119,10 +119,10 @@ inline double fastFma(double x, double y, double z) return std::fma(x, y, z); #else return x * y + z; -#endif +#endif // FP_FAST_FMA } -//! Takes advantage of fmal() function if present in hardware +//! Uses `x * y + z` or `std::fma` - whichever is faster template<> inline long double fastFma(long double x, long double y, long double z) { @@ -130,7 +130,21 @@ inline long double fastFma(long double x, long double y, long double z) return std::fma(x, y, z); #else return x * y + z; -#endif +#endif // FP_FAST_FMAL +} + + +//! Source: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/ +inline double fastPow(double a, double b) +{ + union + { + double d; + std::int32_t x[2]; + } u = { a }; + u.x[1] = static_cast(b * (u.x[1] - 1072632447) + 1072632447); + u.x[0] = 0; + return u.d; } From afb645de7f909378049799c4c690b1d4c0b7c009 Mon Sep 17 00:00:00 2001 From: Dalton Messmer Date: Wed, 11 Sep 2024 20:30:17 -0400 Subject: [PATCH 4/7] Use fast pow function --- plugins/Kicker/KickerOsc.h | 4 ++-- plugins/Monstro/Monstro.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/Kicker/KickerOsc.h b/plugins/Kicker/KickerOsc.h index a11b0fc5521..adcceaefe45 100644 --- a/plugins/Kicker/KickerOsc.h +++ b/plugins/Kicker/KickerOsc.h @@ -64,7 +64,7 @@ class KickerOsc { for( fpp_t frame = 0; frame < frames; ++frame ) { - const double gain = 1 - std::pow((m_counter < m_length) ? m_counter / m_length : 1, m_env); + const double gain = 1 - fastPow((m_counter < m_length) ? m_counter / m_length : 1, m_env); const sample_t s = ( Oscillator::sinSample( m_phase ) * ( 1 - m_noise ) ) + ( Oscillator::noiseSample( 0 ) * gain * gain * m_noise ); buf[frame][0] = s * gain; buf[frame][1] = s * gain; @@ -80,7 +80,7 @@ class KickerOsc m_FX.nextSample( buf[frame][0], buf[frame][1] ); m_phase += m_freq / sampleRate; - const double change = (m_counter < m_length) ? ((m_startFreq - m_endFreq) * (1 - std::pow(m_counter / m_length, m_slope))) : 0; + const double change = (m_counter < m_length) ? ((m_startFreq - m_endFreq) * (1 - fastPow(m_counter / m_length, m_slope))) : 0; m_freq = m_endFreq + change; ++m_counter; } diff --git a/plugins/Monstro/Monstro.cpp b/plugins/Monstro/Monstro.cpp index d53fcc9137d..ff92abbb647 100644 --- a/plugins/Monstro/Monstro.cpp +++ b/plugins/Monstro/Monstro.cpp @@ -858,7 +858,7 @@ inline sample_t MonstroSynth::calcSlope( int slope, sample_t s ) { if( m_parent->m_slope[slope] == 1.0f ) return s; if( s == 0.0f ) return s; - return std::pow(s, m_parent->m_slope[slope]); + return fastPow(s, m_parent->m_slope[slope]); } From 47c829589b2728abfcfd7b8babe019d0a6bdbd7e Mon Sep 17 00:00:00 2001 From: Dalton Messmer Date: Wed, 11 Sep 2024 21:02:22 -0400 Subject: [PATCH 5/7] Fix build --- include/interpolation.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/interpolation.h b/include/interpolation.h index 683b1f000e4..85b042adaa1 100644 --- a/include/interpolation.h +++ b/include/interpolation.h @@ -69,9 +69,9 @@ inline float hermiteInterpolate( float x0, float x1, float x2, float x3, inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x ) { - float frsq = x*x; - float frcu = frsq*v0; - float t1 = fastFma(v1, 3, v3); + float frsq = x * x; + float frcu = frsq * v0; + float t1 = fastFma(v1, 3.f, v3); return (v1 + fastFma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) - fastFma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 * From e82932a553c02fb634a2ef5e349e45c36b480050 Mon Sep 17 00:00:00 2001 From: Dalton Messmer Date: Tue, 24 Sep 2024 22:36:28 -0400 Subject: [PATCH 6/7] Remove fastFma --- include/interpolation.h | 18 +++++++++--------- include/lmms_math.h | 38 -------------------------------------- 2 files changed, 9 insertions(+), 47 deletions(-) diff --git a/include/interpolation.h b/include/interpolation.h index 85b042adaa1..0b972bc279e 100644 --- a/include/interpolation.h +++ b/include/interpolation.h @@ -71,11 +71,11 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x ) { float frsq = x * x; float frcu = frsq * v0; - float t1 = fastFma(v1, 3.f, v3); + float t1 = v1 * 3.f + v3; - return (v1 + fastFma(0.5f, frcu, x) * (v2 - frcu * (1.0f / 6.0f) - - fastFma(t1, (1.0f / 6.0f), -v0) * (1.0f / 3.0f)) + frsq * x * (t1 * - (1.0f / 6.0f) - 0.5f * v2) + frsq * fastFma(0.5f, v2, -v1)); + return v1 + (0.5f * frcu + x) * (v2 - frcu * (1.0f / 6.0f) - + (t1 * (1.0f / 6.0f) - v0) * (1.0f / 3.0f)) + frsq * x * (t1 * + (1.0f / 6.0f) - 0.5f * v2) + frsq * (0.5f * v2 - v1); } @@ -83,13 +83,13 @@ inline float cubicInterpolate( float v0, float v1, float v2, float v3, float x ) inline float cosinusInterpolate( float v0, float v1, float x ) { const float f = ( 1.0f - cosf( x * F_PI ) ) * 0.5f; - return fastFma(f, v1 - v0, v0); + return f * (v1 - v0) + v0; } inline float linearInterpolate( float v0, float v1, float x ) { - return fastFma(x, v1 - v0, v0); + return x * (v1 - v0) + v0; } @@ -104,7 +104,7 @@ inline float optimalInterpolate( float v0, float v1, float x ) const float c2 = even * -0.004541102062639801; const float c3 = odd * -1.57015627178718420; - return fastFma(fastFma(fastFma(c3, z, c2), z, c1), z, c0); + return ((c3 * z + c2) * z + c1) * z + c0; } @@ -121,7 +121,7 @@ inline float optimal4pInterpolate( float v0, float v1, float v2, float v3, float const float c2 = even1 * -0.246185007019907091 + even2 * 0.24614027139700284; const float c3 = odd1 * -0.36030925263849456 + odd2 * 0.10174985775982505; - return fastFma(fastFma(fastFma(c3, z, c2), z, c1), z, c0); + return ((c3 * z + c2) * z + c1) * z + c0; } @@ -132,7 +132,7 @@ inline float lagrangeInterpolate( float v0, float v1, float v2, float v3, float const float c1 = v2 - v0 * ( 1.0f / 3.0f ) - v1 * 0.5f - v3 * ( 1.0f / 6.0f ); const float c2 = 0.5f * (v0 + v2) - v1; const float c3 = ( 1.0f/6.0f ) * ( v3 - v0 ) + 0.5f * ( v1 - v2 ); - return fastFma(fastFma(fastFma(c3, x, c2), x, c1), x, c0); + return ((c3 * x + c2) * x + c1) * x + c0; } diff --git a/include/lmms_math.h b/include/lmms_math.h index fc0bb60fabe..642336e65eb 100644 --- a/include/lmms_math.h +++ b/include/lmms_math.h @@ -96,44 +96,6 @@ static void roundAt(T& value, const T& where, const T& stepSize) } } -//! Uses `x * y + z` or `std::fma` - whichever is faster -template -inline T fastFma(T x, T y, T z); - -//! Uses `x * y + z` or `std::fma` - whichever is faster -template<> -inline float fastFma(float x, float y, float z) -{ -#ifdef FP_FAST_FMAF - return std::fma(x, y, z); -#else - return x * y + z; -#endif // FP_FAST_FMAF -} - -//! Uses `x * y + z` or `std::fma` - whichever is faster -template<> -inline double fastFma(double x, double y, double z) -{ -#ifdef FP_FAST_FMA - return std::fma(x, y, z); -#else - return x * y + z; -#endif // FP_FAST_FMA -} - -//! Uses `x * y + z` or `std::fma` - whichever is faster -template<> -inline long double fastFma(long double x, long double y, long double z) -{ -#ifdef FP_FAST_FMAL - return std::fma(x, y, z); -#else - return x * y + z; -#endif // FP_FAST_FMAL -} - - //! Source: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/ inline double fastPow(double a, double b) { From a61a6ffaa93199627a2451d2b13d014cc6188a7b Mon Sep 17 00:00:00 2001 From: Dalton Messmer Date: Tue, 24 Sep 2024 23:49:57 -0400 Subject: [PATCH 7/7] Avoid UB in fastPow On GCC with -O1 or -O2 optimizations, this new implementation generates identical assembly to the old union-based implementation --- include/lmms_math.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/include/lmms_math.h b/include/lmms_math.h index 642336e65eb..72800838818 100644 --- a/include/lmms_math.h +++ b/include/lmms_math.h @@ -27,12 +27,13 @@ #include #include +#include #include #include +#include #include "lmms_constants.h" #include "lmmsconfig.h" -#include namespace lmms { @@ -99,14 +100,15 @@ static void roundAt(T& value, const T& where, const T& stepSize) //! Source: http://martin.ankerl.com/2007/10/04/optimized-pow-approximation-for-java-and-c-c/ inline double fastPow(double a, double b) { - union - { - double d; - std::int32_t x[2]; - } u = { a }; - u.x[1] = static_cast(b * (u.x[1] - 1072632447) + 1072632447); - u.x[0] = 0; - return u.d; + double d; + std::int32_t x[2]; + + std::memcpy(x, &a, sizeof(x)); + x[1] = static_cast(b * (x[1] - 1072632447) + 1072632447); + x[0] = 0; + + std::memcpy(&d, x, sizeof(d)); + return d; }