From fec24a531433ea828f0fc8412d4699b8a96aa31b Mon Sep 17 00:00:00 2001 From: Fraser Cormack Date: Tue, 10 Sep 2024 11:33:10 +0100 Subject: [PATCH] Remove mixed use of tabs and spaces Tabs can been replaced with spaces as spaces were predominant. Formatting was done with clang-format on functions containing tabs - rather than the whole file - to try and reduce the diff. --- oclmath/reference_math.cpp | 4073 ++++++++++++++++++------------------ 1 file changed, 2033 insertions(+), 2040 deletions(-) diff --git a/oclmath/reference_math.cpp b/oclmath/reference_math.cpp index 3d4fbe4e5..7769eb0b5 100644 --- a/oclmath/reference_math.cpp +++ b/oclmath/reference_math.cpp @@ -7,13 +7,19 @@ ******************************************************************/ #if defined( _MSC_VER ) -# pragma warning( push ) -# pragma warning( disable : 4056 ) /* overflow in floating-point constant arithmetic */ -# pragma warning( disable : 4101 ) /* unreferenced local variable */ -# pragma warning( disable : 4146 ) /* unary minus operator applied to unsigned type, result still unsigned */ -# pragma warning( disable : 4244 ) /* conversion from 'double' to 'float', possible loss of data */ -# pragma warning( disable : 4723 ) /* potential divide by 0 */ -# pragma warning( disable : 4756 ) /* overflow in constant arithmetic */ +#pragma warning(push) +// overflow in floating-point constant arithmetic +#pragma warning(disable : 4056) +// unreferenced local variable +#pragma warning(disable : 4101) +// unary minus operator applied to unsigned type, result still unsigned +#pragma warning(disable : 4146) +// conversion from 'double' to 'float', possible loss of data +#pragma warning(disable : 4244) +// potential divide by 0 +#pragma warning(disable : 4723) +// overflow in constant arithmetic +#pragma warning(disable : 4756) #endif // @@ -53,7 +59,7 @@ #define tanhl(X) tanh(X) #define coshl(X) cosh(X) #define sinhl(X) sinh(X) - + float modff_android( float value, float *iptr ) { *iptr = truncf( value ); @@ -126,8 +132,8 @@ static void __log2_ep(double *hi, double *lo, double x); typedef union { - uint64_t i; - double d; + uint64_t i; + double d; }uint64d_t; static const uint64d_t _CL_NAN = { 0x7ff8000000000000ULL }; @@ -882,41 +888,35 @@ double reference_powr( double x, double y ) //powr ( +-0, y ) is +inf for finite y < 0. if( y < 0.0 ) return INFINITY; - - //powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above) + + // powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above) return 0.0; } - - // x = +inf - if( isinf(x) ) - { - if( y < 0 ) - return 0; - return INFINITY; - } - - double fabsx = reference_fabs(x); - double fabsy = reference_fabs(y); - - //y = +-inf cases - if( isinf(fabsy) ) - { - if( y < 0 ) - { - if( fabsx < 1 ) - return INFINITY; - return 0; - } - if( fabsx < 1 ) - return 0; - return INFINITY; - } - - double hi, lo; - __log2_ep(&hi, &lo, x); - double prod = y * hi; - double result = reference_exp2(prod); - + + // x = +inf + if (isinf(x)) { + if (y < 0) return 0; + return INFINITY; + } + + double fabsx = reference_fabs(x); + double fabsy = reference_fabs(y); + + // y = +-inf cases + if (isinf(fabsy)) { + if (y < 0) { + if (fabsx < 1) return INFINITY; + return 0; + } + if (fabsx < 1) return 0; + return INFINITY; + } + + double hi, lo; + __log2_ep(&hi, &lo, x); + double prod = y * hi; + double result = reference_exp2(prod); + return result; } @@ -951,25 +951,34 @@ double reference_add( double x, double y ) __m128 va = _mm_set_ss( (float) a ); __m128 vb = _mm_set_ss( (float) b ); va = _mm_add_ss( va, vb ); - _mm_store_ss( (float*) &a, va ); + _mm_store_ss((float*)&a, va); #elif defined(__PPC__) - // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero. - // As such, the reference add with FTZ must be emulated in sw. + // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes + // denorm's to zero. As such, the reference add with FTZ must be emulated in + // sw. if (fpu_control & _FPU_MASK_NI) { - union{ cl_uint u; cl_float d; } ua; ua.d = a; - union{ cl_uint u; cl_float d; } ub; ub.d = b; + union { + cl_uint u; + cl_float d; + } ua; + ua.d = a; + union { + cl_uint u; + cl_float d; + } ub; + ub.d = b; cl_uint mantA, mantB; cl_ulong addendA, addendB, sum; - int expA = extractf( a, &mantA ); - int expB = extractf( b, &mantB ); + int expA = extractf(a, &mantA); + int expB = extractf(b, &mantB); cl_uint signA = ua.u & 0x80000000U; cl_uint signB = ub.u & 0x80000000U; // Force matching exponents if an operand is 0 if (a == 0.0f) { - expA = expB; + expA = expB; } else if (b == 0.0f) { - expB = expA; + expB = expA; } addendA = (cl_ulong)mantA << 32; @@ -977,23 +986,22 @@ double reference_add( double x, double y ) if (expA >= expB) { // Shift B relative to the A so that their exponents match - if( expA > expB ) - shift_right_sticky_64( &addendB, expA - expB ); + if (expA > expB) shift_right_sticky_64(&addendB, expA - expB); // add - if( signA ^ signB ) - sub64( &addendA, addendB, &signA, &expA ); + if (signA ^ signB) + sub64(&addendA, addendB, &signA, &expA); else - add64( &addendA, addendB, &expA ); - } else { + add64(&addendA, addendB, &expA); + } else { // Shift the A relative to B so that their exponents match - shift_right_sticky_64( &addendA, expB - expA ); + shift_right_sticky_64(&addendA, expB - expA); // add - if( signA ^ signB ) - sub64( &addendB, addendA, &signB, &expB ); + if (signA ^ signB) + sub64(&addendB, addendA, &signB, &expB); else - add64( &addendB, addendA, &expB ); + add64(&addendB, addendA, &expB); addendA = addendB; expA = expB; @@ -1001,10 +1009,10 @@ double reference_add( double x, double y ) } // round to IEEE result - if (gIsInRTZMode) { - ua.d = round_toward_zero_float_ftz( addendA, expA ); + if (gIsInRTZMode) { + ua.d = round_toward_zero_float_ftz(addendA, expA); } else { - ua.d = round_to_nearest_even_float_ftz( addendA, expA ); + ua.d = round_to_nearest_even_float_ftz(addendA, expA); } // Set the sign ua.u |= signA; @@ -1015,74 +1023,86 @@ double reference_add( double x, double y ) #else a += b; #endif - return (double) a; - } - - -double reference_subtract( double x, double y ) -{ - volatile float a = (float) x; - volatile float b = (float) y; -#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64))) - // defeat x87 - __m128 va = _mm_set_ss( (float) a ); - __m128 vb = _mm_set_ss( (float) b ); - va = _mm_sub_ss( va, vb ); - _mm_store_ss( (float*) &a, va ); + return (double)a; +} + +double reference_subtract(double x, double y) { + volatile float a = (float)x; + volatile float b = (float)y; +#if defined(__SSE__) || \ + (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) + // defeat x87 + __m128 va = _mm_set_ss((float)a); + __m128 vb = _mm_set_ss((float)b); + va = _mm_sub_ss(va, vb); + _mm_store_ss((float*)&a, va); #else - a -= b; + a -= b; #endif - return a; -} + return a; +} + +// double reference_divide( double x, double y ){ return (float) x / (float) y; +// } +double reference_multiply(double x, double y) { + volatile float a = (float)x; + volatile float b = (float)y; +#if defined(__SSE__) || \ + (defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64))) + // defeat x87 + __m128 va = _mm_set_ss((float)a); + __m128 vb = _mm_set_ss((float)b); + va = _mm_mul_ss(va, vb); + _mm_store_ss((float*)&a, va); +#elif defined(__PPC__) + // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes + // denorm's to zero. As such, the reference multiply with FTZ must be emulated + // in sw. + if (fpu_control & _FPU_MASK_NI) { + // extract exponent and mantissa + // exponent is a standard unbiased signed integer + // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB + union { + cl_uint u; + cl_float d; + } ua; + ua.d = a; + union { + cl_uint u; + cl_float d; + } ub; + ub.d = b; + cl_uint mantA, mantB; + int expA = extractf(a, &mantA); + int expB = extractf(b, &mantB); + + // exact product of A and B + int exponent = expA + expB; + cl_uint sign = (ua.u ^ ub.u) & 0x80000000U; + cl_ulong product = (cl_ulong)mantA * (cl_ulong)mantB; + + // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999.. + // The MSB might not be set. If so, fix that. Otherwise, reflect the fact + // that we got another power of two from the multiplication + if (0 == (0x8000000000000000ULL & product)) + product <<= 1; + else + exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our + // exponent increased. -//double reference_divide( double x, double y ){ return (float) x / (float) y; } -double reference_multiply( double x, double y) -{ - volatile float a = (float) x; - volatile float b = (float) y; -#if defined( __SSE__ ) || (defined( _MSC_VER ) && (defined(_M_IX86) || defined(_M_X64))) - // defeat x87 - __m128 va = _mm_set_ss( (float) a ); - __m128 vb = _mm_set_ss( (float) b ); - va = _mm_mul_ss( va, vb ); - _mm_store_ss( (float*) &a, va ); -#elif defined(__PPC__) - // Most Power host CPUs do not support the non-IEEE mode (NI) which flushes denorm's to zero. - // As such, the reference multiply with FTZ must be emulated in sw. - if (fpu_control & _FPU_MASK_NI) { - // extract exponent and mantissa - // exponent is a standard unbiased signed integer - // mantissa is a cl_uint, with leading non-zero bit positioned at the MSB - union{ cl_uint u; cl_float d; } ua; ua.d = a; - union{ cl_uint u; cl_float d; } ub; ub.d = b; - cl_uint mantA, mantB; - int expA = extractf( a, &mantA ); - int expB = extractf( b, &mantB ); - - // exact product of A and B - int exponent = expA + expB; - cl_uint sign = (ua.u ^ ub.u) & 0x80000000U; - cl_ulong product = (cl_ulong) mantA * (cl_ulong) mantB; - - // renormalize -- 1.m * 1.n yields a number between 1.0 and 3.99999.. - // The MSB might not be set. If so, fix that. Otherwise, reflect the fact that we got another power of two from the multiplication - if( 0 == (0x8000000000000000ULL & product) ) - product <<= 1; - else - exponent++; // 2**31 * 2**31 gives 2**62. If the MSB was set, then our exponent increased. - - // round to IEEE result -- we do not do flushing to zero here. That part is handled manually in ternary.c. - if (gIsInRTZMode) { - ua.d = round_toward_zero_float_ftz( product, exponent); - } else { - ua.d = round_to_nearest_even_float_ftz( product, exponent); - } - // Set the sign - ua.u |= sign; - a = ua.d; + // round to IEEE result -- we do not do flushing to zero here. That part is + // handled manually in ternary.c. + if (gIsInRTZMode) { + ua.d = round_toward_zero_float_ftz(product, exponent); } else { - a *= b; + ua.d = round_to_nearest_even_float_ftz(product, exponent); } + // Set the sign + ua.u |= sign; + a = ua.d; + } else { + a *= b; + } #else a *= b; #endif @@ -1103,9 +1123,9 @@ double reference_multiply( double x, double y) }*/ double reference_lgamma_r( double x, int *signp ) { - // This is not currently tested - *signp = 0; - return x; + // This is not currently tested + *signp = 0; + return x; } @@ -1354,21 +1374,28 @@ double reference_expm1( double x ) static const double reduction[17] = { -0.5, -0.4375, -0.375, -0.3125, -0.25, -0.1875, -0.125, -0.0625, 0.0, +0.0625, +0.125, +0.1875, +0.25, +0.3125, +0.375, +0.4375, +0.5 }; - - + // exponentials[i] = expm1(reduction[i]) - static const double exponentials[17] = { HEX_DBL( -, 1, 92e9a0720d3ec, -, 2 ), HEX_DBL( -, 1, 6adb1cd9205ee, -, 2 ), - HEX_DBL( -, 1, 40373d42ce2e3, -, 2 ), HEX_DBL( -, 1, 12d35a41ba104, -, 2 ), - HEX_DBL( -, 1, c5041854df7d4, -, 3 ), HEX_DBL( -, 1, 5e25fb4fde211, -, 3 ), - HEX_DBL( -, 1, e14aed893eef4, -, 4 ), HEX_DBL( -, 1, f0540438fd5c3, -, 5 ), - HEX_DBL( +, 0, 0, +, 0 ), - HEX_DBL( +, 1, 082b577d34ed8, -, 4 ), HEX_DBL( +, 1, 10b022db7ae68, -, 3 ), - HEX_DBL( +, 1, a65c0b85ac1a9, -, 3 ), HEX_DBL( +, 1, 22d78f0fa061a, -, 2 ), - HEX_DBL( +, 1, 77a45d8117fd5, -, 2 ), HEX_DBL( +, 1, d1e944f6fbdaa, -, 2 ), - HEX_DBL( +, 1, 190048ef6002, -, 1 ), HEX_DBL( +, 1, 4c2531c3c0d38, -, 1 ), - }; - - + static const double exponentials[17] = { + HEX_DBL(-, 1, 92e9a0720d3ec, -, 2), + HEX_DBL(-, 1, 6adb1cd9205ee, -, 2), + HEX_DBL(-, 1, 40373d42ce2e3, -, 2), + HEX_DBL(-, 1, 12d35a41ba104, -, 2), + HEX_DBL(-, 1, c5041854df7d4, -, 3), + HEX_DBL(-, 1, 5e25fb4fde211, -, 3), + HEX_DBL(-, 1, e14aed893eef4, -, 4), + HEX_DBL(-, 1, f0540438fd5c3, -, 5), + HEX_DBL(+, 0, 0, +, 0), + HEX_DBL(+, 1, 082b577d34ed8, -, 4), + HEX_DBL(+, 1, 10b022db7ae68, -, 3), + HEX_DBL(+, 1, a65c0b85ac1a9, -, 3), + HEX_DBL(+, 1, 22d78f0fa061a, -, 2), + HEX_DBL(+, 1, 77a45d8117fd5, -, 2), + HEX_DBL(+, 1, d1e944f6fbdaa, -, 2), + HEX_DBL(+, 1, 190048ef6002, -, 1), + HEX_DBL(+, 1, 4c2531c3c0d38, -, 1), + }; + f -= reduction[index]; // find expm1(f) @@ -1394,149 +1421,287 @@ double reference_expm1( double x ) return f; // precise answer for x near 1 // table of e**(i-150) - static const double exp_table[128+150+1] = - { - HEX_DBL( +, 1, 82e16284f5ec5, -, 217 ), HEX_DBL( +, 1, 06e9996332ba1, -, 215 ), - HEX_DBL( +, 1, 6555cb289e44b, -, 214 ), HEX_DBL( +, 1, e5ab364643354, -, 213 ), - HEX_DBL( +, 1, 4a0bd18e64df7, -, 211 ), HEX_DBL( +, 1, c094499cc578e, -, 210 ), - HEX_DBL( +, 1, 30d759323998c, -, 208 ), HEX_DBL( +, 1, 9e5278ab1d4cf, -, 207 ), - HEX_DBL( +, 1, 198fa3f30be25, -, 205 ), HEX_DBL( +, 1, 7eae636d6144e, -, 204 ), - HEX_DBL( +, 1, 040f1036f4863, -, 202 ), HEX_DBL( +, 1, 6174e477a895f, -, 201 ), - HEX_DBL( +, 1, e065b82dd95a, -, 200 ), HEX_DBL( +, 1, 4676be491d129, -, 198 ), - HEX_DBL( +, 1, bbb5da5f7c823, -, 197 ), HEX_DBL( +, 1, 2d884eef5fdcb, -, 195 ), - HEX_DBL( +, 1, 99d3397ab8371, -, 194 ), HEX_DBL( +, 1, 1681497ed15b3, -, 192 ), - HEX_DBL( +, 1, 7a870f597fdbd, -, 191 ), HEX_DBL( +, 1, 013c74edba307, -, 189 ), - HEX_DBL( +, 1, 5d9ec4ada7938, -, 188 ), HEX_DBL( +, 1, db2edfd20fa7c, -, 187 ), - HEX_DBL( +, 1, 42eb9f39afb0b, -, 185 ), HEX_DBL( +, 1, b6e4f282b43f4, -, 184 ), - HEX_DBL( +, 1, 2a42764857b19, -, 182 ), HEX_DBL( +, 1, 9560792d19314, -, 181 ), - HEX_DBL( +, 1, 137b6ce8e052c, -, 179 ), HEX_DBL( +, 1, 766b45dd84f18, -, 178 ), - HEX_DBL( +, 1, fce362fe6e7d, -, 177 ), HEX_DBL( +, 1, 59d34dd8a5473, -, 175 ), - HEX_DBL( +, 1, d606847fc727a, -, 174 ), HEX_DBL( +, 1, 3f6a58b795de3, -, 172 ), - HEX_DBL( +, 1, b2216c6efdac1, -, 171 ), HEX_DBL( +, 1, 2705b5b153fb8, -, 169 ), - HEX_DBL( +, 1, 90fa1509bd50d, -, 168 ), HEX_DBL( +, 1, 107df698da211, -, 166 ), - HEX_DBL( +, 1, 725ae6e7b9d35, -, 165 ), HEX_DBL( +, 1, f75d6040aeff6, -, 164 ), - HEX_DBL( +, 1, 56126259e093c, -, 162 ), HEX_DBL( +, 1, d0ec7df4f7bd4, -, 161 ), - HEX_DBL( +, 1, 3bf2cf6722e46, -, 159 ), HEX_DBL( +, 1, ad6b22f55db42, -, 158 ), - HEX_DBL( +, 1, 23d1f3e5834a, -, 156 ), HEX_DBL( +, 1, 8c9feab89b876, -, 155 ), - HEX_DBL( +, 1, 0d88cf37f00dd, -, 153 ), HEX_DBL( +, 1, 6e55d2bf838a7, -, 152 ), - HEX_DBL( +, 1, f1e6b68529e33, -, 151 ), HEX_DBL( +, 1, 525be4e4e601d, -, 149 ), - HEX_DBL( +, 1, cbe0a45f75eb1, -, 148 ), HEX_DBL( +, 1, 3884e838aea68, -, 146 ), - HEX_DBL( +, 1, a8c1f14e2af5d, -, 145 ), HEX_DBL( +, 1, 20a717e64a9bd, -, 143 ), - HEX_DBL( +, 1, 8851d84118908, -, 142 ), HEX_DBL( +, 1, 0a9bdfb02d24, -, 140 ), - HEX_DBL( +, 1, 6a5bea046b42e, -, 139 ), HEX_DBL( +, 1, ec7f3b269efa8, -, 138 ), - HEX_DBL( +, 1, 4eafb87eab0f2, -, 136 ), HEX_DBL( +, 1, c6e2d05bbc, -, 135 ), - HEX_DBL( +, 1, 35208867c2683, -, 133 ), HEX_DBL( +, 1, a425b317eeacd, -, 132 ), - HEX_DBL( +, 1, 1d8508fa8246a, -, 130 ), HEX_DBL( +, 1, 840fbc08fdc8a, -, 129 ), - HEX_DBL( +, 1, 07b7112bc1ffe, -, 127 ), HEX_DBL( +, 1, 666d0dad2961d, -, 126 ), - HEX_DBL( +, 1, e726c3f64d0fe, -, 125 ), HEX_DBL( +, 1, 4b0dc07cabf98, -, 123 ), - HEX_DBL( +, 1, c1f2daf3b6a46, -, 122 ), HEX_DBL( +, 1, 31c5957a47de2, -, 120 ), - HEX_DBL( +, 1, 9f96445648b9f, -, 119 ), HEX_DBL( +, 1, 1a6baeadb4fd1, -, 117 ), - HEX_DBL( +, 1, 7fd974d372e45, -, 116 ), HEX_DBL( +, 1, 04da4d1452919, -, 114 ), - HEX_DBL( +, 1, 62891f06b345, -, 113 ), HEX_DBL( +, 1, e1dd273aa8a4a, -, 112 ), - HEX_DBL( +, 1, 4775e0840bfdd, -, 110 ), HEX_DBL( +, 1, bd109d9d94bda, -, 109 ), - HEX_DBL( +, 1, 2e73f53fba844, -, 107 ), HEX_DBL( +, 1, 9b138170d6bfe, -, 106 ), - HEX_DBL( +, 1, 175af0cf60ec5, -, 104 ), HEX_DBL( +, 1, 7baee1bffa80b, -, 103 ), - HEX_DBL( +, 1, 02057d1245ceb, -, 101 ), HEX_DBL( +, 1, 5eafffb34ba31, -, 100 ), - HEX_DBL( +, 1, dca23bae16424, -, 99 ), HEX_DBL( +, 1, 43e7fc88b8056, -, 97 ), - HEX_DBL( +, 1, b83bf23a9a9eb, -, 96 ), HEX_DBL( +, 1, 2b2b8dd05b318, -, 94 ), - HEX_DBL( +, 1, 969d47321e4cc, -, 93 ), HEX_DBL( +, 1, 1452b7723aed2, -, 91 ), - HEX_DBL( +, 1, 778fe2497184c, -, 90 ), HEX_DBL( +, 1, fe7116182e9cc, -, 89 ), - HEX_DBL( +, 1, 5ae191a99585a, -, 87 ), HEX_DBL( +, 1, d775d87da854d, -, 86 ), - HEX_DBL( +, 1, 4063f8cc8bb98, -, 84 ), HEX_DBL( +, 1, b374b315f87c1, -, 83 ), - HEX_DBL( +, 1, 27ec458c65e3c, -, 81 ), HEX_DBL( +, 1, 923372c67a074, -, 80 ), - HEX_DBL( +, 1, 1152eaeb73c08, -, 78 ), HEX_DBL( +, 1, 737c5645114b5, -, 77 ), - HEX_DBL( +, 1, f8e6c24b5592e, -, 76 ), HEX_DBL( +, 1, 571db733a9d61, -, 74 ), - HEX_DBL( +, 1, d257d547e083f, -, 73 ), HEX_DBL( +, 1, 3ce9b9de78f85, -, 71 ), - HEX_DBL( +, 1, aebabae3a41b5, -, 70 ), HEX_DBL( +, 1, 24b6031b49bda, -, 68 ), - HEX_DBL( +, 1, 8dd5e1bb09d7e, -, 67 ), HEX_DBL( +, 1, 0e5b73d1ff53d, -, 65 ), - HEX_DBL( +, 1, 6f741de1748ec, -, 64 ), HEX_DBL( +, 1, f36bd37f42f3e, -, 63 ), - HEX_DBL( +, 1, 536452ee2f75c, -, 61 ), HEX_DBL( +, 1, cd480a1b7482, -, 60 ), - HEX_DBL( +, 1, 39792499b1a24, -, 58 ), HEX_DBL( +, 1, aa0de4bf35b38, -, 57 ), - HEX_DBL( +, 1, 2188ad6ae3303, -, 55 ), HEX_DBL( +, 1, 898471fca6055, -, 54 ), - HEX_DBL( +, 1, 0b6c3afdde064, -, 52 ), HEX_DBL( +, 1, 6b7719a59f0e, -, 51 ), - HEX_DBL( +, 1, ee001eed62aa, -, 50 ), HEX_DBL( +, 1, 4fb547c775da8, -, 48 ), - HEX_DBL( +, 1, c8464f7616468, -, 47 ), HEX_DBL( +, 1, 36121e24d3bba, -, 45 ), - HEX_DBL( +, 1, a56e0c2ac7f75, -, 44 ), HEX_DBL( +, 1, 1e642baeb84a, -, 42 ), - HEX_DBL( +, 1, 853f01d6d53ba, -, 41 ), HEX_DBL( +, 1, 0885298767e9a, -, 39 ), - HEX_DBL( +, 1, 67852a7007e42, -, 38 ), HEX_DBL( +, 1, e8a37a45fc32e, -, 37 ), - HEX_DBL( +, 1, 4c1078fe9228a, -, 35 ), HEX_DBL( +, 1, c3527e433fab1, -, 34 ), - HEX_DBL( +, 1, 32b48bf117da2, -, 32 ), HEX_DBL( +, 1, a0db0d0ddb3ec, -, 31 ), - HEX_DBL( +, 1, 1b48655f37267, -, 29 ), HEX_DBL( +, 1, 81056ff2c5772, -, 28 ), - HEX_DBL( +, 1, 05a628c699fa1, -, 26 ), HEX_DBL( +, 1, 639e3175a689d, -, 25 ), - HEX_DBL( +, 1, e355bbaee85cb, -, 24 ), HEX_DBL( +, 1, 4875ca227ec38, -, 22 ), - HEX_DBL( +, 1, be6c6fdb01612, -, 21 ), HEX_DBL( +, 1, 2f6053b981d98, -, 19 ), - HEX_DBL( +, 1, 9c54c3b43bc8b, -, 18 ), HEX_DBL( +, 1, 18354238f6764, -, 16 ), - HEX_DBL( +, 1, 7cd79b5647c9b, -, 15 ), HEX_DBL( +, 1, 02cf22526545a, -, 13 ), - HEX_DBL( +, 1, 5fc21041027ad, -, 12 ), HEX_DBL( +, 1, de16b9c24a98f, -, 11 ), - HEX_DBL( +, 1, 44e51f113d4d6, -, 9 ), HEX_DBL( +, 1, b993fe00d5376, -, 8 ), - HEX_DBL( +, 1, 2c155b8213cf4, -, 6 ), HEX_DBL( +, 1, 97db0ccceb0af, -, 5 ), - HEX_DBL( +, 1, 152aaa3bf81cc, -, 3 ), HEX_DBL( +, 1, 78b56362cef38, -, 2 ), - HEX_DBL( +, 1, 0, +, 0 ), HEX_DBL( +, 1, 5bf0a8b145769, +, 1 ), - HEX_DBL( +, 1, d8e64b8d4ddae, +, 2 ), HEX_DBL( +, 1, 415e5bf6fb106, +, 4 ), - HEX_DBL( +, 1, b4c902e273a58, +, 5 ), HEX_DBL( +, 1, 28d389970338f, +, 7 ), - HEX_DBL( +, 1, 936dc5690c08f, +, 8 ), HEX_DBL( +, 1, 122885aaeddaa, +, 10 ), - HEX_DBL( +, 1, 749ea7d470c6e, +, 11 ), HEX_DBL( +, 1, fa7157c470f82, +, 12 ), - HEX_DBL( +, 1, 5829dcf95056, +, 14 ), HEX_DBL( +, 1, d3c4488ee4f7f, +, 15 ), - HEX_DBL( +, 1, 3de1654d37c9a, +, 17 ), HEX_DBL( +, 1, b00b5916ac955, +, 18 ), - HEX_DBL( +, 1, 259ac48bf05d7, +, 20 ), HEX_DBL( +, 1, 8f0ccafad2a87, +, 21 ), - HEX_DBL( +, 1, 0f2ebd0a8002, +, 23 ), HEX_DBL( +, 1, 709348c0ea4f9, +, 24 ), - HEX_DBL( +, 1, f4f22091940bd, +, 25 ), HEX_DBL( +, 1, 546d8f9ed26e1, +, 27 ), - HEX_DBL( +, 1, ceb088b68e804, +, 28 ), HEX_DBL( +, 1, 3a6e1fd9eecfd, +, 30 ), - HEX_DBL( +, 1, ab5adb9c436, +, 31 ), HEX_DBL( +, 1, 226af33b1fdc1, +, 33 ), - HEX_DBL( +, 1, 8ab7fb5475fb7, +, 34 ), HEX_DBL( +, 1, 0c3d3920962c9, +, 36 ), - HEX_DBL( +, 1, 6c932696a6b5d, +, 37 ), HEX_DBL( +, 1, ef822f7f6731d, +, 38 ), - HEX_DBL( +, 1, 50bba3796379a, +, 40 ), HEX_DBL( +, 1, c9aae4631c056, +, 41 ), - HEX_DBL( +, 1, 370470aec28ed, +, 43 ), HEX_DBL( +, 1, a6b765d8cdf6d, +, 44 ), - HEX_DBL( +, 1, 1f43fcc4b662c, +, 46 ), HEX_DBL( +, 1, 866f34a725782, +, 47 ), - HEX_DBL( +, 1, 0953e2f3a1ef7, +, 49 ), HEX_DBL( +, 1, 689e221bc8d5b, +, 50 ), - HEX_DBL( +, 1, ea215a1d20d76, +, 51 ), HEX_DBL( +, 1, 4d13fbb1a001a, +, 53 ), - HEX_DBL( +, 1, c4b334617cc67, +, 54 ), HEX_DBL( +, 1, 33a43d282a519, +, 56 ), - HEX_DBL( +, 1, a220d397972eb, +, 57 ), HEX_DBL( +, 1, 1c25c88df6862, +, 59 ), - HEX_DBL( +, 1, 8232558201159, +, 60 ), HEX_DBL( +, 1, 0672a3c9eb871, +, 62 ), - HEX_DBL( +, 1, 64b41c6d37832, +, 63 ), HEX_DBL( +, 1, e4cf766fe49be, +, 64 ), - HEX_DBL( +, 1, 49767bc0483e3, +, 66 ), HEX_DBL( +, 1, bfc951eb8bb76, +, 67 ), - HEX_DBL( +, 1, 304d6aeca254b, +, 69 ), HEX_DBL( +, 1, 9d97010884251, +, 70 ), - HEX_DBL( +, 1, 19103e4080b45, +, 72 ), HEX_DBL( +, 1, 7e013cd114461, +, 73 ), - HEX_DBL( +, 1, 03996528e074c, +, 75 ), HEX_DBL( +, 1, 60d4f6fdac731, +, 76 ), - HEX_DBL( +, 1, df8c5af17ba3b, +, 77 ), HEX_DBL( +, 1, 45e3076d61699, +, 79 ), - HEX_DBL( +, 1, baed16a6e0da7, +, 80 ), HEX_DBL( +, 1, 2cffdfebde1a1, +, 82 ), - HEX_DBL( +, 1, 9919cabefcb69, +, 83 ), HEX_DBL( +, 1, 160345c9953e3, +, 85 ), - HEX_DBL( +, 1, 79dbc9dc53c66, +, 86 ), HEX_DBL( +, 1, 00c810d464097, +, 88 ), - HEX_DBL( +, 1, 5d009394c5c27, +, 89 ), HEX_DBL( +, 1, da57de8f107a8, +, 90 ), - HEX_DBL( +, 1, 425982cf597cd, +, 92 ), HEX_DBL( +, 1, b61e5ca3a5e31, +, 93 ), - HEX_DBL( +, 1, 29bb825dfcf87, +, 95 ), HEX_DBL( +, 1, 94a90db0d6fe2, +, 96 ), - HEX_DBL( +, 1, 12fec759586fd, +, 98 ), HEX_DBL( +, 1, 75c1dc469e3af, +, 99 ), - HEX_DBL( +, 1, fbfd219c43b04, +, 100 ), HEX_DBL( +, 1, 5936d44e1a146, +, 102 ), - HEX_DBL( +, 1, d531d8a7ee79c, +, 103 ), HEX_DBL( +, 1, 3ed9d24a2d51b, +, 105 ), - HEX_DBL( +, 1, b15cfe5b6e17b, +, 106 ), HEX_DBL( +, 1, 268038c2c0e, +, 108 ), - HEX_DBL( +, 1, 9044a73545d48, +, 109 ), HEX_DBL( +, 1, 1002ab6218b38, +, 111 ), - HEX_DBL( +, 1, 71b3540cbf921, +, 112 ), HEX_DBL( +, 1, f6799ea9c414a, +, 113 ), - HEX_DBL( +, 1, 55779b984f3eb, +, 115 ), HEX_DBL( +, 1, d01a210c44aa4, +, 116 ), - HEX_DBL( +, 1, 3b63da8e9121, +, 118 ), HEX_DBL( +, 1, aca8d6b0116b8, +, 119 ), - HEX_DBL( +, 1, 234de9e0c74e9, +, 121 ), HEX_DBL( +, 1, 8bec7503ca477, +, 122 ), - HEX_DBL( +, 1, 0d0eda9796b9, +, 124 ), HEX_DBL( +, 1, 6db0118477245, +, 125 ), - HEX_DBL( +, 1, f1056dc7bf22d, +, 126 ), HEX_DBL( +, 1, 51c2cc3433801, +, 128 ), - HEX_DBL( +, 1, cb108ffbec164, +, 129 ), HEX_DBL( +, 1, 37f780991b584, +, 131 ), - HEX_DBL( +, 1, a801c0ea8ac4d, +, 132 ), HEX_DBL( +, 1, 20247cc4c46c1, +, 134 ), - HEX_DBL( +, 1, 87a0553328015, +, 135 ), HEX_DBL( +, 1, 0a233dee4f9bb, +, 137 ), - HEX_DBL( +, 1, 69b7f55b808ba, +, 138 ), HEX_DBL( +, 1, eba064644060a, +, 139 ), - HEX_DBL( +, 1, 4e184933d9364, +, 141 ), HEX_DBL( +, 1, c614fe2531841, +, 142 ), - HEX_DBL( +, 1, 3494a9b171bf5, +, 144 ), HEX_DBL( +, 1, a36798b9d969b, +, 145 ), - HEX_DBL( +, 1, 1d03d8c0c04af, +, 147 ), HEX_DBL( +, 1, 836026385c974, +, 148 ), - HEX_DBL( +, 1, 073fbe9ac901d, +, 150 ), HEX_DBL( +, 1, 65cae0969f286, +, 151 ), - HEX_DBL( +, 1, e64a58639cae8, +, 152 ), HEX_DBL( +, 1, 4a77f5f9b50f9, +, 154 ), - HEX_DBL( +, 1, c12744a3a28e3, +, 155 ), HEX_DBL( +, 1, 313b3b6978e85, +, 157 ), - HEX_DBL( +, 1, 9eda3a31e587e, +, 158 ), HEX_DBL( +, 1, 19ebe56b56453, +, 160 ), - HEX_DBL( +, 1, 7f2bc6e599b7e, +, 161 ), HEX_DBL( +, 1, 04644610df2ff, +, 163 ), - HEX_DBL( +, 1, 61e8b490ac4e6, +, 164 ), HEX_DBL( +, 1, e103201f299b3, +, 165 ), - HEX_DBL( +, 1, 46e1b637beaf5, +, 167 ), HEX_DBL( +, 1, bc473cfede104, +, 168 ), - HEX_DBL( +, 1, 2deb1b9c85e2d, +, 170 ), HEX_DBL( +, 1, 9a5981ca67d1, +, 171 ), - HEX_DBL( +, 1, 16dc8a9ef670b, +, 173 ), HEX_DBL( +, 1, 7b03166942309, +, 174 ), - HEX_DBL( +, 1, 0190be03150a7, +, 176 ), HEX_DBL( +, 1, 5e1152f9a8119, +, 177 ), - HEX_DBL( +, 1, dbca9263f8487, +, 178 ), HEX_DBL( +, 1, 43556dee93bee, +, 180 ), - HEX_DBL( +, 1, b774c12967dfa, +, 181 ), HEX_DBL( +, 1, 2aa4306e922c2, +, 183 ), - HEX_DBL( +, 1, 95e54c5dd4217, +, 184 ) }; - + static const double exp_table[128 + 150 + 1] = { + HEX_DBL(+, 1, 82e16284f5ec5, -, 217), + HEX_DBL(+, 1, 06e9996332ba1, -, 215), + HEX_DBL(+, 1, 6555cb289e44b, -, 214), + HEX_DBL(+, 1, e5ab364643354, -, 213), + HEX_DBL(+, 1, 4a0bd18e64df7, -, 211), + HEX_DBL(+, 1, c094499cc578e, -, 210), + HEX_DBL(+, 1, 30d759323998c, -, 208), + HEX_DBL(+, 1, 9e5278ab1d4cf, -, 207), + HEX_DBL(+, 1, 198fa3f30be25, -, 205), + HEX_DBL(+, 1, 7eae636d6144e, -, 204), + HEX_DBL(+, 1, 040f1036f4863, -, 202), + HEX_DBL(+, 1, 6174e477a895f, -, 201), + HEX_DBL(+, 1, e065b82dd95a, -, 200), + HEX_DBL(+, 1, 4676be491d129, -, 198), + HEX_DBL(+, 1, bbb5da5f7c823, -, 197), + HEX_DBL(+, 1, 2d884eef5fdcb, -, 195), + HEX_DBL(+, 1, 99d3397ab8371, -, 194), + HEX_DBL(+, 1, 1681497ed15b3, -, 192), + HEX_DBL(+, 1, 7a870f597fdbd, -, 191), + HEX_DBL(+, 1, 013c74edba307, -, 189), + HEX_DBL(+, 1, 5d9ec4ada7938, -, 188), + HEX_DBL(+, 1, db2edfd20fa7c, -, 187), + HEX_DBL(+, 1, 42eb9f39afb0b, -, 185), + HEX_DBL(+, 1, b6e4f282b43f4, -, 184), + HEX_DBL(+, 1, 2a42764857b19, -, 182), + HEX_DBL(+, 1, 9560792d19314, -, 181), + HEX_DBL(+, 1, 137b6ce8e052c, -, 179), + HEX_DBL(+, 1, 766b45dd84f18, -, 178), + HEX_DBL(+, 1, fce362fe6e7d, -, 177), + HEX_DBL(+, 1, 59d34dd8a5473, -, 175), + HEX_DBL(+, 1, d606847fc727a, -, 174), + HEX_DBL(+, 1, 3f6a58b795de3, -, 172), + HEX_DBL(+, 1, b2216c6efdac1, -, 171), + HEX_DBL(+, 1, 2705b5b153fb8, -, 169), + HEX_DBL(+, 1, 90fa1509bd50d, -, 168), + HEX_DBL(+, 1, 107df698da211, -, 166), + HEX_DBL(+, 1, 725ae6e7b9d35, -, 165), + HEX_DBL(+, 1, f75d6040aeff6, -, 164), + HEX_DBL(+, 1, 56126259e093c, -, 162), + HEX_DBL(+, 1, d0ec7df4f7bd4, -, 161), + HEX_DBL(+, 1, 3bf2cf6722e46, -, 159), + HEX_DBL(+, 1, ad6b22f55db42, -, 158), + HEX_DBL(+, 1, 23d1f3e5834a, -, 156), + HEX_DBL(+, 1, 8c9feab89b876, -, 155), + HEX_DBL(+, 1, 0d88cf37f00dd, -, 153), + HEX_DBL(+, 1, 6e55d2bf838a7, -, 152), + HEX_DBL(+, 1, f1e6b68529e33, -, 151), + HEX_DBL(+, 1, 525be4e4e601d, -, 149), + HEX_DBL(+, 1, cbe0a45f75eb1, -, 148), + HEX_DBL(+, 1, 3884e838aea68, -, 146), + HEX_DBL(+, 1, a8c1f14e2af5d, -, 145), + HEX_DBL(+, 1, 20a717e64a9bd, -, 143), + HEX_DBL(+, 1, 8851d84118908, -, 142), + HEX_DBL(+, 1, 0a9bdfb02d24, -, 140), + HEX_DBL(+, 1, 6a5bea046b42e, -, 139), + HEX_DBL(+, 1, ec7f3b269efa8, -, 138), + HEX_DBL(+, 1, 4eafb87eab0f2, -, 136), + HEX_DBL(+, 1, c6e2d05bbc, -, 135), + HEX_DBL(+, 1, 35208867c2683, -, 133), + HEX_DBL(+, 1, a425b317eeacd, -, 132), + HEX_DBL(+, 1, 1d8508fa8246a, -, 130), + HEX_DBL(+, 1, 840fbc08fdc8a, -, 129), + HEX_DBL(+, 1, 07b7112bc1ffe, -, 127), + HEX_DBL(+, 1, 666d0dad2961d, -, 126), + HEX_DBL(+, 1, e726c3f64d0fe, -, 125), + HEX_DBL(+, 1, 4b0dc07cabf98, -, 123), + HEX_DBL(+, 1, c1f2daf3b6a46, -, 122), + HEX_DBL(+, 1, 31c5957a47de2, -, 120), + HEX_DBL(+, 1, 9f96445648b9f, -, 119), + HEX_DBL(+, 1, 1a6baeadb4fd1, -, 117), + HEX_DBL(+, 1, 7fd974d372e45, -, 116), + HEX_DBL(+, 1, 04da4d1452919, -, 114), + HEX_DBL(+, 1, 62891f06b345, -, 113), + HEX_DBL(+, 1, e1dd273aa8a4a, -, 112), + HEX_DBL(+, 1, 4775e0840bfdd, -, 110), + HEX_DBL(+, 1, bd109d9d94bda, -, 109), + HEX_DBL(+, 1, 2e73f53fba844, -, 107), + HEX_DBL(+, 1, 9b138170d6bfe, -, 106), + HEX_DBL(+, 1, 175af0cf60ec5, -, 104), + HEX_DBL(+, 1, 7baee1bffa80b, -, 103), + HEX_DBL(+, 1, 02057d1245ceb, -, 101), + HEX_DBL(+, 1, 5eafffb34ba31, -, 100), + HEX_DBL(+, 1, dca23bae16424, -, 99), + HEX_DBL(+, 1, 43e7fc88b8056, -, 97), + HEX_DBL(+, 1, b83bf23a9a9eb, -, 96), + HEX_DBL(+, 1, 2b2b8dd05b318, -, 94), + HEX_DBL(+, 1, 969d47321e4cc, -, 93), + HEX_DBL(+, 1, 1452b7723aed2, -, 91), + HEX_DBL(+, 1, 778fe2497184c, -, 90), + HEX_DBL(+, 1, fe7116182e9cc, -, 89), + HEX_DBL(+, 1, 5ae191a99585a, -, 87), + HEX_DBL(+, 1, d775d87da854d, -, 86), + HEX_DBL(+, 1, 4063f8cc8bb98, -, 84), + HEX_DBL(+, 1, b374b315f87c1, -, 83), + HEX_DBL(+, 1, 27ec458c65e3c, -, 81), + HEX_DBL(+, 1, 923372c67a074, -, 80), + HEX_DBL(+, 1, 1152eaeb73c08, -, 78), + HEX_DBL(+, 1, 737c5645114b5, -, 77), + HEX_DBL(+, 1, f8e6c24b5592e, -, 76), + HEX_DBL(+, 1, 571db733a9d61, -, 74), + HEX_DBL(+, 1, d257d547e083f, -, 73), + HEX_DBL(+, 1, 3ce9b9de78f85, -, 71), + HEX_DBL(+, 1, aebabae3a41b5, -, 70), + HEX_DBL(+, 1, 24b6031b49bda, -, 68), + HEX_DBL(+, 1, 8dd5e1bb09d7e, -, 67), + HEX_DBL(+, 1, 0e5b73d1ff53d, -, 65), + HEX_DBL(+, 1, 6f741de1748ec, -, 64), + HEX_DBL(+, 1, f36bd37f42f3e, -, 63), + HEX_DBL(+, 1, 536452ee2f75c, -, 61), + HEX_DBL(+, 1, cd480a1b7482, -, 60), + HEX_DBL(+, 1, 39792499b1a24, -, 58), + HEX_DBL(+, 1, aa0de4bf35b38, -, 57), + HEX_DBL(+, 1, 2188ad6ae3303, -, 55), + HEX_DBL(+, 1, 898471fca6055, -, 54), + HEX_DBL(+, 1, 0b6c3afdde064, -, 52), + HEX_DBL(+, 1, 6b7719a59f0e, -, 51), + HEX_DBL(+, 1, ee001eed62aa, -, 50), + HEX_DBL(+, 1, 4fb547c775da8, -, 48), + HEX_DBL(+, 1, c8464f7616468, -, 47), + HEX_DBL(+, 1, 36121e24d3bba, -, 45), + HEX_DBL(+, 1, a56e0c2ac7f75, -, 44), + HEX_DBL(+, 1, 1e642baeb84a, -, 42), + HEX_DBL(+, 1, 853f01d6d53ba, -, 41), + HEX_DBL(+, 1, 0885298767e9a, -, 39), + HEX_DBL(+, 1, 67852a7007e42, -, 38), + HEX_DBL(+, 1, e8a37a45fc32e, -, 37), + HEX_DBL(+, 1, 4c1078fe9228a, -, 35), + HEX_DBL(+, 1, c3527e433fab1, -, 34), + HEX_DBL(+, 1, 32b48bf117da2, -, 32), + HEX_DBL(+, 1, a0db0d0ddb3ec, -, 31), + HEX_DBL(+, 1, 1b48655f37267, -, 29), + HEX_DBL(+, 1, 81056ff2c5772, -, 28), + HEX_DBL(+, 1, 05a628c699fa1, -, 26), + HEX_DBL(+, 1, 639e3175a689d, -, 25), + HEX_DBL(+, 1, e355bbaee85cb, -, 24), + HEX_DBL(+, 1, 4875ca227ec38, -, 22), + HEX_DBL(+, 1, be6c6fdb01612, -, 21), + HEX_DBL(+, 1, 2f6053b981d98, -, 19), + HEX_DBL(+, 1, 9c54c3b43bc8b, -, 18), + HEX_DBL(+, 1, 18354238f6764, -, 16), + HEX_DBL(+, 1, 7cd79b5647c9b, -, 15), + HEX_DBL(+, 1, 02cf22526545a, -, 13), + HEX_DBL(+, 1, 5fc21041027ad, -, 12), + HEX_DBL(+, 1, de16b9c24a98f, -, 11), + HEX_DBL(+, 1, 44e51f113d4d6, -, 9), + HEX_DBL(+, 1, b993fe00d5376, -, 8), + HEX_DBL(+, 1, 2c155b8213cf4, -, 6), + HEX_DBL(+, 1, 97db0ccceb0af, -, 5), + HEX_DBL(+, 1, 152aaa3bf81cc, -, 3), + HEX_DBL(+, 1, 78b56362cef38, -, 2), + HEX_DBL(+, 1, 0, +, 0), + HEX_DBL(+, 1, 5bf0a8b145769, +, 1), + HEX_DBL(+, 1, d8e64b8d4ddae, +, 2), + HEX_DBL(+, 1, 415e5bf6fb106, +, 4), + HEX_DBL(+, 1, b4c902e273a58, +, 5), + HEX_DBL(+, 1, 28d389970338f, +, 7), + HEX_DBL(+, 1, 936dc5690c08f, +, 8), + HEX_DBL(+, 1, 122885aaeddaa, +, 10), + HEX_DBL(+, 1, 749ea7d470c6e, +, 11), + HEX_DBL(+, 1, fa7157c470f82, +, 12), + HEX_DBL(+, 1, 5829dcf95056, +, 14), + HEX_DBL(+, 1, d3c4488ee4f7f, +, 15), + HEX_DBL(+, 1, 3de1654d37c9a, +, 17), + HEX_DBL(+, 1, b00b5916ac955, +, 18), + HEX_DBL(+, 1, 259ac48bf05d7, +, 20), + HEX_DBL(+, 1, 8f0ccafad2a87, +, 21), + HEX_DBL(+, 1, 0f2ebd0a8002, +, 23), + HEX_DBL(+, 1, 709348c0ea4f9, +, 24), + HEX_DBL(+, 1, f4f22091940bd, +, 25), + HEX_DBL(+, 1, 546d8f9ed26e1, +, 27), + HEX_DBL(+, 1, ceb088b68e804, +, 28), + HEX_DBL(+, 1, 3a6e1fd9eecfd, +, 30), + HEX_DBL(+, 1, ab5adb9c436, +, 31), + HEX_DBL(+, 1, 226af33b1fdc1, +, 33), + HEX_DBL(+, 1, 8ab7fb5475fb7, +, 34), + HEX_DBL(+, 1, 0c3d3920962c9, +, 36), + HEX_DBL(+, 1, 6c932696a6b5d, +, 37), + HEX_DBL(+, 1, ef822f7f6731d, +, 38), + HEX_DBL(+, 1, 50bba3796379a, +, 40), + HEX_DBL(+, 1, c9aae4631c056, +, 41), + HEX_DBL(+, 1, 370470aec28ed, +, 43), + HEX_DBL(+, 1, a6b765d8cdf6d, +, 44), + HEX_DBL(+, 1, 1f43fcc4b662c, +, 46), + HEX_DBL(+, 1, 866f34a725782, +, 47), + HEX_DBL(+, 1, 0953e2f3a1ef7, +, 49), + HEX_DBL(+, 1, 689e221bc8d5b, +, 50), + HEX_DBL(+, 1, ea215a1d20d76, +, 51), + HEX_DBL(+, 1, 4d13fbb1a001a, +, 53), + HEX_DBL(+, 1, c4b334617cc67, +, 54), + HEX_DBL(+, 1, 33a43d282a519, +, 56), + HEX_DBL(+, 1, a220d397972eb, +, 57), + HEX_DBL(+, 1, 1c25c88df6862, +, 59), + HEX_DBL(+, 1, 8232558201159, +, 60), + HEX_DBL(+, 1, 0672a3c9eb871, +, 62), + HEX_DBL(+, 1, 64b41c6d37832, +, 63), + HEX_DBL(+, 1, e4cf766fe49be, +, 64), + HEX_DBL(+, 1, 49767bc0483e3, +, 66), + HEX_DBL(+, 1, bfc951eb8bb76, +, 67), + HEX_DBL(+, 1, 304d6aeca254b, +, 69), + HEX_DBL(+, 1, 9d97010884251, +, 70), + HEX_DBL(+, 1, 19103e4080b45, +, 72), + HEX_DBL(+, 1, 7e013cd114461, +, 73), + HEX_DBL(+, 1, 03996528e074c, +, 75), + HEX_DBL(+, 1, 60d4f6fdac731, +, 76), + HEX_DBL(+, 1, df8c5af17ba3b, +, 77), + HEX_DBL(+, 1, 45e3076d61699, +, 79), + HEX_DBL(+, 1, baed16a6e0da7, +, 80), + HEX_DBL(+, 1, 2cffdfebde1a1, +, 82), + HEX_DBL(+, 1, 9919cabefcb69, +, 83), + HEX_DBL(+, 1, 160345c9953e3, +, 85), + HEX_DBL(+, 1, 79dbc9dc53c66, +, 86), + HEX_DBL(+, 1, 00c810d464097, +, 88), + HEX_DBL(+, 1, 5d009394c5c27, +, 89), + HEX_DBL(+, 1, da57de8f107a8, +, 90), + HEX_DBL(+, 1, 425982cf597cd, +, 92), + HEX_DBL(+, 1, b61e5ca3a5e31, +, 93), + HEX_DBL(+, 1, 29bb825dfcf87, +, 95), + HEX_DBL(+, 1, 94a90db0d6fe2, +, 96), + HEX_DBL(+, 1, 12fec759586fd, +, 98), + HEX_DBL(+, 1, 75c1dc469e3af, +, 99), + HEX_DBL(+, 1, fbfd219c43b04, +, 100), + HEX_DBL(+, 1, 5936d44e1a146, +, 102), + HEX_DBL(+, 1, d531d8a7ee79c, +, 103), + HEX_DBL(+, 1, 3ed9d24a2d51b, +, 105), + HEX_DBL(+, 1, b15cfe5b6e17b, +, 106), + HEX_DBL(+, 1, 268038c2c0e, +, 108), + HEX_DBL(+, 1, 9044a73545d48, +, 109), + HEX_DBL(+, 1, 1002ab6218b38, +, 111), + HEX_DBL(+, 1, 71b3540cbf921, +, 112), + HEX_DBL(+, 1, f6799ea9c414a, +, 113), + HEX_DBL(+, 1, 55779b984f3eb, +, 115), + HEX_DBL(+, 1, d01a210c44aa4, +, 116), + HEX_DBL(+, 1, 3b63da8e9121, +, 118), + HEX_DBL(+, 1, aca8d6b0116b8, +, 119), + HEX_DBL(+, 1, 234de9e0c74e9, +, 121), + HEX_DBL(+, 1, 8bec7503ca477, +, 122), + HEX_DBL(+, 1, 0d0eda9796b9, +, 124), + HEX_DBL(+, 1, 6db0118477245, +, 125), + HEX_DBL(+, 1, f1056dc7bf22d, +, 126), + HEX_DBL(+, 1, 51c2cc3433801, +, 128), + HEX_DBL(+, 1, cb108ffbec164, +, 129), + HEX_DBL(+, 1, 37f780991b584, +, 131), + HEX_DBL(+, 1, a801c0ea8ac4d, +, 132), + HEX_DBL(+, 1, 20247cc4c46c1, +, 134), + HEX_DBL(+, 1, 87a0553328015, +, 135), + HEX_DBL(+, 1, 0a233dee4f9bb, +, 137), + HEX_DBL(+, 1, 69b7f55b808ba, +, 138), + HEX_DBL(+, 1, eba064644060a, +, 139), + HEX_DBL(+, 1, 4e184933d9364, +, 141), + HEX_DBL(+, 1, c614fe2531841, +, 142), + HEX_DBL(+, 1, 3494a9b171bf5, +, 144), + HEX_DBL(+, 1, a36798b9d969b, +, 145), + HEX_DBL(+, 1, 1d03d8c0c04af, +, 147), + HEX_DBL(+, 1, 836026385c974, +, 148), + HEX_DBL(+, 1, 073fbe9ac901d, +, 150), + HEX_DBL(+, 1, 65cae0969f286, +, 151), + HEX_DBL(+, 1, e64a58639cae8, +, 152), + HEX_DBL(+, 1, 4a77f5f9b50f9, +, 154), + HEX_DBL(+, 1, c12744a3a28e3, +, 155), + HEX_DBL(+, 1, 313b3b6978e85, +, 157), + HEX_DBL(+, 1, 9eda3a31e587e, +, 158), + HEX_DBL(+, 1, 19ebe56b56453, +, 160), + HEX_DBL(+, 1, 7f2bc6e599b7e, +, 161), + HEX_DBL(+, 1, 04644610df2ff, +, 163), + HEX_DBL(+, 1, 61e8b490ac4e6, +, 164), + HEX_DBL(+, 1, e103201f299b3, +, 165), + HEX_DBL(+, 1, 46e1b637beaf5, +, 167), + HEX_DBL(+, 1, bc473cfede104, +, 168), + HEX_DBL(+, 1, 2deb1b9c85e2d, +, 170), + HEX_DBL(+, 1, 9a5981ca67d1, +, 171), + HEX_DBL(+, 1, 16dc8a9ef670b, +, 173), + HEX_DBL(+, 1, 7b03166942309, +, 174), + HEX_DBL(+, 1, 0190be03150a7, +, 176), + HEX_DBL(+, 1, 5e1152f9a8119, +, 177), + HEX_DBL(+, 1, dbca9263f8487, +, 178), + HEX_DBL(+, 1, 43556dee93bee, +, 180), + HEX_DBL(+, 1, b774c12967dfa, +, 181), + HEX_DBL(+, 1, 2aa4306e922c2, +, 183), + HEX_DBL(+, 1, 95e54c5dd4217, +, 184)}; + // scale by e**i -- (expm1(f) + 1)*e**i - 1 = expm1(f) * e**i + e**i - 1 = e**i return exp_table[exponent+150] + (f * exp_table[exponent+150] - 1.0); } @@ -1610,20 +1775,16 @@ double reference_relaxed_log2( double x ) return reference_log2(x); } -double reference_log2( double x ) -{ - if( isnan(x) || x < 0.0 || x == -INFINITY) - return cl_make_nan(); - - if( x == 0.0f) - return -INFINITY; - - if( x == INFINITY ) - return INFINITY; +double reference_log2(double x) { + if (isnan(x) || x < 0.0 || x == -INFINITY) return cl_make_nan(); + + if (x == 0.0f) return -INFINITY; + + if (x == INFINITY) return INFINITY; - double hi, lo; - __log2_ep( &hi, &lo, x ); - return hi; + double hi, lo; + __log2_ep(&hi, &lo, x); + return hi; } double reference_log1p( double x ) @@ -1839,97 +2000,123 @@ static const double //two52 = 4.50359962737049600000e+15, /* 0x43300000, 0x00000 w6 = -1.63092934096575273989e-03; /* 0xBF5AB89D, 0x0B9E43E4 */ static const double zero= 0.00000000000000000000e+00; - double t,y,z,nadj,p,p1,p2,p3,q,r,w; - cl_int i,hx,lx,ix; + double t, y, z, nadj, p, p1, p2, p3, q, r, w; + cl_int i, hx, lx, ix; union{ double d; cl_ulong u;}u; u.d = x; - hx = (cl_int) (u.u >> 32); - lx = (cl_int) (u.u & 0xffffffffULL); + hx = (cl_int)(u.u >> 32); + lx = (cl_int)(u.u & 0xffffffffULL); /* purge off +-inf, NaN, +-0, and negative arguments */ - ix = hx&0x7fffffff; - if(ix>=0x7ff00000) return x*x; - if((ix|lx)==0) return float_fail(); - if(ix<0x3b900000) { /* |x|<2**-70, return -log(|x|) */ - if(hx<0) { - return -reference_log(-x); - } else return -reference_log(x); - } - if(hx<0) { - if(ix>=0x43300000) /* |x|>=2**52, must be -integer */ - return float_fail(); - t = reference_sinpi(x); - if(t==zero) return float_fail(); /* -integer */ - nadj = reference_log(pi/reference_fabs(t*x)); - x = -x; - } + ix = hx & 0x7fffffff; + if (ix >= 0x7ff00000) return x * x; + if ((ix | lx) == 0) return float_fail(); + if (ix < 0x3b900000) { /* |x|<2**-70, return -log(|x|) */ + if (hx < 0) { + return -reference_log(-x); + } else + return -reference_log(x); + } + if (hx < 0) { + if (ix >= 0x43300000) /* |x|>=2**52, must be -integer */ + return float_fail(); + t = reference_sinpi(x); + if (t == zero) return float_fail(); /* -integer */ + nadj = reference_log(pi / reference_fabs(t * x)); + x = -x; + } /* purge off 1 and 2 */ - if((((ix-0x3ff00000)|lx)==0)||(((ix-0x40000000)|lx)==0)) r = 0; + if ((((ix - 0x3ff00000) | lx) == 0) || (((ix - 0x40000000) | lx) == 0)) + r = 0; /* for x < 2.0 */ - else if(ix<0x40000000) { - if(ix<=0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ - r = -reference_log(x); - if(ix>=0x3FE76944) {y = 1.0-x; i= 0;} - else if(ix>=0x3FCDA661) {y= x-(tc-one); i=1;} - else {y = x; i=2;} - } else { - r = zero; - if(ix>=0x3FFBB4C3) {y=2.0-x;i=0;} /* [1.7316,2] */ - else if(ix>=0x3FF3B4C4) {y=x-tc;i=1;} /* [1.23,1.73] */ - else {y=x-one;i=2;} - } - switch(i) { - case 0: - z = y*y; - p1 = a0+z*(a2+z*(a4+z*(a6+z*(a8+z*a10)))); - p2 = z*(a1+z*(a3+z*(a5+z*(a7+z*(a9+z*a11))))); - p = y*p1+p2; - r += (p-0.5*y); break; - case 1: - z = y*y; - w = z*y; - p1 = t0+w*(t3+w*(t6+w*(t9 +w*t12))); /* parallel comp */ - p2 = t1+w*(t4+w*(t7+w*(t10+w*t13))); - p3 = t2+w*(t5+w*(t8+w*(t11+w*t14))); - p = z*p1-(tt-w*(p2+y*p3)); - r += (tf + p); break; - case 2: - p1 = y*(u0+y*(u1+y*(u2+y*(u3+y*(u4+y*u5))))); - p2 = one+y*(v1+y*(v2+y*(v3+y*(v4+y*v5)))); - r += (-0.5*y + p1/p2); - } - } - else if(ix<0x40200000) { /* x < 8.0 */ - i = (int)x; - t = zero; - y = x-(double)i; - p = y*(s0+y*(s1+y*(s2+y*(s3+y*(s4+y*(s5+y*s6)))))); - q = one+y*(r1+y*(r2+y*(r3+y*(r4+y*(r5+y*r6))))); - r = half*y+p/q; - z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ - switch(i) { - case 7: z *= (y+6.0); /* FALLTHRU */ - case 6: z *= (y+5.0); /* FALLTHRU */ - case 5: z *= (y+4.0); /* FALLTHRU */ - case 4: z *= (y+3.0); /* FALLTHRU */ - case 3: z *= (y+2.0); /* FALLTHRU */ - r += reference_log(z); break; - } - /* 8.0 <= x < 2**58 */ - } else if (ix < 0x43900000) { - t = reference_log(x); - z = one/x; - y = z*z; - w = w0+z*(w1+y*(w2+y*(w3+y*(w4+y*(w5+y*w6))))); - r = (x-half)*(t-one)+w; - } else - /* 2**58 <= x <= inf */ - r = x*(reference_log(x)-one); - if(hx<0) r = nadj - r; - return r; - + else if (ix < 0x40000000) { + if (ix <= 0x3feccccc) { /* lgamma(x) = lgamma(x+1)-log(x) */ + r = -reference_log(x); + if (ix >= 0x3FE76944) { + y = 1.0 - x; + i = 0; + } else if (ix >= 0x3FCDA661) { + y = x - (tc - one); + i = 1; + } else { + y = x; + i = 2; + } + } else { + r = zero; + if (ix >= 0x3FFBB4C3) { + y = 2.0 - x; + i = 0; + } /* [1.7316,2] */ + else if (ix >= 0x3FF3B4C4) { + y = x - tc; + i = 1; + } /* [1.23,1.73] */ + else { + y = x - one; + i = 2; + } + } + switch (i) { + case 0: + z = y * y; + p1 = a0 + z * (a2 + z * (a4 + z * (a6 + z * (a8 + z * a10)))); + p2 = z * (a1 + z * (a3 + z * (a5 + z * (a7 + z * (a9 + z * a11))))); + p = y * p1 + p2; + r += (p - 0.5 * y); + break; + case 1: + z = y * y; + w = z * y; + p1 = + t0 + w * (t3 + w * (t6 + w * (t9 + w * t12))); /* parallel comp */ + p2 = t1 + w * (t4 + w * (t7 + w * (t10 + w * t13))); + p3 = t2 + w * (t5 + w * (t8 + w * (t11 + w * t14))); + p = z * p1 - (tt - w * (p2 + y * p3)); + r += (tf + p); + break; + case 2: + p1 = y * (u0 + y * (u1 + y * (u2 + y * (u3 + y * (u4 + y * u5))))); + p2 = one + y * (v1 + y * (v2 + y * (v3 + y * (v4 + y * v5)))); + r += (-0.5 * y + p1 / p2); + } + } else if (ix < 0x40200000) { /* x < 8.0 */ + i = (int)x; + t = zero; + y = x - (double)i; + p = y * + (s0 + y * (s1 + y * (s2 + y * (s3 + y * (s4 + y * (s5 + y * s6)))))); + q = one + y * (r1 + y * (r2 + y * (r3 + y * (r4 + y * (r5 + y * r6))))); + r = half * y + p / q; + z = one; /* lgamma(1+s) = log(s) + lgamma(s) */ + switch (i) { + case 7: + z *= (y + 6.0); /* FALLTHRU */ + case 6: + z *= (y + 5.0); /* FALLTHRU */ + case 5: + z *= (y + 4.0); /* FALLTHRU */ + case 4: + z *= (y + 3.0); /* FALLTHRU */ + case 3: + z *= (y + 2.0); /* FALLTHRU */ + r += reference_log(z); + break; + } + /* 8.0 <= x < 2**58 */ + } else if (ix < 0x43900000) { + t = reference_log(x); + z = one / x; + y = z * z; + w = w0 + z * (w1 + y * (w2 + y * (w3 + y * (w4 + y * (w5 + y * w6))))); + r = (x - half) * (t - one) + w; + } else + /* 2**58 <= x <= inf */ + r = x * (reference_log(x) - one); + if (hx < 0) r = nadj - r; + return r; } #endif // _MSC_VER @@ -2059,9 +2246,9 @@ long double reference_cospil( long double x) return reference_sinl( xhi ); #else - // phase adjust - x += 0.5L; - + // phase adjust + x += 0.5L; + // reduce to [-0.5, 0.5] if( x < -0.5L ) x = -1.0L - x; @@ -2830,144 +3017,140 @@ long double reference_sinpil( long double x) return reference_sinl( r * M_PIL ); } -long double reference_tanpil( long double x) -{ - // set aside the sign (allows us to preserve sign of -0) - long double sign = reference_copysignl( 1.0L, x); - long double z = reference_fabsl(x); +long double reference_tanpil(long double x) { + // set aside the sign (allows us to preserve sign of -0) + long double sign = reference_copysignl(1.0L, x); + long double z = reference_fabsl(x); - // if big and even -- caution: only works if x only has single precision - if( z >= HEX_LDBL( +, 1, 0, +, 53 ) ) - { - if( z == INFINITY ) - return x - x; // nan - - return reference_copysignl( 0.0L, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n. - } - - // reduce to the range [ -0.5, 0.5 ] - long double nearest = reference_rintl( z ); // round to nearest even places n + 0.5 values in the right place for us - int64_t i = (int64_t) nearest; // test above against 0x1.0p53 avoids overflow here - z -= nearest; - - //correction for odd integer x for the right sign of zero - if( (i&1) && z == 0.0L ) - sign = -sign; - - // track changes to the sign - sign *= reference_copysignl(1.0L, z); // really should just be an xor - z = reference_fabsl(z); // remove the sign again - - // reduce once more - // If we don't do this, rounding error in z * M_PI will cause us not to return infinities properly - if( z > 0.25L ) - { - z = 0.5L - z; - return sign / reference_tanl( z * M_PIL ); // use system tan to get the right result - } - - // - return sign * reference_tanl( z * M_PIL ); // use system tan to get the right result + // if big and even -- caution: only works if x only has single precision + if (z >= HEX_LDBL(+, 1, 0, +, 53)) { + if (z == INFINITY) return x - x; // nan + + return reference_copysignl( + 0.0L, x); // tanpi ( n ) is copysign( 0.0, n) for even integers n. + } + + // reduce to the range [ -0.5, 0.5 ] + long double nearest = + reference_rintl(z); // round to nearest even places n + 0.5 values in the + // right place for us + int64_t i = + (int64_t)nearest; // test above against 0x1.0p53 avoids overflow here + z -= nearest; + + // correction for odd integer x for the right sign of zero + if ((i & 1) && z == 0.0L) sign = -sign; + + // track changes to the sign + sign *= reference_copysignl(1.0L, z); // really should just be an xor + z = reference_fabsl(z); // remove the sign again + + // reduce once more + // If we don't do this, rounding error in z * M_PI will cause us not to return + // infinities properly + if (z > 0.25L) { + z = 0.5L - z; + return sign / + reference_tanl(z * M_PIL); // use system tan to get the right result + } + + // + return sign * + reference_tanl(z * M_PIL); // use system tan to get the right result } -long double reference_pownl( long double x, int i ){ return reference_powl( x, (long double) i ); } +long double reference_pownl(long double x, int i) { + return reference_powl(x, (long double)i); +} -long double reference_powrl( long double x, long double y ) -{ - //powr ( x, y ) returns NaN for x < 0. - if( x < 0.0L ) - return cl_make_nan(); +long double reference_powrl(long double x, long double y) { + // powr ( x, y ) returns NaN for x < 0. + if (x < 0.0L) return cl_make_nan(); - //powr ( x, NaN ) returns the NaN for x >= 0. - //powr ( NaN, y ) returns the NaN. - if( isnan(x) || isnan(y) ) - return x + y; // Note: behavior different here than for pow(1,NaN), pow(NaN, 0) + // powr ( x, NaN ) returns the NaN for x >= 0. + // powr ( NaN, y ) returns the NaN. + if (isnan(x) || isnan(y)) + return x + + y; // Note: behavior different here than for pow(1,NaN), pow(NaN, 0) - if( x == 1.0L ) - { - //powr ( +1, +-inf ) returns NaN. - if( reference_fabsl(y) == INFINITY ) - return cl_make_nan(); - - //powr ( +1, y ) is 1 for finite y. (NaN handled above) - return 1.0L; - } + if (x == 1.0L) { + // powr ( +1, +-inf ) returns NaN. + if (reference_fabsl(y) == INFINITY) return cl_make_nan(); - if( y == 0.0L ) - { - //powr ( +inf, +-0 ) returns NaN. - //powr ( +-0, +-0 ) returns NaN. - if( x == 0.0L || x == INFINITY ) - return cl_make_nan(); - - //powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already handled above) - return 1.0L; - } - - if( x == 0.0L ) - { - //powr ( +-0, -inf) is +inf. - //powr ( +-0, y ) is +inf for finite y < 0. - if( y < 0.0L ) - return INFINITY; - - //powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above) - return 0.0L; - } - - return reference_powl( x, y ); + // powr ( +1, y ) is 1 for finite y. (NaN handled above) + return 1.0L; + } + + if (y == 0.0L) { + // powr ( +inf, +-0 ) returns NaN. + // powr ( +-0, +-0 ) returns NaN. + if (x == 0.0L || x == INFINITY) return cl_make_nan(); + + // powr ( x, +-0 ) is 1 for finite x > 0. (x <= 0, NaN, INF already handled + // above) + return 1.0L; + } + + if (x == 0.0L) { + // powr ( +-0, -inf) is +inf. + // powr ( +-0, y ) is +inf for finite y < 0. + if (y < 0.0L) return INFINITY; + + // powr ( +-0, y ) is +0 for y > 0. (NaN, y==0 handled above) + return 0.0L; + } + + return reference_powl(x, y); } -//long double my_fdiml( long double x, long double y){ return fdim( (double) x, (double) y ); } -long double reference_addl( long double x, long double y) -{ - volatile double a = (double) x; - volatile double b = (double) y; +// long double my_fdiml( long double x, long double y){ return fdim( (double) x, +// (double) y ); } +long double reference_addl(long double x, long double y) { + volatile double a = (double)x; + volatile double b = (double)y; -#if defined( __SSE2__ ) - // defeat x87 - __m128d va = _mm_set_sd( (double) a ); - __m128d vb = _mm_set_sd( (double) b ); - va = _mm_add_sd( va, vb ); - _mm_store_sd( (double*) &a, va ); +#if defined(__SSE2__) + // defeat x87 + __m128d va = _mm_set_sd((double)a); + __m128d vb = _mm_set_sd((double)b); + va = _mm_add_sd(va, vb); + _mm_store_sd((double*)&a, va); #else - a += b; + a += b; #endif - return (long double) a; + return (long double)a; } -long double reference_subtractl( long double x, long double y) -{ - volatile double a = (double) x; - volatile double b = (double) y; +long double reference_subtractl(long double x, long double y) { + volatile double a = (double)x; + volatile double b = (double)y; -#if defined( __SSE2__ ) - // defeat x87 - __m128d va = _mm_set_sd( (double) a ); - __m128d vb = _mm_set_sd( (double) b ); - va = _mm_sub_sd( va, vb ); - _mm_store_sd( (double*) &a, va ); +#if defined(__SSE2__) + // defeat x87 + __m128d va = _mm_set_sd((double)a); + __m128d vb = _mm_set_sd((double)b); + va = _mm_sub_sd(va, vb); + _mm_store_sd((double*)&a, va); #else - a -= b; + a -= b; #endif - return (long double) a; + return (long double)a; } -long double reference_multiplyl( long double x, long double y) -{ - volatile double a = (double) x; - volatile double b = (double) y; +long double reference_multiplyl(long double x, long double y) { + volatile double a = (double)x; + volatile double b = (double)y; -#if defined( __SSE2__ ) - // defeat x87 - __m128d va = _mm_set_sd( (double) a ); - __m128d vb = _mm_set_sd( (double) b ); - va = _mm_mul_sd( va, vb ); - _mm_store_sd( (double*) &a, va ); +#if defined(__SSE2__) + // defeat x87 + __m128d va = _mm_set_sd((double)a); + __m128d vb = _mm_set_sd((double)b); + va = _mm_mul_sd(va, vb); + _mm_store_sd((double*)&a, va); #else - a *= b; + a *= b; #endif - return (long double) a; + return (long double)a; } /*long double my_remquol( long double x, long double y, int *iptr ) @@ -2982,115 +3165,117 @@ long double reference_multiplyl( long double x, long double y) return remquo( (double) x, (double) y, iptr ); }*/ -long double reference_lgamma_rl( long double x, int *signp ) -{ -// long double lgamma_val = (long double)reference_lgamma( (double)x ); -// *signp = signgam; - *signp = 0; - return x; -} - - -int reference_isequall( long double x, long double y){ return x == y; } -int reference_isfinitel( long double x){ return 0 != isfinite(x); } -int reference_isgreaterl( long double x, long double y){ return x > y; } -int reference_isgreaterequall( long double x, long double y){ return x >= y; } -int reference_isinfl( long double x){ return 0 != isinf(x); } -int reference_islessl( long double x, long double y){ return x < y; } -int reference_islessequall( long double x, long double y){ return x <= y; } -int reference_islessgreaterl( long double x, long double y){ return 0 != islessgreater( x, y ); } -int reference_isnanl( long double x){ return 0 != isnan( x ); } -int reference_isnormall( long double x){ return 0 != isnormal( (double) x ); } -int reference_isnotequall( long double x, long double y){ return x != y; } -int reference_isorderedl( long double x, long double y){ return x == x && y == y; } -int reference_isunorderedl( long double x, long double y){ return isnan(x) || isnan( y ); } -int reference_signbitl( long double x){ return 0 != signbit( x ); } - -long double reference_copysignl( long double x, long double y); -long double reference_roundl( long double x ); +long double reference_lgamma_rl(long double x, int* signp) { + // long double lgamma_val = (long double)reference_lgamma( (double)x ); + // *signp = signgam; + *signp = 0; + return x; +} + +int reference_isequall(long double x, long double y) { return x == y; } +int reference_isfinitel(long double x) { return 0 != isfinite(x); } +int reference_isgreaterl(long double x, long double y) { return x > y; } +int reference_isgreaterequall(long double x, long double y) { return x >= y; } +int reference_isinfl(long double x) { return 0 != isinf(x); } +int reference_islessl(long double x, long double y) { return x < y; } +int reference_islessequall(long double x, long double y) { return x <= y; } +int reference_islessgreaterl(long double x, long double y) { + return 0 != islessgreater(x, y); +} +int reference_isnanl(long double x) { return 0 != isnan(x); } +int reference_isnormall(long double x) { return 0 != isnormal((double)x); } +int reference_isnotequall(long double x, long double y) { return x != y; } +int reference_isorderedl(long double x, long double y) { + return x == x && y == y; +} +int reference_isunorderedl(long double x, long double y) { + return isnan(x) || isnan(y); +} +int reference_signbitl(long double x) { return 0 != signbit(x); } + +long double reference_copysignl(long double x, long double y); +long double reference_roundl(long double x); long double reference_cbrtl(long double x); -long double reference_copysignl( long double x, long double y ) -{ - // We hope that the long double to double conversion proceeds with sign fidelity, - // even for zeros and NaNs - union{ double d; cl_ulong u;}u; u.d = (double) y; - - x = reference_fabsl(x); - if( u.u >> 63 ) - x = -x; - - return x; +long double reference_copysignl(long double x, long double y) { + // We hope that the long double to double conversion proceeds with sign + // fidelity, even for zeros and NaNs + union { + double d; + cl_ulong u; + } u; + u.d = (double)y; + + x = reference_fabsl(x); + if (u.u >> 63) x = -x; + + return x; } -long double reference_roundl( long double x ) -{ - // Since we are just using this to verify double precision, we can - // use the double precision copysign here +long double reference_roundl(long double x) { + // Since we are just using this to verify double precision, we can + // use the double precision copysign here #if defined(__MINGW32__) && defined(__x86_64__) - long double absx = reference_fabsl(x); - if (absx < 0.5L) - return reference_copysignl(0.0L, x); + long double absx = reference_fabsl(x); + if (absx < 0.5L) return reference_copysignl(0.0L, x); #endif - return round( (double) x ); + return round((double)x); } -long double reference_truncl( long double x ) -{ - // Since we are just using this to verify double precision, we can - // use the double precision copysign here - return trunc( (double) x ); +long double reference_truncl(long double x) { + // Since we are just using this to verify double precision, we can + // use the double precision copysign here + return trunc((double)x); } static long double reference_scalblnl(long double x, long n); -long double reference_cbrtl(long double x) -{ - double yhi = HEX_DBL( +, 1, 5555555555555, -, 2 ); - double ylo = HEX_DBL( +, 1, 558, -, 56 ); - - double fabsx = reference_fabs( x ); - - if( isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x) ) - return x; - - double iy = 0.0; - double log2x_hi, log2x_lo; - - // extended precision log .... accurate to at least 64-bits + couple of guard bits - __log2_ep(&log2x_hi, &log2x_lo, fabsx); - - double ylog2x_hi, ylog2x_lo; - - double y_hi = yhi; - double y_lo = ylo; - - // compute product of y*log2(x) - MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo); - - long double powxy; - if(isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) { - powxy = reference_signbit(ylog2x_hi) ? HEX_DBL( +, 0, 0, +, 0 ) : INFINITY; - } else { - // separate integer + fractional part - long int m = lrint(ylog2x_hi); - AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0); - - // revert to long double arithemtic - long double ylog2x = (long double) ylog2x_hi + (long double) ylog2x_lo; - powxy = reference_exp2l( ylog2x ); - powxy = reference_scalblnl(powxy, m); - } - - return reference_copysignl( powxy, x ); +long double reference_cbrtl(long double x) { + double yhi = HEX_DBL(+, 1, 5555555555555, -, 2); + double ylo = HEX_DBL(+, 1, 558, -, 56); + + double fabsx = reference_fabs(x); + + if (isnan(x) || fabsx == 1.0 || fabsx == 0.0 || isinf(x)) return x; + + double iy = 0.0; + double log2x_hi, log2x_lo; + + // extended precision log .... accurate to at least 64-bits + couple of guard + // bits + __log2_ep(&log2x_hi, &log2x_lo, fabsx); + + double ylog2x_hi, ylog2x_lo; + + double y_hi = yhi; + double y_lo = ylo; + + // compute product of y*log2(x) + MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo); + + long double powxy; + if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) { + powxy = reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY; + } else { + // separate integer + fractional part + long int m = lrint(ylog2x_hi); + AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0); + + // revert to long double arithemtic + long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo; + powxy = reference_exp2l(ylog2x); + powxy = reference_scalblnl(powxy, m); + } + + return reference_copysignl(powxy, x); } /* long double scalbnl( long double x, int i ) { //suitable for checking double precision scalbn only - + if( i > 3000 ) return copysignl( INFINITY, x); if( i < -3000 ) @@ -3103,7 +3288,7 @@ long double scalbnl( long double x, int i ) x *= HEX_LDBL( +, 1, 0, +, 1000 ); i -= 1000; } - + union{ cl_ulong u; double d;}u; u.u = (cl_ulong)( i + 1023 ) << 52; x *= (long double) u.d; @@ -3115,157 +3300,141 @@ long double scalbnl( long double x, int i ) x *= HEX_LDBL( +, 1, 0, -, 1000 ); i += 1000; } - + union{ cl_ulong u; double d;}u; u.u = (cl_ulong)( i + 1023 ) << 52; x *= (long double) u.d; } - + return x; } */ -long double reference_rintl( long double x ) -{ +long double reference_rintl(long double x) { #if defined(__PPC__) - // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined + // On PPC, long doubles are maintained as 2 doubles. Therefore, the combined // mantissa can represent more than LDBL_MANT_DIG binary digits. x = rintl(x); #else - static long double magic[2] = { 0.0L, 0.0L}; - - if( 0.0L == magic[0] ) - { - magic[0] = scalbnl(0.5L, LDBL_MANT_DIG); - magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG); - } + static long double magic[2] = {0.0L, 0.0L}; - if( reference_fabsl(x) < magic[0] && x != 0.0L ) - { - long double m = magic[ x < 0 ]; - x += m; - x -= m; - } -#endif // __PPC__ - return x; + if (0.0L == magic[0]) { + magic[0] = scalbnl(0.5L, LDBL_MANT_DIG); + magic[1] = scalbnl(-0.5L, LDBL_MANT_DIG); + } + + if (reference_fabsl(x) < magic[0] && x != 0.0L) { + long double m = magic[x < 0]; + x += m; + x -= m; + } +#endif // __PPC__ + return x; } // extended precision sqrt using newton iteration on 1/sqrt(x). // Final result is computed as x * 1/sqrt(x) -static void __sqrt_ep(double *rhi, double *rlo, double xhi, double xlo) -{ - // approximate reciprocal sqrt - double thi = 1.0 / sqrt( xhi ); - double tlo = 0.0; - - // One newton iteration in double-double - double yhi, ylo; - MulDD(&yhi, &ylo, thi, tlo, thi, tlo); - MulDD(&yhi, &ylo, yhi, ylo, xhi, xlo); - AddDD(&yhi, &ylo, -yhi, -ylo, 3.0, 0.0); - MulDD(&yhi, &ylo, yhi, ylo, thi, tlo); - MulDD(&yhi, &ylo, yhi, ylo, 0.5, 0.0); - - MulDD(rhi, rlo, yhi, ylo, xhi, xlo); -} - -long double reference_acoshl( long double x ) -{ -/* - * ==================================================== - * This function derived from fdlibm http://www.netlib.org - * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - * - */ - if( isnan(x) || isinf(x)) - return x + fabsl(x); - - if( x < 1.0L ) - return cl_make_nan(); - - if( x == 1.0L ) - return 0.0L; - - if( x > HEX_LDBL( +, 1, 0, +, 60 ) ) - return reference_logl(x) + 0.693147180559945309417232121458176568L; - - if( x > 2.0L ) - return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x*x - 1.0L))); - - double hi, lo; - MulD(&hi, &lo, x, x); - AddDD(&hi, &lo, hi, lo, -1.0, 0.0); - __sqrt_ep(&hi, &lo, hi, lo); - AddDD(&hi, &lo, hi, lo, x, 0.0); - double correction = lo / hi; - __log2_ep(&hi, &lo, hi); - double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ); - double log2Lo = HEX_DBL( +, 1, abc9e3b39803f, -, 56 ); - MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo); - AddDD(&hi, &lo, hi, lo, correction, 0.0); - - return hi + lo; +static void __sqrt_ep(double* rhi, double* rlo, double xhi, double xlo) { + // approximate reciprocal sqrt + double thi = 1.0 / sqrt(xhi); + double tlo = 0.0; + + // One newton iteration in double-double + double yhi, ylo; + MulDD(&yhi, &ylo, thi, tlo, thi, tlo); + MulDD(&yhi, &ylo, yhi, ylo, xhi, xlo); + AddDD(&yhi, &ylo, -yhi, -ylo, 3.0, 0.0); + MulDD(&yhi, &ylo, yhi, ylo, thi, tlo); + MulDD(&yhi, &ylo, yhi, ylo, 0.5, 0.0); + + MulDD(rhi, rlo, yhi, ylo, xhi, xlo); +} + +long double reference_acoshl(long double x) { + /* + * ==================================================== + * This function derived from fdlibm http://www.netlib.org + * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + * + */ + if (isnan(x) || isinf(x)) return x + fabsl(x); + + if (x < 1.0L) return cl_make_nan(); + + if (x == 1.0L) return 0.0L; + + if (x > HEX_LDBL(+, 1, 0, +, 60)) + return reference_logl(x) + 0.693147180559945309417232121458176568L; + + if (x > 2.0L) + return reference_logl(2.0L * x - 1.0L / (x + sqrtl(x * x - 1.0L))); + + double hi, lo; + MulD(&hi, &lo, x, x); + AddDD(&hi, &lo, hi, lo, -1.0, 0.0); + __sqrt_ep(&hi, &lo, hi, lo); + AddDD(&hi, &lo, hi, lo, x, 0.0); + double correction = lo / hi; + __log2_ep(&hi, &lo, hi); + double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1); + double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56); + MulDD(&hi, &lo, hi, lo, log2Hi, log2Lo); + AddDD(&hi, &lo, hi, lo, correction, 0.0); + + return hi + lo; +} + +long double reference_asinhl(long double x) { + long double cutoff = 0.0L; + const long double ln2 = HEX_LDBL(+, b, 17217f7d1cf79ab, -, 4); + + if (cutoff == 0.0L) cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG); + + if (isnan(x) || isinf(x)) return x + x; + + long double absx = reference_fabsl(x); + if (absx < cutoff) return x; + + long double sign = reference_copysignl(1.0L, x); + + if (absx <= 4.0 / 3.0) { + return sign * reference_log1pl(absx + x * x / (1.0 + sqrtl(1.0 + x * x))); + } else if (absx <= HEX_LDBL(+, 1, 0, +, 27)) { + return sign * + reference_logl(2.0L * absx + 1.0L / (sqrtl(x * x + 1.0) + absx)); + } else { + return sign * (reference_logl(absx) + ln2); + } } -long double reference_asinhl( long double x ) -{ - long double cutoff = 0.0L; - const long double ln2 = HEX_LDBL( +, b, 17217f7d1cf79ab, -, 4 ); - - if( cutoff == 0.0L ) - cutoff = reference_ldexpl(1.0L, -LDBL_MANT_DIG); - - if( isnan(x) || isinf(x) ) - return x + x; - - long double absx = reference_fabsl(x); - if( absx < cutoff ) - return x; +long double reference_atanhl(long double x) { + /* + * ==================================================== + * This function is from fdlibm: http://www.netlib.org + * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + if (isnan(x)) return x + x; - long double sign = reference_copysignl(1.0L, x); + long double signed_half = reference_copysignl(0.5L, x); + x = reference_fabsl(x); + if (x > 1.0L) return cl_make_nan(); - if( absx <= 4.0/3.0 ) { - return sign * reference_log1pl( absx + x*x / (1.0 + sqrtl(1.0 + x*x))); - } - else if( absx <= HEX_LDBL( +, 1, 0, +, 27 ) ) { - return sign * reference_logl( 2.0L * absx + 1.0L / (sqrtl( x * x + 1.0 ) + absx)); - } - else { - return sign * ( reference_logl( absx ) + ln2 ); - } -} + if (x < 0.5L) + return signed_half * reference_log1pl(2.0L * (x + x * x / (1 - x))); -long double reference_atanhl( long double x ) -{ -/* - * ==================================================== - * This function is from fdlibm: http://www.netlib.org - * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ - if( isnan(x) ) - return x + x; - - long double signed_half = reference_copysignl( 0.5L, x ); - x = reference_fabsl(x); - if( x > 1.0L ) - return cl_make_nan(); - - if( x < 0.5L ) - return signed_half * reference_log1pl( 2.0L * ( x + x*x / (1-x) ) ); - - return signed_half * reference_log1pl(2.0L * x / (1-x)); + return signed_half * reference_log1pl(2.0L * x / (1 - x)); } long double reference_exp2l( long double z) @@ -3466,19 +3635,16 @@ long double reference_hypotl( long double x, long double y ) long double reference_log2l( long double x ) { - if( isnan(x) || x < 0.0 || x == -INFINITY) - return NAN; - - if( x == 0.0f) - return -INFINITY; - - if( x == INFINITY ) - return INFINITY; - - double hi, lo; - __log2_ep( &hi, &lo, x); - - return (long double) hi + (long double) lo; + if (isnan(x) || x < 0.0 || x == -INFINITY) return NAN; + + if (x == 0.0f) return -INFINITY; + + if (x == INFINITY) return INFINITY; + + double hi, lo; + __log2_ep(&hi, &lo, x); + + return (long double)hi + (long double)lo; } long double reference_log1pl( long double x) @@ -3574,21 +3740,24 @@ long double reference_lgammal( long double x) return reference_lgamma( x ); } -static uint32_t two_over_pi[] = { 0x0, 0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5, 0x1993c439, 0x107f945, 0x23abdebb, 0x31586dc9, -0x6e3a424, 0x374b8019, 0x92eea09, 0x3464873f, 0x21deb1cb, 0x4a69cfb, 0x288235f5, 0xbaed121, 0xe99c702, 0x1ad17df9, -0x13991d6, 0xe60d4ce, 0x1f49c845, 0x3e2ef7e4, 0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d, 0xd9fb3c9, -0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7, 0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, -0x3046fc7b, 0x1aeafc33, 0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102, 0xdffd880, 0x135cc9cc, -0x10606155 -}; +static uint32_t two_over_pi[] = { + 0x0, 0x28be60db, 0x24e44152, 0x27f09d5f, 0x11f534dd, 0x3036d8a5, + 0x1993c439, 0x107f945, 0x23abdebb, 0x31586dc9, 0x6e3a424, 0x374b8019, + 0x92eea09, 0x3464873f, 0x21deb1cb, 0x4a69cfb, 0x288235f5, 0xbaed121, + 0xe99c702, 0x1ad17df9, 0x13991d6, 0xe60d4ce, 0x1f49c845, 0x3e2ef7e4, + 0x283b1ff8, 0x25fff781, 0x1980fef2, 0x3c462d68, 0xa6d1f6d, 0xd9fb3c9, + 0x3cb09b74, 0x3d18fd9a, 0x1e5fea2d, 0x1d49eeb1, 0x3ebe5f17, 0x2cf41ce7, + 0x378a5292, 0x3a9afed7, 0x3b11f8d5, 0x3421580c, 0x3046fc7b, 0x1aeafc33, + 0x3bc209af, 0x10d876a7, 0x2391615e, 0x3986c219, 0x199855f1, 0x1281a102, + 0xdffd880, 0x135cc9cc, 0x10606155}; -static uint32_t pi_over_two[] = { 0x1, 0x2487ed51, 0x42d1846, 0x26263314, 0x1701b839, 0x28948127 }; +static uint32_t pi_over_two[] = {0x1, 0x2487ed51, 0x42d1846, + 0x26263314, 0x1701b839, 0x28948127}; -typedef union - { - uint64_t u; - double d; - }d_ui64_t; +typedef union { + uint64_t u; + double d; +} d_ui64_t; // radix or base of representation #define RADIX (30) @@ -3604,41 +3773,40 @@ d_ui64_t two_pow_two_mradix = { (uint64_t) (1023-2*RADIX) << 52 }; // extended fixed point representation of double precision // floating point number. // x = sign * [ sum_{i = 0 to 2} ( X[i] * 2^(index - i)*RADIX ) ] -typedef struct - { - uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in base_30 - int index; // exponent bias - int sign; // sign of double - }eprep_t; +typedef struct { + uint32_t X[3]; // three 32 bit integers are sufficient to represnt double in + // base_30 + int index; // exponent bias + int sign; // sign of double +} eprep_t; static eprep_t double_to_eprep(double x); -static eprep_t double_to_eprep(double x) -{ - eprep_t result; - - result.sign = ( signbit( (float) x ) == 0) ? 1 : -1; - x = fabs( x ); - - int index = 0; - while( x > tp_pradix ) { - index++; - x *= tp_mradix; - } - while( x < 1 ) { - index--; - x *= tp_pradix; - } - - result.index = index; - int i = 0; - result.X[0] = result.X[1] = result.X[2] = 0; - while( x != 0.0 ) { - result.X[i] = (uint32_t) x; - x = (x - (double) result.X[i]) * tp_pradix; - i++; - } - return result; +static eprep_t double_to_eprep(double x) { + eprep_t result; + + result.sign = (signbit((float)x) == 0) ? 1 : -1; + x = fabs(x); + + int index = 0; + while (x > tp_pradix) { + index++; + x *= tp_mradix; + } + while (x < 1) { + index--; + x *= tp_pradix; + } + + result.index = index; + int i = 0; + result.X[0] = result.X[1] = result.X[2] = 0; + while (x != 0.0) { + result.X[i] = (uint32_t)x; + x = (x - (double)result.X[i]) * tp_pradix; + i++; + } + return result; } /* @@ -3648,26 +3816,26 @@ static eprep_t double_to_eprep(double x) uint64_t lowpart, roundbits, t1; int expo, expofinal, shift; double res; - - nb.d = (double) R[0]; - + + nb.d = (double) R[0]; + t1 = R[1]; lowpart = (t1 << RADIX) + R[2]; - expo = ((nb.u & 0x7ff0000000000000ULL) >> 52) - 1023; - + expo = ((nb.u & 0x7ff0000000000000ULL) >> 52) - 1023; + expofinal = expo + RADIX*index; - + if (expofinal > 1023) { d_ui64_t inf = { 0x7ff0000000000000ULL }; res = inf.d; } - - else if (expofinal >= -1022){ + + else if (expofinal >= -1022){ shift = expo + 2*RADIX - 53; roundbits = lowpart << (64-shift); - lowpart = lowpart >> shift; + lowpart = lowpart >> shift; if (lowpart & 0x0000000000000001ULL) { - if(roundbits == 0) { + if(roundbits == 0) { int i; for (i=3; i < digits; i++) roundbits = roundbits | R[i]; @@ -3678,46 +3846,46 @@ static eprep_t double_to_eprep(double x) else rndcorr.d = 0.0; } - else + else rndcorr.u = (uint64_t) (expo - 52 + 1023) << 52; } else{ rndcorr.d = 0.0; } - + lowpart = lowpart >> 1; - nb.u = nb.u | lowpart; - res = nb.d + rndcorr.d; - + nb.u = nb.u | lowpart; + res = nb.d + rndcorr.d; + if(index*RADIX + 1023 > 0) { nb.u = 0; - nb.u = (uint64_t) (index*RADIX + 1023) << 52; + nb.u = (uint64_t) (index*RADIX + 1023) << 52; res *= nb.d; } - else { + else { nb.u = 0; - nb.u = (uint64_t) (index*RADIX + 1023 + 2*RADIX) << 52; - res *= two_pow_two_mradix.d; - res *= nb.d; + nb.u = (uint64_t) (index*RADIX + 1023 + 2*RADIX) << 52; + res *= two_pow_two_mradix.d; + res *= nb.d; } - } - else { + } + else { if (expofinal < -1022 - 53 ) { res = 0.0; } else { - lowpart = lowpart >> (expo + (2*RADIX) - 52); - nb.u = nb.u | lowpart; + lowpart = lowpart >> (expo + (2*RADIX) - 52); + nb.u = nb.u | lowpart; nb.u = (nb.u & 0x000FFFFFFFFFFFFFULL) | 0x0010000000000000ULL; nb.u = nb.u >> (-1023 - expofinal); if(nb.u & 0x0000000000000001ULL) rndcorr.u = 1; else rndcorr.d = 0.0; - res = 0.5*(nb.d + rndcorr.d); - } - } - + res = 0.5*(nb.d + rndcorr.d); + } + } + return sgn*res; } */ @@ -3725,173 +3893,180 @@ static double eprep_to_double( eprep_t epx ); static double eprep_to_double( eprep_t epx ) { - double res = 0.0; - - res += ldexp((double) epx.X[0], (epx.index - 0)*RADIX); - res += ldexp((double) epx.X[1], (epx.index - 1)*RADIX); - res += ldexp((double) epx.X[2], (epx.index - 2)*RADIX); - - return copysign(res, epx.sign); + double res = 0.0; + + res += ldexp((double)epx.X[0], (epx.index - 0) * RADIX); + res += ldexp((double)epx.X[1], (epx.index - 1) * RADIX); + res += ldexp((double)epx.X[2], (epx.index - 2) * RADIX); + + return copysign(res, epx.sign); } static int payne_hanek( double *y, int *exception ); -static int payne_hanek( double *y, int *exception ) -{ - double x = *y; - - // exception cases .. no reduction required - if( isnan( x ) || isinf( x ) || (fabs( x ) <= M_PI_4) ) { - *exception = 1; - return 0; - } - - *exception = 0; - - // After computation result[0] contains integer part while result[1]....result[DIGITS-1] - // contain fractional part. So we are doing computation with (DIGITS-1)*RADIX precision. - // Default DIGITS=6 and RADIX=30 so default precision is 150 bits. Kahan-McDonald algorithm - // shows that a double precision x, closest to pi/2 is 6381956970095103 x 2^797 which can - // cause 61 digits of cancellation in computation of f = x*2/pi - floor(x*2/pi) ... thus we need - // at least 114 bits (61 leading zeros + 53 bits of mentissa of f) of precision to accurately compute - // f in double precision. Since we are using 150 bits (still an overkill), we should be safe. Extra - // bits can act as guard bits for correct rounding. - uint64_t result[DIGITS+2]; - - // compute extended precision representation of x - eprep_t epx = double_to_eprep( x ); - int index = epx.index; - int i, j; - // extended precision multiplication of 2/pi*x .... we will loose at max two RADIX=30 bit digits in - // the worst case - for(i = 0; i < (DIGITS+2); i++) { - result[i] = 0; - result[i] += ((index + i - 0) >= 0) ? ((uint64_t) two_over_pi[index + i - 0] * (uint64_t) epx.X[0]) : 0; - result[i] += ((index + i - 1) >= 0) ? ((uint64_t) two_over_pi[index + i - 1] * (uint64_t) epx.X[1]) : 0; - result[i] += ((index + i - 2) >= 0) ? ((uint64_t) two_over_pi[index + i - 2] * (uint64_t) epx.X[2]) : 0; - } - - // Carry propagation. - uint64_t tmp; - for(i = DIGITS+2-1; i > 0; i--) { - tmp = result[i] >> RADIX; - result[i - 1] += tmp; - result[i] -= (tmp << RADIX); - } - - // we dont ned to normalize the integer part since only last two bits of this will be used - // subsequently algorithm which remain unaltered by this normalization. - // tmp = result[0] >> RADIX; - // result[0] -= (tmp << RADIX); - unsigned int N = (unsigned int) result[0]; - - // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that testing if the final - // x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent to testing, at this stage, if r[1] (the first fractional - // digit) is greater than (2^RADIX)/2 and substracting pi/4 from x_star to bring it to mentioned - // range is equivalent to substracting fractional part at this stage from one and changing the sign. - int sign = 1; - if(result[1] > (uint64_t)(1 << (RADIX - 1))) { - for(i = 1; i < (DIGITS + 2); i++) - result[i] = (~((unsigned int)result[i]) & 0x3fffffff); - N += 1; - sign = -1; - } - - // Again as per Kahan-McDonald algorithim there may be 61 leading zeros in the worst case - // (when x is multiple of 2/pi very close to an integer) so we need to get rid of these zeros - // and adjust the index of final result. So in the worst case, precision of comupted result is - // 90 bits (150 bits original bits - 60 lost in cancellation). - int ind = 1; - for(i = 1; i < (DIGITS+2); i++) { - if(result[i] != 0) - break; - else - ind++; - } - - uint64_t r[DIGITS-1]; - for(i = 0; i < (DIGITS-1); i++) { - r[i] = 0; - for(j = 0; j <= i; j++) { - r[i] += (result[ind+i-j] * (uint64_t) pi_over_two[j]); - } - } - for(i = (DIGITS-2); i > 0; i--) { - tmp = r[i] >> RADIX; - r[i - 1] += tmp; - r[i] -= (tmp << RADIX); - } - tmp = r[0] >> RADIX; - r[0] -= (tmp << RADIX); - - eprep_t epr; - epr.sign = epx.sign*sign; - if(tmp != 0) { - epr.index = -ind + 1; - epr.X[0] = (uint32_t) tmp; - epr.X[1] = (uint32_t) r[0]; - epr.X[2] = (uint32_t) r[1]; - } - else { - epr.index = -ind; - epr.X[0] = (uint32_t) r[0]; - epr.X[1] = (uint32_t) r[1]; - epr.X[2] = (uint32_t) r[2]; - } - - *y = eprep_to_double( epr ); - return epx.sign*N; -} - -double reference_relaxed_cos(double x) -{ - if(isnan(x)) - return NAN; +static int payne_hanek(double* y, int* exception) { + double x = *y; + + // exception cases .. no reduction required + if (isnan(x) || isinf(x) || (fabs(x) <= M_PI_4)) { + *exception = 1; + return 0; + } + + *exception = 0; + + // After computation result[0] contains integer part while + // result[1]....result[DIGITS-1] contain fractional part. So we are + // doing computation with (DIGITS-1)*RADIX precision. Default DIGITS=6 + // and RADIX=30 so default precision is 150 bits. Kahan-McDonald + // algorithm shows that a double precision x, closest to pi/2 is + // 6381956970095103 x 2^797 which can cause 61 digits of cancellation in + // computation of f = x*2/pi - floor(x*2/pi) ... thus we need at least + // 114 bits (61 leading zeros + 53 bits of mentissa of f) of precision + // to accurately compute f in double precision. Since we are using 150 + // bits (still an overkill), we should be safe. Extra bits can act as + // guard bits for correct rounding. + uint64_t result[DIGITS + 2]; + + // compute extended precision representation of x + eprep_t epx = double_to_eprep(x); + int index = epx.index; + int i, j; + // extended precision multiplication of 2/pi*x .... we will loose at max + // two RADIX=30 bit digits in the worst case + for (i = 0; i < (DIGITS + 2); i++) { + result[i] = 0; + result[i] += + ((index + i - 0) >= 0) + ? ((uint64_t)two_over_pi[index + i - 0] * (uint64_t)epx.X[0]) + : 0; + result[i] += + ((index + i - 1) >= 0) + ? ((uint64_t)two_over_pi[index + i - 1] * (uint64_t)epx.X[1]) + : 0; + result[i] += + ((index + i - 2) >= 0) + ? ((uint64_t)two_over_pi[index + i - 2] * (uint64_t)epx.X[2]) + : 0; + } + + // Carry propagation. + uint64_t tmp; + for (i = DIGITS + 2 - 1; i > 0; i--) { + tmp = result[i] >> RADIX; + result[i - 1] += tmp; + result[i] -= (tmp << RADIX); + } + + // we dont ned to normalize the integer part since only last two bits of + // this will be used subsequently algorithm which remain unaltered by + // this normalization. tmp = result[0] >> RADIX; result[0] -= (tmp << + // RADIX); + unsigned int N = (unsigned int)result[0]; + + // if the result is > pi/4, bring it to (-pi/4, pi/4] range. Note that + // testing if the final x_star = pi/2*(x*2/pi - k) > pi/4 is equivalent + // to testing, at this stage, if r[1] (the first fractional digit) is + // greater than (2^RADIX)/2 and substracting pi/4 from x_star to bring + // it to mentioned range is equivalent to substracting fractional part + // at this stage from one and changing the sign. + int sign = 1; + if (result[1] > (uint64_t)(1 << (RADIX - 1))) { + for (i = 1; i < (DIGITS + 2); i++) + result[i] = (~((unsigned int)result[i]) & 0x3fffffff); + N += 1; + sign = -1; + } + + // Again as per Kahan-McDonald algorithim there may be 61 leading zeros + // in the worst case (when x is multiple of 2/pi very close to an + // integer) so we need to get rid of these zeros and adjust the index of + // final result. So in the worst case, precision of comupted result is + // 90 bits (150 bits original bits - 60 lost in cancellation). + int ind = 1; + for (i = 1; i < (DIGITS + 2); i++) { + if (result[i] != 0) + break; + else + ind++; + } + + uint64_t r[DIGITS - 1]; + for (i = 0; i < (DIGITS - 1); i++) { + r[i] = 0; + for (j = 0; j <= i; j++) { + r[i] += (result[ind + i - j] * (uint64_t)pi_over_two[j]); + } + } + for (i = (DIGITS - 2); i > 0; i--) { + tmp = r[i] >> RADIX; + r[i - 1] += tmp; + r[i] -= (tmp << RADIX); + } + tmp = r[0] >> RADIX; + r[0] -= (tmp << RADIX); + + eprep_t epr; + epr.sign = epx.sign * sign; + if (tmp != 0) { + epr.index = -ind + 1; + epr.X[0] = (uint32_t)tmp; + epr.X[1] = (uint32_t)r[0]; + epr.X[2] = (uint32_t)r[1]; + } else { + epr.index = -ind; + epr.X[0] = (uint32_t)r[0]; + epr.X[1] = (uint32_t)r[1]; + epr.X[2] = (uint32_t)r[2]; + } + + *y = eprep_to_double(epr); + return epx.sign * N; +} + +double reference_relaxed_cos(double x) { + if (isnan(x)) return NAN; return (float)cos((float)x); } -double reference_cos(double x) -{ - int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return cos( x ); - unsigned int c = N & 3; - switch ( c ) { - case 0: - return cos( x ); - case 1: - return -sin( x ); - case 2: - return -cos( x ); - case 3: - return sin( x ); - } - return 0.0; +double reference_cos(double x) { + int exception; + int N = payne_hanek(&x, &exception); + if (exception) return cos(x); + unsigned int c = N & 3; + switch (c) { + case 0: + return cos(x); + case 1: + return -sin(x); + case 2: + return -cos(x); + case 3: + return sin(x); + } + return 0.0; } double reference_relaxed_sin(double x){ return (float)sin((float)x); } -double reference_sin(double x) -{ - int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return sin( x ); - int c = N & 3; - switch ( c ) { - case 0: - return sin( x ); - case 1: - return cos( x ); - case 2: - return -sin( x ); - case 3: - return -cos( x ); - } - return 0.0; +double reference_sin(double x) { + int exception; + int N = payne_hanek(&x, &exception); + if (exception) return sin(x); + int c = N & 3; + switch (c) { + case 0: + return sin(x); + case 1: + return cos(x); + case 2: + return -sin(x); + case 3: + return -cos(x); + } + return 0.0; } double reference_relaxed_sincos(double x, double * y){ @@ -3899,165 +4074,152 @@ double reference_relaxed_sincos(double x, double * y){ return reference_relaxed_sin(x); } -double reference_sincos(double x, double *y) -{ - int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) { - *y = cos( x ); - return sin( x ); - } - int c = N & 3; - switch ( c ) { - case 0: - *y = cos( x ); - return sin( x ); - case 1: - *y = -sin( x ); - return cos( x ); - case 2: - *y = -cos( x ); - return -sin( x ); - case 3: - *y = sin( x ); - return -cos( x ); - } - return 0.0; +double reference_sincos(double x, double* y) { + int exception; + int N = payne_hanek(&x, &exception); + if (exception) { + *y = cos(x); + return sin(x); + } + int c = N & 3; + switch (c) { + case 0: + *y = cos(x); + return sin(x); + case 1: + *y = -sin(x); + return cos(x); + case 2: + *y = -cos(x); + return -sin(x); + case 3: + *y = sin(x); + return -cos(x); + } + return 0.0; } double reference_relaxed_tan(double x){ return ((float) reference_relaxed_sin((float)x))/((float) reference_relaxed_cos((float)x)); } -double reference_tan(double x) -{ - int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return tan( x ); - int c = N & 3; - switch ( c ) { - case 0: - return tan( x ); - case 1: - return -1.0 / tan( x ); - case 2: - return tan( x ); - case 3: - return -1.0 / tan( x ); - } - return 0.0; -} - -long double reference_cosl(long double xx) -{ - double x = (double) xx; - int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return cosl( x ); - unsigned int c = N & 3; - switch ( c ) { - case 0: - return cosl( x ); - case 1: - return -sinl( x ); - case 2: - return -cosl( x ); - case 3: - return sinl( x ); - } - return 0.0; -} - -long double reference_sinl(long double xx) -{ - // we use system tanl after reduction which - // can flush denorm input to zero so - //take care of it here. - if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 )) - return xx; - - double x = (double) xx; - int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return sinl( x ); - int c = N & 3; - switch ( c ) { - case 0: - return sinl( x ); - case 1: - return cosl( x ); - case 2: - return -sinl( x ); - case 3: - return -cosl( x ); - } - return 0.0; -} - -long double reference_sincosl(long double xx, long double *y) -{ - // we use system tanl after reduction which - // can flush denorm input to zero so - //take care of it here. - if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 )) - { - *y = cosl(xx); - return xx; - } - - double x = (double) xx; - int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) { - *y = cosl( x ); - return sinl( x ); - } - int c = N & 3; - switch ( c ) { - case 0: - *y = cosl( x ); - return sinl( x ); - case 1: - *y = -sinl( x ); - return cosl( x ); - case 2: - *y = -cosl( x ); - return -sinl( x ); - case 3: - *y = sinl( x ); - return -cosl( x ); - } - return 0.0; -} - -long double reference_tanl(long double xx) -{ - // we use system tanl after reduction which - // can flush denorm input to zero so - //take care of it here. - if(reference_fabsl(xx) < HEX_DBL( +, 1, 0, -, 1022 )) - return xx; - - double x = (double) xx; - int exception; - int N = payne_hanek( &x, &exception ); - if( exception ) - return tanl( x ); - int c = N & 3; - switch ( c ) { - case 0: - return tanl( x ); - case 1: - return -1.0 / tanl( x ); - case 2: - return tanl( x ); - case 3: - return -1.0 / tanl( x ); - } - return 0.0; +double reference_tan(double x) { + int exception; + int N = payne_hanek(&x, &exception); + if (exception) return tan(x); + int c = N & 3; + switch (c) { + case 0: + return tan(x); + case 1: + return -1.0 / tan(x); + case 2: + return tan(x); + case 3: + return -1.0 / tan(x); + } + return 0.0; +} + +long double reference_cosl(long double xx) { + double x = (double)xx; + int exception; + int N = payne_hanek(&x, &exception); + if (exception) return cosl(x); + unsigned int c = N & 3; + switch (c) { + case 0: + return cosl(x); + case 1: + return -sinl(x); + case 2: + return -cosl(x); + case 3: + return sinl(x); + } + return 0.0; +} + +long double reference_sinl(long double xx) { + // we use system tanl after reduction which + // can flush denorm input to zero so + // take care of it here. + if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx; + + double x = (double)xx; + int exception; + int N = payne_hanek(&x, &exception); + if (exception) return sinl(x); + int c = N & 3; + switch (c) { + case 0: + return sinl(x); + case 1: + return cosl(x); + case 2: + return -sinl(x); + case 3: + return -cosl(x); + } + return 0.0; +} + +long double reference_sincosl(long double xx, long double* y) { + // we use system tanl after reduction which + // can flush denorm input to zero so + // take care of it here. + if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) { + *y = cosl(xx); + return xx; + } + + double x = (double)xx; + int exception; + int N = payne_hanek(&x, &exception); + if (exception) { + *y = cosl(x); + return sinl(x); + } + int c = N & 3; + switch (c) { + case 0: + *y = cosl(x); + return sinl(x); + case 1: + *y = -sinl(x); + return cosl(x); + case 2: + *y = -cosl(x); + return -sinl(x); + case 3: + *y = sinl(x); + return -cosl(x); + } + return 0.0; +} + +long double reference_tanl(long double xx) { + // we use system tanl after reduction which + // can flush denorm input to zero so + // take care of it here. + if (reference_fabsl(xx) < HEX_DBL(+, 1, 0, -, 1022)) return xx; + + double x = (double)xx; + int exception; + int N = payne_hanek(&x, &exception); + if (exception) return tanl(x); + int c = N & 3; + switch (c) { + case 0: + return tanl(x); + case 1: + return -1.0 / tanl(x); + case 2: + return tanl(x); + case 3: + return -1.0 / tanl(x); + } + return 0.0; } static double __loglTable1[64][3] = { @@ -4205,552 +4367,491 @@ static double __loglTable3[8][3] = { {HEX_DBL( +, 1, ffc8061f5492b, -, 1 ), HEX_DBL( +, 1, 43183c878274e, -, 11 ), HEX_DBL( +, 1, 5885dd1eb6582, -, 65 )} }; -static void __log2_ep(double *hi, double *lo, double x) -{ - union { uint64_t i; double d; } uu; - - int m; - double f = reference_frexp(x, &m); - - // bring f in [0.75, 1.5) - if( f < 0.75 ) { - f *= 2.0; - m -= 1; - } - - // index first table .... brings down to [1-2^-7, 1+2^6) - uu.d = f; - int index = (int) (((uu.i + ((uint64_t) 1 << 51)) & 0x000fc00000000000ULL) >> 46); - double r1 = __loglTable1[index][0]; - double logr1hi = __loglTable1[index][1]; - double logr1lo = __loglTable1[index][2]; - // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since |m| <= 1023 - // which needs 10bits at max, we can directly add m to log1hi without spilling - logr1hi += m; - - // argument reduction needs to be in double-double since reduced argument will form the - // leading term of polynomial approximation which sets the precision we eventually achieve - double zhi, zlo; - MulD(&zhi, &zlo, r1, uu.d); - - // second index table .... brings down to [1-2^-12, 1+2^-11) - uu.d = zhi; - index = (int) (((uu.i + ((uint64_t) 1 << 46)) & 0x00007e0000000000ULL) >> 41); - double r2 = __loglTable2[index][0]; - double logr2hi = __loglTable2[index][1]; - double logr2lo = __loglTable2[index][2]; - - // reduce argument - MulDD(&zhi, &zlo, zhi, zlo, r2, 0.0); - - // third index table .... brings down to [1-2^-14, 1+2^-13) - // Actually reduction to 2^-11 would have been sufficient to calculate - // second order term in polynomial in double rather than double-double, I - // reduced it a bit more to make sure other systematic arithmetic errors - // are guarded against .... also this allow lower order product of leading polynomial - // term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather than double-double ... - // hence only term that needs to be done in double-double is Ao_hi*z_hi - uu.d = zhi; - index = (int) (((uu.i + ((uint64_t) 1 << 41)) & 0x0000038000000000ULL) >> 39); - double r3 = __loglTable3[index][0]; - double logr3hi = __loglTable3[index][1]; - double logr3lo = __loglTable3[index][2]; - - // log2(x) = m + log2(r1) + log2(r1) + log2(1 + (zh + zlo)) - // calculate sum of first three terms ... note that m has already - // been added to log2(r1)_hi - double log2hi, log2lo; - AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo); - AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo); - - // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after this - MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0); - // we dont need to do full double-double substract here. substracting 1.0 for higher - // term is exact - zhi = zhi - 1.0; - // normalize - AddD(&zhi, &zlo, zhi, zlo); - - // polynomail fitting to compute log2(1 + z) ... forth order polynomial fit - // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in [-2^-14, 2^-13] - // log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4 - // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5 - // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 + A4*z^3) - // since we are looking for at least 64 digits of precision and z in [-2^-14, 2^-13], final term - // can be done in double .... also Aolo*zhi + Aohi*zlo can be done in double .... - // Aohi*zhi needs to be done in double-double - - double Aohi = HEX_DBL( +, 1, 71547652b82fe, +, 0 ); - double Aolo = HEX_DBL( +, 1, 777c9cbb675c, -, 56 ); - double y; - y = HEX_DBL( +, 1, 276d2736fade7, -, 2 ); - y = HEX_DBL( -, 1, 7154765782df1, -, 2 ) + y*zhi; - y = HEX_DBL( +, 1, ec709dc3a0f67, -, 2 ) + y*zhi; - y = HEX_DBL( -, 1, 71547652b82fe, -, 1 ) + y*zhi; - double zhisq = zhi*zhi; - y = y*zhisq; - y = y + zhi*Aolo; - y = y + zlo*Aohi; - - MulD(&zhi, &zlo, Aohi, zhi); - AddDD(&zhi, &zlo, zhi, zlo, y, 0.0); - AddDD(&zhi, &zlo, zhi, zlo, log2hi, log2lo); - - *hi = zhi; - *lo = zlo; -} - -long double reference_powl( long double x, long double y ) -{ - - - // this will be used for testing doubles i.e. arguments will - // be doubles so cast the input back to double ... returned - // result will be long double though .... > 53 bits of precision - // if platform allows. - // =========== - // New finding. - // =========== - // this function is getting used for computing reference cube root (cbrt) - // as follows __powl( x, 1.0L/3.0L ) so if the y are assumed to - // be double and is converted from long double to double, truncation - // causes errors. So we need to tread y as long double and convert it - // to hi, lo doubles when performing y*log2(x). - -// double x = (double) xx; -// double y = (double) yy; - - static const double neg_epsilon = HEX_DBL( +, 1, 0, +, 53 ); - - //if x = 1, return x for any y, even NaN - if( x == 1.0 ) - return x; - - //if y == 0, return 1 for any x, even NaN - if( y == 0.0 ) - return 1.0L; - - //get NaNs out of the way - if( x != x || y != y ) - return x + y; - - //do the work required to sort out edge cases - double fabsy = reference_fabs( y ); - double fabsx = reference_fabs( x ); - double iy = reference_rint( fabsy ); //we do round to nearest here so that |fy| <= 0.5 - if( iy > fabsy )//convert nearbyint to floor - iy -= 1.0; - int isOddInt = 0; - if( fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon ) - isOddInt = (int) (iy - 2.0 * rint( 0.5 * iy )); //might be 0, -1, or 1 - - ///test a few more edge cases - //deal with x == 0 cases - if( x == 0.0 ) - { - if( ! isOddInt ) - x = 0.0; - - if( y < 0 ) - x = 1.0/ x; - - return x; - } - - //x == +-Inf cases - if( isinf(fabsx) ) - { - if( x < 0 ) - { - if( isOddInt ) - { - if( y < 0 ) - return -0.0; - else - return -INFINITY; - } - else - { - if( y < 0 ) - return 0.0; - else - return INFINITY; - } - } - - if( y < 0 ) - return 0; - return INFINITY; - } - - //y = +-inf cases - if( isinf(fabsy) ) - { - if( x == -1 ) - return 1; - - if( y < 0 ) - { - if( fabsx < 1 ) - return INFINITY; - return 0; - } - if( fabsx < 1 ) - return 0; - return INFINITY; - } - - // x < 0 and y non integer case - if( x < 0 && iy != fabsy ) - { - //return nan; - return cl_make_nan(); - } - - //speedy resolution of sqrt and reciprocal sqrt - if( fabsy == 0.5 ) - { - long double xl = sqrtl( x ); - if( y < 0 ) - xl = 1.0/ xl; - return xl; - } - - double log2x_hi, log2x_lo; - - // extended precision log .... accurate to at least 64-bits + couple of guard bits - __log2_ep(&log2x_hi, &log2x_lo, fabsx); - - double ylog2x_hi, ylog2x_lo; - - double y_hi = (double) y; - double y_lo = (double) ( y - (long double) y_hi); - - // compute product of y*log2(x) - // scale to avoid overflow in double-double multiplication - if( reference_fabs( y ) > HEX_DBL( +, 1, 0, +, 970 ) ) { - y_hi = reference_ldexp(y_hi, -53); - y_lo = reference_ldexp(y_lo, -53); - } - MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo); - if( fabs( y ) > HEX_DBL( +, 1, 0, +, 970 ) ) { - ylog2x_hi = reference_ldexp(ylog2x_hi, 53); - ylog2x_lo = reference_ldexp(ylog2x_lo, 53); - } - - long double powxy; - if(isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) { - powxy = reference_signbit(ylog2x_hi) ? HEX_DBL( +, 0, 0, +, 0 ) : INFINITY; - } else { - // separate integer + fractional part - long int m = lrint(ylog2x_hi); - AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0); - - // revert to long double arithemtic - long double ylog2x = (long double) ylog2x_hi + (long double) ylog2x_lo; - long double tmp = reference_exp2l( ylog2x ); - powxy = reference_scalblnl(tmp, m); +static void __log2_ep(double* hi, double* lo, double x) { + union { + uint64_t i; + double d; + } uu; + + int m; + double f = reference_frexp(x, &m); + + // bring f in [0.75, 1.5) + if (f < 0.75) { + f *= 2.0; + m -= 1; + } + + // index first table .... brings down to [1-2^-7, 1+2^6) + uu.d = f; + int index = + (int)(((uu.i + ((uint64_t)1 << 51)) & 0x000fc00000000000ULL) >> 46); + double r1 = __loglTable1[index][0]; + double logr1hi = __loglTable1[index][1]; + double logr1lo = __loglTable1[index][2]; + // since log1rhi has 39 bits of precision, we have 14 bit in hand ... since + // |m| <= 1023 which needs 10bits at max, we can directly add m to log1hi + // without spilling + logr1hi += m; + + // argument reduction needs to be in double-double since reduced argument will + // form the leading term of polynomial approximation which sets the precision + // we eventually achieve + double zhi, zlo; + MulD(&zhi, &zlo, r1, uu.d); + + // second index table .... brings down to [1-2^-12, 1+2^-11) + uu.d = zhi; + index = (int)(((uu.i + ((uint64_t)1 << 46)) & 0x00007e0000000000ULL) >> 41); + double r2 = __loglTable2[index][0]; + double logr2hi = __loglTable2[index][1]; + double logr2lo = __loglTable2[index][2]; + + // reduce argument + MulDD(&zhi, &zlo, zhi, zlo, r2, 0.0); + + // third index table .... brings down to [1-2^-14, 1+2^-13) + // Actually reduction to 2^-11 would have been sufficient to calculate + // second order term in polynomial in double rather than double-double, I + // reduced it a bit more to make sure other systematic arithmetic errors + // are guarded against .... also this allow lower order product of leading + // polynomial term i.e. Ao_hi*z_lo + Ao_lo*z_hi to be done in double rather + // than double-double ... hence only term that needs to be done in + // double-double is Ao_hi*z_hi + uu.d = zhi; + index = (int)(((uu.i + ((uint64_t)1 << 41)) & 0x0000038000000000ULL) >> 39); + double r3 = __loglTable3[index][0]; + double logr3hi = __loglTable3[index][1]; + double logr3lo = __loglTable3[index][2]; + + // log2(x) = m + log2(r1) + log2(r1) + log2(1 + (zh + zlo)) + // calculate sum of first three terms ... note that m has already + // been added to log2(r1)_hi + double log2hi, log2lo; + AddDD(&log2hi, &log2lo, logr1hi, logr1lo, logr2hi, logr2lo); + AddDD(&log2hi, &log2lo, logr3hi, logr3lo, log2hi, log2lo); + + // final argument reduction .... zhi will be in [1-2^-14, 1+2^-13) after this + MulDD(&zhi, &zlo, zhi, zlo, r3, 0.0); + // we dont need to do full double-double substract here. substracting 1.0 for + // higher term is exact + zhi = zhi - 1.0; + // normalize + AddD(&zhi, &zlo, zhi, zlo); + + // polynomail fitting to compute log2(1 + z) ... forth order polynomial fit + // to log2(1 + z)/z gives minimax absolute error of O(2^-76) with z in + // [-2^-14, 2^-13] log2(1 + z)/z = Ao + A1*z + A2*z^2 + A3*z^3 + A4*z^4 + // => log2(1 + z) = Ao*z + A1*z^2 + A2*z^3 + A3*z^4 + A4*z^5 + // => log2(1 + z) = (Aohi + Aolo)*(zhi + zlo) + z^2*(A1 + A2*z + A3*z^2 + + // A4*z^3) since we are looking for at least 64 digits of precision and z in + // [-2^-14, 2^-13], final term can be done in double .... also Aolo*zhi + + // Aohi*zlo can be done in double .... Aohi*zhi needs to be done in + // double-double + + double Aohi = HEX_DBL(+, 1, 71547652b82fe, +, 0); + double Aolo = HEX_DBL(+, 1, 777c9cbb675c, -, 56); + double y; + y = HEX_DBL(+, 1, 276d2736fade7, -, 2); + y = HEX_DBL(-, 1, 7154765782df1, -, 2) + y * zhi; + y = HEX_DBL(+, 1, ec709dc3a0f67, -, 2) + y * zhi; + y = HEX_DBL(-, 1, 71547652b82fe, -, 1) + y * zhi; + double zhisq = zhi * zhi; + y = y * zhisq; + y = y + zhi * Aolo; + y = y + zlo * Aohi; + + MulD(&zhi, &zlo, Aohi, zhi); + AddDD(&zhi, &zlo, zhi, zlo, y, 0.0); + AddDD(&zhi, &zlo, zhi, zlo, log2hi, log2lo); + + *hi = zhi; + *lo = zlo; +} + +long double reference_powl(long double x, long double y) { + // this will be used for testing doubles i.e. arguments will + // be doubles so cast the input back to double ... returned + // result will be long double though .... > 53 bits of precision + // if platform allows. + // =========== + // New finding. + // =========== + // this function is getting used for computing reference cube root (cbrt) + // as follows __powl( x, 1.0L/3.0L ) so if the y are assumed to + // be double and is converted from long double to double, truncation + // causes errors. So we need to tread y as long double and convert it + // to hi, lo doubles when performing y*log2(x). + + // double x = (double) xx; + // double y = (double) yy; + + static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53); + + // if x = 1, return x for any y, even NaN + if (x == 1.0) return x; + + // if y == 0, return 1 for any x, even NaN + if (y == 0.0) return 1.0L; + + // get NaNs out of the way + if (x != x || y != y) return x + y; + + // do the work required to sort out edge cases + double fabsy = reference_fabs(y); + double fabsx = reference_fabs(x); + double iy = + reference_rint(fabsy); // we do round to nearest here so that |fy| <= 0.5 + if (iy > fabsy) // convert nearbyint to floor + iy -= 1.0; + int isOddInt = 0; + if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon) + isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1 + + /// test a few more edge cases + // deal with x == 0 cases + if (x == 0.0) { + if (!isOddInt) x = 0.0; + + if (y < 0) x = 1.0 / x; + + return x; + } + + // x == +-Inf cases + if (isinf(fabsx)) { + if (x < 0) { + if (isOddInt) { + if (y < 0) + return -0.0; + else + return -INFINITY; + } else { + if (y < 0) + return 0.0; + else + return INFINITY; + } } - - // if y is odd integer and x is negative, reverse sign - if( isOddInt & reference_signbit(x)) - powxy = -powxy; - return powxy; + + if (y < 0) return 0; + return INFINITY; + } + + // y = +-inf cases + if (isinf(fabsy)) { + if (x == -1) return 1; + + if (y < 0) { + if (fabsx < 1) return INFINITY; + return 0; + } + if (fabsx < 1) return 0; + return INFINITY; + } + + // x < 0 and y non integer case + if (x < 0 && iy != fabsy) { + // return nan; + return cl_make_nan(); + } + + // speedy resolution of sqrt and reciprocal sqrt + if (fabsy == 0.5) { + long double xl = sqrtl(x); + if (y < 0) xl = 1.0 / xl; + return xl; + } + + double log2x_hi, log2x_lo; + + // extended precision log .... accurate to at least 64-bits + couple of guard + // bits + __log2_ep(&log2x_hi, &log2x_lo, fabsx); + + double ylog2x_hi, ylog2x_lo; + + double y_hi = (double)y; + double y_lo = (double)(y - (long double)y_hi); + + // compute product of y*log2(x) + // scale to avoid overflow in double-double multiplication + if (reference_fabs(y) > HEX_DBL(+, 1, 0, +, 970)) { + y_hi = reference_ldexp(y_hi, -53); + y_lo = reference_ldexp(y_lo, -53); + } + MulDD(&ylog2x_hi, &ylog2x_lo, log2x_hi, log2x_lo, y_hi, y_lo); + if (fabs(y) > HEX_DBL(+, 1, 0, +, 970)) { + ylog2x_hi = reference_ldexp(ylog2x_hi, 53); + ylog2x_lo = reference_ldexp(ylog2x_lo, 53); + } + + long double powxy; + if (isinf(ylog2x_hi) || (reference_fabs(ylog2x_hi) > 2200)) { + powxy = reference_signbit(ylog2x_hi) ? HEX_DBL(+, 0, 0, +, 0) : INFINITY; + } else { + // separate integer + fractional part + long int m = lrint(ylog2x_hi); + AddDD(&ylog2x_hi, &ylog2x_lo, ylog2x_hi, ylog2x_lo, -m, 0.0); + + // revert to long double arithemtic + long double ylog2x = (long double)ylog2x_hi + (long double)ylog2x_lo; + long double tmp = reference_exp2l(ylog2x); + powxy = reference_scalblnl(tmp, m); + } + + // if y is odd integer and x is negative, reverse sign + if (isOddInt & reference_signbit(x)) powxy = -powxy; + return powxy; } -double reference_nextafter(double xx, double yy) -{ - float x = (float) xx; - float y = (float) yy; - - // take care of nans - if( x != x ) - return x; - - if( y != y ) - return y; - - if( x == y ) - return y; - - int32f_t a, b; - - a.f = x; - b.f = y; - - if( a.i & 0x80000000 ) - a.i = 0x80000000 - a.i; - if(b.i & 0x80000000 ) - b.i = 0x80000000 - b.i; - - a.i += (a.i < b.i) ? 1 : -1; - a.i = (a.i < 0) ? (cl_int) 0x80000000 - a.i : a.i; - - return a.f; -} - - -long double reference_nextafterl(long double xx, long double yy) -{ - double x = (double) xx; - double y = (double) yy; - - // take care of nans - if( x != x ) - return x; - - if( y != y ) - return y; - - int64d_t a, b; - - a.d = x; - b.d = y; - - int64_t tmp = 0x8000000000000000LL; - - if( a.l & tmp ) - a.l = tmp - a.l; - if(b.l & tmp ) - b.l = tmp - b.l; - - // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y = 0.0f) - // test needs to be done using integer rep because - // subnormals may be flushed to zero on some platforms - if( a.l == b.l ) - return y; - - a.l += (a.l < b.l) ? 1 : -1; - a.l = (a.l < 0) ? tmp - a.l : a.l; - - return a.d; -} - -double reference_fdim(double xx, double yy) -{ - float x = (float) xx; - float y = (float) yy; - - if( x != x ) - return x; - - if( y != y ) - return y; - - float r = ( x > y ) ? (float) reference_subtract( x, y) : 0.0f; - return r; +double reference_nextafter(double xx, double yy) { + float x = (float)xx; + float y = (float)yy; + + // take care of nans + if (x != x) return x; + + if (y != y) return y; + if (x == y) return y; + + int32f_t a, b; + + a.f = x; + b.f = y; + + if (a.i & 0x80000000) a.i = 0x80000000 - a.i; + if (b.i & 0x80000000) b.i = 0x80000000 - b.i; + + a.i += (a.i < b.i) ? 1 : -1; + a.i = (a.i < 0) ? (cl_int)0x80000000 - a.i : a.i; + + return a.f; } +long double reference_nextafterl(long double xx, long double yy) { + double x = (double)xx; + double y = (double)yy; -long double reference_fdiml(long double xx, long double yy) -{ - double x = (double) xx; - double y = (double) yy; - - if( x != x ) - return x; - - if( y != y ) - return y; - - double r = ( x > y ) ? (double) reference_subtractl(x, y) : 0.0; - return r; -} - -double reference_remquo(double xd, double yd, int *n) -{ - float xx = (float) xd; - float yy = (float) yd; - - if( isnan(xx) || isnan(yy) || - fabsf(xx) == INFINITY || - yy == 0.0 ) - { - *n = 0; - return cl_make_nan(); - } + // take care of nans + if (x != x) return x; - if( fabsf(yy) == INFINITY || xx == 0.0f ) { - *n = 0; - return xd; - } - - if( fabsf(xx) == fabsf(yy) ) { - *n = (xx == yy) ? 1 : -1; - return reference_signbit( xx ) ? -0.0 : 0.0; - } - - int signx = reference_signbit( xx ) ? -1 : 1; - int signy = reference_signbit( yy ) ? -1 : 1; - int signn = (signx == signy) ? 1 : -1; - float x = fabsf(xx); - float y = fabsf(yy); - - int ex, ey; - ex = reference_ilogb( x ); - ey = reference_ilogb( y ); - float xr = x; - float yr = y; - uint32_t q = 0; - - if(ex-ey >= -1) { - - yr = (float) reference_ldexp( y, -ey ); - xr = (float) reference_ldexp( x, -ex ); - - if(ex-ey >= 0) { - - - int i; - for(i = ex-ey; i > 0; i--) { - q <<= 1; - if(xr >= yr) { - xr -= yr; - q += 1; - } - xr += xr; - } - q <<= 1; - if( xr > yr ) { - xr -= yr; - q += 1; - } + if (y != y) return y; + + int64d_t a, b; + + a.d = x; + b.d = y; + + int64_t tmp = 0x8000000000000000LL; + + if (a.l & tmp) a.l = tmp - a.l; + if (b.l & tmp) b.l = tmp - b.l; + + // edge case. if (x == y) or (x = 0.0f and y = -0.0f) or (x = -0.0f and y = + // 0.0f) test needs to be done using integer rep because subnormals may be + // flushed to zero on some platforms + if (a.l == b.l) return y; + + a.l += (a.l < b.l) ? 1 : -1; + a.l = (a.l < 0) ? tmp - a.l : a.l; + + return a.d; +} + +double reference_fdim(double xx, double yy) { + float x = (float)xx; + float y = (float)yy; + + if (x != x) return x; + + if (y != y) return y; + + float r = (x > y) ? (float)reference_subtract(x, y) : 0.0f; + return r; +} + +long double reference_fdiml(long double xx, long double yy) { + double x = (double)xx; + double y = (double)yy; + + if (x != x) return x; + + if (y != y) return y; + + double r = (x > y) ? (double)reference_subtractl(x, y) : 0.0; + return r; +} + +double reference_remquo(double xd, double yd, int* n) { + float xx = (float)xd; + float yy = (float)yd; + + if (isnan(xx) || isnan(yy) || fabsf(xx) == INFINITY || yy == 0.0) { + *n = 0; + return cl_make_nan(); + } + + if (fabsf(yy) == INFINITY || xx == 0.0f) { + *n = 0; + return xd; + } + + if (fabsf(xx) == fabsf(yy)) { + *n = (xx == yy) ? 1 : -1; + return reference_signbit(xx) ? -0.0 : 0.0; + } + + int signx = reference_signbit(xx) ? -1 : 1; + int signy = reference_signbit(yy) ? -1 : 1; + int signn = (signx == signy) ? 1 : -1; + float x = fabsf(xx); + float y = fabsf(yy); + + int ex, ey; + ex = reference_ilogb(x); + ey = reference_ilogb(y); + float xr = x; + float yr = y; + uint32_t q = 0; + + if (ex - ey >= -1) { + yr = (float)reference_ldexp(y, -ey); + xr = (float)reference_ldexp(x, -ex); + + if (ex - ey >= 0) { + int i; + for (i = ex - ey; i > 0; i--) { + q <<= 1; + if (xr >= yr) { + xr -= yr; + q += 1; } - else //ex-ey = -1 - xr = reference_ldexp(xr, ex-ey); - } - - if( (yr < 2.0f*xr) || ( (yr == 2.0f*xr) && (q & 0x00000001) ) ) { - xr -= yr; - q += 1; - } - - if(ex-ey >= -1) - xr = reference_ldexp(xr, ey); - - int qout = q & 0x0000007f; - if( signn < 0) - qout = -qout; - if( xx < 0.0 ) - xr = -xr; - - *n = qout; - - return xr; -} - -long double reference_remquol(long double xd, long double yd, int *n) -{ + xr += xr; + } + q <<= 1; + if (xr > yr) { + xr -= yr; + q += 1; + } + } else // ex-ey = -1 + xr = reference_ldexp(xr, ex - ey); + } - double xx = (double) xd; - double yy = (double) yd; - - if( isnan(xx) || isnan(yy) || - fabs(xx) == INFINITY || - yy == 0.0 ) - { - *n = 0; - return cl_make_nan(); - } + if ((yr < 2.0f * xr) || ((yr == 2.0f * xr) && (q & 0x00000001))) { + xr -= yr; + q += 1; + } - if( reference_fabs(yy) == INFINITY || xx == 0.0 ) { - *n = 0; - return xd; - } - - if( reference_fabs(xx) == reference_fabs(yy) ) { - *n = (xx == yy) ? 1 : -1; - return reference_signbit( xx ) ? -0.0 : 0.0; - } - - int signx = reference_signbit( xx ) ? -1 : 1; - int signy = reference_signbit( yy ) ? -1 : 1; - int signn = (signx == signy) ? 1 : -1; - double x = reference_fabs(xx); - double y = reference_fabs(yy); - - int ex, ey; - ex = reference_ilogbl( x ); - ey = reference_ilogbl( y ); - double xr = x; - double yr = y; - uint32_t q = 0; - - if(ex-ey >= -1) { - - yr = reference_ldexp( y, -ey ); - xr = reference_ldexp( x, -ex ); - int i; - - if(ex-ey >= 0) { - - for(i = ex-ey; i > 0; i--) { - q <<= 1; - if(xr >= yr) { - xr -= yr; - q += 1; - } - xr += xr; - } - q <<= 1; - if( xr > yr ) { - xr -= yr; - q += 1; - } + if (ex - ey >= -1) xr = reference_ldexp(xr, ey); + + int qout = q & 0x0000007f; + if (signn < 0) qout = -qout; + if (xx < 0.0) xr = -xr; + + *n = qout; + + return xr; +} + +long double reference_remquol(long double xd, long double yd, int* n) { + double xx = (double)xd; + double yy = (double)yd; + + if (isnan(xx) || isnan(yy) || fabs(xx) == INFINITY || yy == 0.0) { + *n = 0; + return cl_make_nan(); + } + + if (reference_fabs(yy) == INFINITY || xx == 0.0) { + *n = 0; + return xd; + } + + if (reference_fabs(xx) == reference_fabs(yy)) { + *n = (xx == yy) ? 1 : -1; + return reference_signbit(xx) ? -0.0 : 0.0; + } + + int signx = reference_signbit(xx) ? -1 : 1; + int signy = reference_signbit(yy) ? -1 : 1; + int signn = (signx == signy) ? 1 : -1; + double x = reference_fabs(xx); + double y = reference_fabs(yy); + + int ex, ey; + ex = reference_ilogbl(x); + ey = reference_ilogbl(y); + double xr = x; + double yr = y; + uint32_t q = 0; + + if (ex - ey >= -1) { + yr = reference_ldexp(y, -ey); + xr = reference_ldexp(x, -ex); + int i; + + if (ex - ey >= 0) { + for (i = ex - ey; i > 0; i--) { + q <<= 1; + if (xr >= yr) { + xr -= yr; + q += 1; } - else - xr = reference_ldexp(xr, ex-ey); - } - - if( (yr < 2.0*xr) || ( (yr == 2.0*xr) && (q & 0x00000001) ) ) { - xr -= yr; - q += 1; - } - - if(ex-ey >= -1) - xr = reference_ldexp(xr, ey); - - int qout = q & 0x0000007f; - if( signn < 0) - qout = -qout; - if( xx < 0.0 ) - xr = -xr; - - *n = qout; - return xr; -} - -static double reference_scalbn(double x, int n) -{ - if(reference_isinf(x) || reference_isnan(x) || x == 0.0) - return x; - - int bias = 1023; - union { double d; cl_long l; } u; - u.d = (double) x; - int e = (int)((u.l & 0x7ff0000000000000LL) >> 52); - if(e == 0) - { - u.l |= ((cl_long)1023 << 52); - u.d -= 1.0; - e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022; - } - e += n; - if(e >= 2047 || n >= 2098 ) - return reference_copysign(INFINITY, x); - if(e < -51 || n <-2097 ) - return reference_copysign(0.0, x); - if(e <= 0) - { - bias += (e-1); - e = 1; - } - u.l &= 0x800fffffffffffffLL; - u.l |= ((cl_long)e << 52); - x = u.d; - u.l = ((cl_long)bias << 52); - return x * u.d; + xr += xr; + } + q <<= 1; + if (xr > yr) { + xr -= yr; + q += 1; + } + } else + xr = reference_ldexp(xr, ex - ey); + } + + if ((yr < 2.0 * xr) || ((yr == 2.0 * xr) && (q & 0x00000001))) { + xr -= yr; + q += 1; + } + + if (ex - ey >= -1) xr = reference_ldexp(xr, ey); + + int qout = q & 0x0000007f; + if (signn < 0) qout = -qout; + if (xx < 0.0) xr = -xr; + + *n = qout; + return xr; +} + +static double reference_scalbn(double x, int n) { + if (reference_isinf(x) || reference_isnan(x) || x == 0.0) return x; + + int bias = 1023; + union { + double d; + cl_long l; + } u; + u.d = (double)x; + int e = (int)((u.l & 0x7ff0000000000000LL) >> 52); + if (e == 0) { + u.l |= ((cl_long)1023 << 52); + u.d -= 1.0; + e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022; + } + e += n; + if (e >= 2047 || n >= 2098) return reference_copysign(INFINITY, x); + if (e < -51 || n < -2097) return reference_copysign(0.0, x); + if (e <= 0) { + bias += (e - 1); + e = 1; + } + u.l &= 0x800fffffffffffffLL; + u.l |= ((cl_long)e << 52); + x = u.d; + u.l = ((cl_long)bias << 52); + return x * u.d; } static long double reference_scalblnl(long double x, long n) @@ -4812,9 +4913,9 @@ static long double reference_scalblnl(long double x, long n) int e = (int)((u.l & 0x7ff0000000000000LL) >> 52); if(e == 0) { - u.l |= ((cl_long)1023 << 52); - u.d -= 1.0; - e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022; + u.l |= ((cl_long)1023 << 52); + u.d -= 1.0; + e = (int)((u.l & 0x7ff0000000000000LL) >> 52) - 1022; } e += n; if(e >= 2047) @@ -4834,7 +4935,7 @@ static long double reference_scalblnl(long double x, long n) #endif #else // PPC - return scalblnl(x, n); + return scalblnl(x, n); #endif } @@ -4871,297 +4972,233 @@ long double reference_expl(long double x) } return expl(x + bias) * scale; #else - return expl( x ); + return expl(x); #endif } -double reference_sinh(double x) -{ - return sinh(x); -} +double reference_sinh(double x) { return sinh(x); } -long double reference_sinhl(long double x) -{ - return sinhl(x); -} +long double reference_sinhl(long double x) { return sinhl(x); } -double reference_fmod(double x, double y) -{ - if( x == 0.0 && fabs(y) > 0.0 ) - return x; +double reference_fmod(double x, double y) { + if (x == 0.0 && fabs(y) > 0.0) return x; - if( fabs(x) == INFINITY || y == 0 ) - return cl_make_nan(); + if (fabs(x) == INFINITY || y == 0) return cl_make_nan(); - if( fabs(y) == INFINITY ) // we know x is finite from above - return x; + if (fabs(y) == INFINITY) // we know x is finite from above + return x; #if defined(_MSC_VER) && defined(_M_X64) - return fmod( x, y ); + return fmod(x, y); #else - return fmodf( (float) x, (float) y ); + return fmodf((float)x, (float)y); #endif } -long double reference_fmodl(long double x, long double y) -{ - if( x == 0.0L && fabsl(y) > 0.0L ) - return x; +long double reference_fmodl(long double x, long double y) { + if (x == 0.0L && fabsl(y) > 0.0L) return x; - if( fabsl(x) == INFINITY || y == 0.0L ) - return cl_make_nan(); + if (fabsl(x) == INFINITY || y == 0.0L) return cl_make_nan(); - if( fabsl(y) == INFINITY ) // we know x is finite from above - return x; + if (fabsl(y) == INFINITY) // we know x is finite from above + return x; - return fmod( (double) x, (double) y ); + return fmod((double)x, (double)y); } -double reference_modf(double x, double *n) -{ - float nr; +double reference_modf(double x, double* n) { + float nr; #if defined(__ANDROID__) - float yr = modff_android((float) x, &nr ); + float yr = modff_android((float)x, &nr); #else - float yr = modff((float) x, &nr); + float yr = modff((float)x, &nr); #endif - *n = nr; - return yr; + *n = nr; + return yr; } -long double reference_modfl(long double x, long double *n) -{ - double nr; - double yr = modf((double) x, &nr); - *n = nr; - return yr; +long double reference_modfl(long double x, long double* n) { + double nr; + double yr = modf((double)x, &nr); + *n = nr; + return yr; } -long double reference_fractl(long double x, long double *ip ) -{ - if(isnan(x)) { - *ip = cl_make_nan(); - return cl_make_nan(); - } - - double i; - double f = modf((double) x, &i ); - if( f < 0.0 ) - { - f = 1.0 + f; - i -= 1.0; - if( f == 1.0 ) - f = HEX_DBL( +, 1, fffffffffffff, -, 1 ); - } - *ip = i; - return f; -} +long double reference_fractl(long double x, long double* ip) { + if (isnan(x)) { + *ip = cl_make_nan(); + return cl_make_nan(); + } -long double reference_fabsl(long double x) -{ - return fabsl( x ); + double i; + double f = modf((double)x, &i); + if (f < 0.0) { + f = 1.0 + f; + i -= 1.0; + if (f == 1.0) f = HEX_DBL(+, 1, fffffffffffff, -, 1); + } + *ip = i; + return f; } +long double reference_fabsl(long double x) { return fabsl(x); } + double reference_relaxed_log( double x ) { return (float)reference_log((float)x); } -double reference_log(double x) -{ - if( x == 0.0 ) - return -INFINITY; - - if( x < 0.0 ) - return cl_make_nan(); - - if( isinf(x) ) - return INFINITY; - - double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ); - double logxHi, logxLo; - __log2_ep(&logxHi, &logxLo, x); - return logxHi*log2Hi; -} - -long double reference_logl(long double x) -{ - if( x == 0.0 ) - return -INFINITY; - - if( x < 0.0 ) - return cl_make_nan(); - - if( isinf(x) ) - return INFINITY; - - double log2Hi = HEX_DBL( +, 1, 62e42fefa39ef, -, 1 ); - double log2Lo = HEX_DBL( +, 1, abc9e3b39803f, -, 56 ); - double logxHi, logxLo; - __log2_ep(&logxHi, &logxLo, x); - - //double rhi, rlo; - //MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo); - //return (long double) rhi + (long double) rlo; - - long double lg2 = (long double) log2Hi + (long double) log2Lo; - long double logx = (long double) logxHi + (long double) logxLo; - return logx*lg2; +double reference_log(double x) { + if (x == 0.0) return -INFINITY; + + if (x < 0.0) return cl_make_nan(); + + if (isinf(x)) return INFINITY; + + double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1); + double logxHi, logxLo; + __log2_ep(&logxHi, &logxLo, x); + return logxHi * log2Hi; +} + +long double reference_logl(long double x) { + if (x == 0.0) return -INFINITY; + + if (x < 0.0) return cl_make_nan(); + + if (isinf(x)) return INFINITY; + + double log2Hi = HEX_DBL(+, 1, 62e42fefa39ef, -, 1); + double log2Lo = HEX_DBL(+, 1, abc9e3b39803f, -, 56); + double logxHi, logxLo; + __log2_ep(&logxHi, &logxLo, x); + + // double rhi, rlo; + // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo); + // return (long double) rhi + (long double) rlo; + + long double lg2 = (long double)log2Hi + (long double)log2Lo; + long double logx = (long double)logxHi + (long double)logxLo; + return logx * lg2; } double reference_relaxed_pow( double x, double y) { return (float)reference_exp2( ((float)y) * (float)reference_log2((float)x)); } -double reference_pow( double x, double y ) -{ - static const double neg_epsilon = HEX_DBL( +, 1, 0, +, 53 ); - - //if x = 1, return x for any y, even NaN - if( x == 1.0 ) - return x; - - //if y == 0, return 1 for any x, even NaN - if( y == 0.0 ) - return 1.0; - - //get NaNs out of the way - if( x != x || y != y ) - return x + y; - - //do the work required to sort out edge cases - double fabsy = reference_fabs( y ); - double fabsx = reference_fabs( x ); - double iy = reference_rint( fabsy ); //we do round to nearest here so that |fy| <= 0.5 - if( iy > fabsy )//convert nearbyint to floor - iy -= 1.0; - int isOddInt = 0; - if( fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon ) - isOddInt = (int) (iy - 2.0 * rint( 0.5 * iy )); //might be 0, -1, or 1 - - ///test a few more edge cases - //deal with x == 0 cases - if( x == 0.0 ) - { - if( ! isOddInt ) - x = 0.0; - - if( y < 0 ) - x = 1.0/ x; - - return x; - } - - //x == +-Inf cases - if( isinf(fabsx) ) - { - if( x < 0 ) - { - if( isOddInt ) - { - if( y < 0 ) - return -0.0; - else - return -INFINITY; - } - else - { - if( y < 0 ) - return 0.0; - else - return INFINITY; - } - } - - if( y < 0 ) - return 0; - return INFINITY; - } - - //y = +-inf cases - if( isinf(fabsy) ) - { - if( x == -1 ) - return 1; - - if( y < 0 ) - { - if( fabsx < 1 ) - return INFINITY; - return 0; - } - if( fabsx < 1 ) - return 0; - return INFINITY; - } - - // x < 0 and y non integer case - if( x < 0 && iy != fabsy ) - { - //return nan; - return cl_make_nan(); - } - - //speedy resolution of sqrt and reciprocal sqrt - if( fabsy == 0.5 ) - { - long double xl = reference_sqrt( x ); - if( y < 0 ) - xl = 1.0/ xl; - return xl; - } - - double hi, lo; - __log2_ep(&hi, &lo, fabsx); - double prod = y * hi; - double result = reference_exp2(prod); - return isOddInt ? reference_copysignd(result, x) : result; -} - -double reference_sqrt(double x) -{ - return sqrt(x); -} +double reference_pow(double x, double y) { + static const double neg_epsilon = HEX_DBL(+, 1, 0, +, 53); -double reference_floor(double x) -{ - return floorf((float) x); + // if x = 1, return x for any y, even NaN + if (x == 1.0) return x; + + // if y == 0, return 1 for any x, even NaN + if (y == 0.0) return 1.0; + + // get NaNs out of the way + if (x != x || y != y) return x + y; + + // do the work required to sort out edge cases + double fabsy = reference_fabs(y); + double fabsx = reference_fabs(x); + double iy = + reference_rint(fabsy); // we do round to nearest here so that |fy| <= 0.5 + if (iy > fabsy) // convert nearbyint to floor + iy -= 1.0; + int isOddInt = 0; + if (fabsy == iy && !reference_isinf(fabsy) && iy < neg_epsilon) + isOddInt = (int)(iy - 2.0 * rint(0.5 * iy)); // might be 0, -1, or 1 + + /// test a few more edge cases + // deal with x == 0 cases + if (x == 0.0) { + if (!isOddInt) x = 0.0; + + if (y < 0) x = 1.0 / x; + + return x; + } + + // x == +-Inf cases + if (isinf(fabsx)) { + if (x < 0) { + if (isOddInt) { + if (y < 0) + return -0.0; + else + return -INFINITY; + } else { + if (y < 0) + return 0.0; + else + return INFINITY; + } + } + + if (y < 0) return 0; + return INFINITY; + } + + // y = +-inf cases + if (isinf(fabsy)) { + if (x == -1) return 1; + + if (y < 0) { + if (fabsx < 1) return INFINITY; + return 0; + } + if (fabsx < 1) return 0; + return INFINITY; + } + + // x < 0 and y non integer case + if (x < 0 && iy != fabsy) { + // return nan; + return cl_make_nan(); + } + + // speedy resolution of sqrt and reciprocal sqrt + if (fabsy == 0.5) { + long double xl = reference_sqrt(x); + if (y < 0) xl = 1.0 / xl; + return xl; + } + + double hi, lo; + __log2_ep(&hi, &lo, fabsx); + double prod = y * hi; + double result = reference_exp2(prod); + return isOddInt ? reference_copysignd(result, x) : result; } -double reference_ldexp(double value, int exponent) -{ +double reference_sqrt(double x) { return sqrt(x); } + +double reference_floor(double x) { return floorf((float)x); } + +double reference_ldexp(double value, int exponent) { #ifdef __MINGW32__ -/* - * ==================================================== - * This function is from fdlibm: http://www.netlib.org - * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. - * - * Developed at SunSoft, a Sun Microsystems, Inc. business. - * Permission to use, copy, modify, and distribute this - * software is freely granted, provided that this notice - * is preserved. - * ==================================================== - */ - if(!finite(value)||value==0.0) return value; - return scalbn(value,exponent); + /* + * ==================================================== + * This function is from fdlibm: http://www.netlib.org + * It is Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved. + * + * Developed at SunSoft, a Sun Microsystems, Inc. business. + * Permission to use, copy, modify, and distribute this + * software is freely granted, provided that this notice + * is preserved. + * ==================================================== + */ + if (!finite(value) || value == 0.0) return value; + return scalbn(value, exponent); #else - return reference_scalbn(value, exponent); + return reference_scalbn(value, exponent); #endif } -long double reference_ldexpl(long double x, int n) -{ - return ldexpl( x, n); -} +long double reference_ldexpl(long double x, int n) { return ldexpl(x, n); } -long double reference_coshl(long double x) -{ - return coshl(x); -} +long double reference_coshl(long double x) { return coshl(x); } -double reference_ceil(double x) -{ - return ceilf((float) x); -} +double reference_ceil(double x) { return ceilf((float)x); } long double reference_ceill(long double x) { @@ -5328,172 +5365,132 @@ long double reference_acosl(long double x) double reference_log10(double x) { - if( x == 0.0 ) - return -INFINITY; - - if( x < 0.0 ) - return cl_make_nan(); - - if( isinf(x) ) - return INFINITY; - - double log2Hi = HEX_DBL( +, 1, 34413509f79fe, -, 2 ); - double logxHi, logxLo; - __log2_ep(&logxHi, &logxLo, x); - return logxHi*log2Hi; + if (x == 0.0) return -INFINITY; + + if (x < 0.0) return cl_make_nan(); + + if (isinf(x)) return INFINITY; + + double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2); + double logxHi, logxLo; + __log2_ep(&logxHi, &logxLo, x); + return logxHi * log2Hi; } long double reference_log10l(long double x) { - if( x == 0.0 ) - return -INFINITY; - - if( x < 0.0 ) - return cl_make_nan(); - - if( isinf(x) ) - return INFINITY; - - double log2Hi = HEX_DBL( +, 1, 34413509f79fe, -, 2 ); - double log2Lo = HEX_DBL( +, 1, e623e2566b02d, -, 55 ); - double logxHi, logxLo; - __log2_ep(&logxHi, &logxLo, x); - - //double rhi, rlo; - //MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo); - //return (long double) rhi + (long double) rlo; - - long double lg2 = (long double) log2Hi + (long double) log2Lo; - long double logx = (long double) logxHi + (long double) logxLo; - return logx*lg2; -} - -double reference_acos(double x) -{ - return acos( x ); + if (x == 0.0) return -INFINITY; + + if (x < 0.0) return cl_make_nan(); + + if (isinf(x)) return INFINITY; + + double log2Hi = HEX_DBL(+, 1, 34413509f79fe, -, 2); + double log2Lo = HEX_DBL(+, 1, e623e2566b02d, -, 55); + double logxHi, logxLo; + __log2_ep(&logxHi, &logxLo, x); + + // double rhi, rlo; + // MulDD(&rhi, &rlo, logxHi, logxLo, log2Hi, log2Lo); + // return (long double) rhi + (long double) rlo; + + long double lg2 = (long double)log2Hi + (long double)log2Lo; + long double logx = (long double)logxHi + (long double)logxLo; + return logx * lg2; } +double reference_acos(double x) { return acos(x); } + double reference_atan2(double x, double y) { #if defined(_WIN32) - // fix edge cases for Windows - if (isinf(x) && isinf(y)) { - double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4; - return (x > 0) ? retval : -retval; - } + // fix edge cases for Windows + if (isinf(x) && isinf(y)) { + double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4; + return (x > 0) ? retval : -retval; + } #endif // _WIN32 - return atan2(x, y); + return atan2(x, y); } long double reference_atan2l(long double x, long double y) { #if defined(_WIN32) - // fix edge cases for Windows - if (isinf(x) && isinf(y)) { - long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4; - return (x > 0) ? retval : -retval; - } + // fix edge cases for Windows + if (isinf(x) && isinf(y)) { + long double retval = (y > 0) ? M_PI_4 : 3.f * M_PI_4; + return (x > 0) ? retval : -retval; + } #endif // _WIN32 - return atan2l(x, y); + return atan2l(x, y); } -double reference_frexp(double a, int *exp) -{ - if(isnan(a) || isinf(a) || a == 0.0) - { - *exp = 0; - return a; - } - - union { - cl_double d; - cl_ulong l; - } u; - - u.d = a; - - // separate out sign - cl_ulong s = u.l & 0x8000000000000000ULL; - u.l &= 0x7fffffffffffffffULL; - int bias = -1022; - - if((u.l & 0x7ff0000000000000ULL) == 0) - { - double d = u.l; - u.d = d; - bias -= 1074; - } - - int e = (int)((u.l & 0x7ff0000000000000ULL) >> 52); - u.l &= 0x000fffffffffffffULL; - e += bias; - u.l |= ((cl_ulong)1022 << 52); - u.l |= s; - - *exp = e; - return u.d; -} - -long double reference_frexpl(long double a, int *exp) -{ - if(isnan(a) || isinf(a) || a == 0.0) - { - *exp = 0; - return a; - } - - if(sizeof(long double) == sizeof(double)) - { - return reference_frexp(a, exp); - } - else - { - return frexpl(a, exp); - } -} - - -double reference_atan(double x) -{ - return atan( x ); -} +double reference_frexp(double a, int* exp) { + if (isnan(a) || isinf(a) || a == 0.0) { + *exp = 0; + return a; + } -long double reference_atanl(long double x) -{ - return atanl( x ); -} + union { + cl_double d; + cl_ulong l; + } u; -long double reference_asinl(long double x) -{ - return asinl( x ); -} + u.d = a; -double reference_asin(double x) -{ - return asin( x ); -} + // separate out sign + cl_ulong s = u.l & 0x8000000000000000ULL; + u.l &= 0x7fffffffffffffffULL; + int bias = -1022; -double reference_fabs(double x) -{ - return fabs( x); -} + if ((u.l & 0x7ff0000000000000ULL) == 0) { + double d = u.l; + u.d = d; + bias -= 1074; + } -double reference_cosh(double x) -{ - return cosh( x ); + int e = (int)((u.l & 0x7ff0000000000000ULL) >> 52); + u.l &= 0x000fffffffffffffULL; + e += bias; + u.l |= ((cl_ulong)1022 << 52); + u.l |= s; + + *exp = e; + return u.d; } -long double reference_sqrtl(long double x) -{ - volatile double dx = x; - return sqrt( dx ); +long double reference_frexpl(long double a, int* exp) { + if (isnan(a) || isinf(a) || a == 0.0) { + *exp = 0; + return a; + } + + if (sizeof(long double) == sizeof(double)) { + return reference_frexp(a, exp); + } else { + return frexpl(a, exp); + } } -long double reference_tanhl(long double x) -{ - return tanhl( x ); +double reference_atan(double x) { return atan(x); } + +long double reference_atanl(long double x) { return atanl(x); } + +long double reference_asinl(long double x) { return asinl(x); } + +double reference_asin(double x) { return asin(x); } + +double reference_fabs(double x) { return fabs(x); } + +double reference_cosh(double x) { return cosh(x); } + +long double reference_sqrtl(long double x) { + volatile double dx = x; + return sqrt(dx); } +long double reference_tanhl(long double x) { return tanhl(x); } + long double reference_floorl(long double x) { if( x == 0.0 || reference_isinfl(x) || reference_isnanl(x) ) @@ -5519,11 +5516,7 @@ long double reference_floorl(long double x) return r; } - -double reference_tanh(double x) -{ - return tanh( x ); -} +double reference_tanh(double x) { return tanh(x); } long double reference_assignmentl( long double x ){ return x; } @@ -5534,5 +5527,5 @@ int reference_notl( long double x ) } #if defined( _MSC_VER ) -# pragma warning( pop ) +#pragma warning(pop) #endif