6ace393821
Similar to various other bugs in this area, pow functions can fail to raise the underflow exception when the result is tiny and inexact but one or more low bits of the intermediate result that is scaled down (or, in the i386 case, converted from a wider evaluation format) are zero. This patch forces the exception in a similar way to previous fixes, thereby concluding the fixes for known bugs with missing underflow exceptions currently filed in Bugzilla. Tested for x86_64, x86, mips64 and powerpc. [BZ #18825] * sysdeps/i386/fpu/i386-math-asm.h (FLT_NARROW_EVAL_UFLOW_NONNAN): New macro. (DBL_NARROW_EVAL_UFLOW_NONNAN): Likewise. (LDBL_CHECK_FORCE_UFLOW_NONNAN): Likewise. * sysdeps/i386/fpu/e_pow.S: Use DEFINE_DBL_MIN. (__ieee754_pow): Use DBL_NARROW_EVAL_UFLOW_NONNAN instead of DBL_NARROW_EVAL, reloading the PIC register as needed. * sysdeps/i386/fpu/e_powf.S: Use DEFINE_FLT_MIN. (__ieee754_powf): Use FLT_NARROW_EVAL_UFLOW_NONNAN instead of FLT_NARROW_EVAL. Use separate return path for case when first argument is NaN. * sysdeps/i386/fpu/e_powl.S: Include <i386-math-asm.h>. Use DEFINE_LDBL_MIN. (__ieee754_powl): Use LDBL_CHECK_FORCE_UFLOW_NONNAN, reloading the PIC register. * sysdeps/ieee754/dbl-64/e_pow.c (__ieee754_pow): Use math_check_force_underflow_nonneg. * sysdeps/ieee754/flt-32/e_powf.c (__ieee754_powf): Force underflow for subnormal result. * sysdeps/ieee754/ldbl-128/e_powl.c (__ieee754_powl): Likewise. * sysdeps/ieee754/ldbl-128ibm/e_powl.c (__ieee754_powl): Use math_check_force_underflow_nonneg. * sysdeps/x86/fpu/powl_helper.c (__powl_helper): Use math_check_force_underflow. * sysdeps/x86_64/fpu/x86_64-math-asm.h (LDBL_CHECK_FORCE_UFLOW_NONNAN): New macro. * sysdeps/x86_64/fpu/e_powl.S: Include <x86_64-math-asm.h>. Use DEFINE_LDBL_MIN. (__ieee754_powl): Use LDBL_CHECK_FORCE_UFLOW_NONNAN. * math/auto-libm-test-in: Add more tests of pow. * math/auto-libm-test-out: Regenerated.
237 lines
8.3 KiB
C
237 lines
8.3 KiB
C
/* Implement powl for x86 using extra-precision log.
|
|
Copyright (C) 2012-2015 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include <math.h>
|
|
#include <math_private.h>
|
|
#include <stdbool.h>
|
|
|
|
/* High parts and low parts of -log (k/16), for integer k from 12 to
|
|
24. */
|
|
|
|
static const long double powl_log_table[] =
|
|
{
|
|
0x4.9a58844d36e49e1p-4L, -0x1.0522624fd558f574p-68L,
|
|
0x3.527da7915b3c6de4p-4L, 0x1.7d4ef4b901b99b9ep-68L,
|
|
0x2.22f1d044fc8f7bc8p-4L, -0x1.8e97c071a42fc388p-68L,
|
|
0x1.08598b59e3a0688ap-4L, 0x3.fd9bf503372c12fcp-72L,
|
|
-0x0p+0L, 0x0p+0L,
|
|
-0xf.85186008b15330cp-8L, 0x1.9b47488a6687672cp-72L,
|
|
-0x1.e27076e2af2e5e9ep-4L, -0xa.87ffe1fe9e155dcp-72L,
|
|
-0x2.bfe60e14f27a791p-4L, 0x1.83bebf1bdb88a032p-68L,
|
|
-0x3.91fef8f353443584p-4L, -0xb.b03de5ff734495cp-72L,
|
|
-0x4.59d72aeae98380e8p-4L, 0xc.e0aa3be4747dc1p-72L,
|
|
-0x5.1862f08717b09f4p-4L, -0x2.decdeccf1cd10578p-68L,
|
|
-0x5.ce75fdaef401a738p-4L, -0x9.314feb4fbde5aaep-72L,
|
|
-0x6.7cc8fb2fe612fcbp-4L, 0x2.5ca2642feb779f98p-68L,
|
|
};
|
|
|
|
/* High 32 bits of log2 (e), and remainder rounded to 64 bits. */
|
|
static const long double log2e_hi = 0x1.71547652p+0L;
|
|
static const long double log2e_lo = 0xb.82fe1777d0ffda1p-36L;
|
|
|
|
/* Given a number with high part HI and low part LO, add the number X
|
|
to it and store the result in *RHI and *RLO. It is given that
|
|
either |X| < |0.7 * HI|, or HI == LO == 0, and that the values are
|
|
small enough that no overflow occurs. The result does not need to
|
|
be exact to 128 bits; 78-bit accuracy of the final accumulated
|
|
result suffices. */
|
|
|
|
static inline void
|
|
acc_split (long double *rhi, long double *rlo, long double hi, long double lo,
|
|
long double x)
|
|
{
|
|
long double thi = hi + x;
|
|
long double tlo = (hi - thi) + x + lo;
|
|
*rhi = thi + tlo;
|
|
*rlo = (thi - *rhi) + tlo;
|
|
}
|
|
|
|
extern long double __powl_helper (long double x, long double y);
|
|
libm_hidden_proto (__powl_helper)
|
|
|
|
/* Given X a value that is finite and nonzero, or a NaN, and Y a
|
|
finite nonzero value with 0x1p-79 <= |Y| <= 0x1p78, compute X to
|
|
the power Y. */
|
|
|
|
long double
|
|
__powl_helper (long double x, long double y)
|
|
{
|
|
if (isnan (x))
|
|
return __ieee754_expl (y * __ieee754_logl (x));
|
|
bool negate;
|
|
if (x < 0)
|
|
{
|
|
long double absy = fabsl (y);
|
|
if (absy >= 0x1p64L)
|
|
negate = false;
|
|
else
|
|
{
|
|
unsigned long long yll = absy;
|
|
if (yll != absy)
|
|
return __ieee754_expl (y * __ieee754_logl (x));
|
|
negate = (yll & 1) != 0;
|
|
}
|
|
x = fabsl (x);
|
|
}
|
|
else
|
|
negate = false;
|
|
|
|
/* We need to compute Y * log2 (X) to at least 64 bits after the
|
|
point for normal results (that is, to at least 78 bits
|
|
precision). */
|
|
int x_int_exponent;
|
|
long double x_frac;
|
|
x_frac = __frexpl (x, &x_int_exponent);
|
|
if (x_frac <= 0x0.aaaaaaaaaaaaaaaap0L) /* 2.0L / 3.0L, rounded down */
|
|
{
|
|
x_frac *= 2.0;
|
|
x_int_exponent--;
|
|
}
|
|
|
|
long double log_x_frac_hi, log_x_frac_lo;
|
|
/* Determine an initial approximation to log (X_FRAC) using
|
|
POWL_LOG_TABLE, and multiply by a value K/16 to reduce to an
|
|
interval (24/25, 26/25). */
|
|
int k = (int) ((16.0L / x_frac) + 0.5L);
|
|
log_x_frac_hi = powl_log_table[2 * k - 24];
|
|
log_x_frac_lo = powl_log_table[2 * k - 23];
|
|
long double x_frac_low;
|
|
if (k == 16)
|
|
x_frac_low = 0.0L;
|
|
else
|
|
{
|
|
/* Mask off low 5 bits of X_FRAC so the multiplication by K/16
|
|
is exact. These bits are small enough that they can be
|
|
corrected for by adding log2 (e) * X_FRAC_LOW to the final
|
|
result. */
|
|
int32_t se;
|
|
u_int32_t i0, i1;
|
|
GET_LDOUBLE_WORDS (se, i0, i1, x_frac);
|
|
x_frac_low = x_frac;
|
|
i1 &= 0xffffffe0;
|
|
SET_LDOUBLE_WORDS (x_frac, se, i0, i1);
|
|
x_frac_low -= x_frac;
|
|
x_frac_low /= x_frac;
|
|
x_frac *= k / 16.0L;
|
|
}
|
|
|
|
/* Now compute log (X_FRAC) for X_FRAC in (24/25, 26/25). Separate
|
|
W = X_FRAC - 1 into high 16 bits and remaining bits, so that
|
|
multiplications for low-order power series terms are exact. The
|
|
remaining bits are small enough that adding a 64-bit value of
|
|
log2 (1 + W_LO / (1 + W_HI)) will be a sufficient correction for
|
|
them. */
|
|
long double w = x_frac - 1;
|
|
long double w_hi, w_lo;
|
|
int32_t se;
|
|
u_int32_t i0, i1;
|
|
GET_LDOUBLE_WORDS (se, i0, i1, w);
|
|
i0 &= 0xffff0000;
|
|
i1 = 0;
|
|
SET_LDOUBLE_WORDS (w_hi, se, i0, i1);
|
|
w_lo = w - w_hi;
|
|
long double wp = w_hi;
|
|
acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo, wp);
|
|
wp *= -w_hi;
|
|
acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
|
|
wp / 2.0L);
|
|
wp *= -w_hi;
|
|
acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
|
|
wp * 0x0.5555p0L); /* -W_HI**3 / 3, high part. */
|
|
acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
|
|
wp * 0x0.5555555555555555p-16L); /* -W_HI**3 / 3, low part. */
|
|
wp *= -w_hi;
|
|
acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
|
|
wp / 4.0L);
|
|
/* Subsequent terms are small enough that they only need be computed
|
|
to 64 bits. */
|
|
for (int i = 5; i <= 17; i++)
|
|
{
|
|
wp *= -w_hi;
|
|
acc_split (&log_x_frac_hi, &log_x_frac_lo, log_x_frac_hi, log_x_frac_lo,
|
|
wp / i);
|
|
}
|
|
|
|
/* Convert LOG_X_FRAC_HI + LOG_X_FRAC_LO to a base-2 logarithm. */
|
|
long double log2_x_frac_hi, log2_x_frac_lo;
|
|
long double log_x_frac_hi32, log_x_frac_lo64;
|
|
GET_LDOUBLE_WORDS (se, i0, i1, log_x_frac_hi);
|
|
i1 = 0;
|
|
SET_LDOUBLE_WORDS (log_x_frac_hi32, se, i0, i1);
|
|
log_x_frac_lo64 = (log_x_frac_hi - log_x_frac_hi32) + log_x_frac_lo;
|
|
long double log2_x_frac_hi1 = log_x_frac_hi32 * log2e_hi;
|
|
long double log2_x_frac_lo1
|
|
= log_x_frac_lo64 * log2e_hi + log_x_frac_hi * log2e_lo;
|
|
log2_x_frac_hi = log2_x_frac_hi1 + log2_x_frac_lo1;
|
|
log2_x_frac_lo = (log2_x_frac_hi1 - log2_x_frac_hi) + log2_x_frac_lo1;
|
|
|
|
/* Correct for the masking off of W_LO. */
|
|
long double log2_1p_w_lo;
|
|
asm ("fyl2xp1"
|
|
: "=t" (log2_1p_w_lo)
|
|
: "0" (w_lo / (1.0L + w_hi)), "u" (1.0L)
|
|
: "st(1)");
|
|
acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
|
|
log2_1p_w_lo);
|
|
|
|
/* Correct for the masking off of X_FRAC_LOW. */
|
|
acc_split (&log2_x_frac_hi, &log2_x_frac_lo, log2_x_frac_hi, log2_x_frac_lo,
|
|
x_frac_low * M_LOG2El);
|
|
|
|
/* Add the integer and fractional parts of the base-2 logarithm. */
|
|
long double log2_x_hi, log2_x_lo;
|
|
log2_x_hi = x_int_exponent + log2_x_frac_hi;
|
|
log2_x_lo = ((x_int_exponent - log2_x_hi) + log2_x_frac_hi) + log2_x_frac_lo;
|
|
|
|
/* Compute the base-2 logarithm of the result. */
|
|
long double log2_res_hi, log2_res_lo;
|
|
long double log2_x_hi32, log2_x_lo64;
|
|
GET_LDOUBLE_WORDS (se, i0, i1, log2_x_hi);
|
|
i1 = 0;
|
|
SET_LDOUBLE_WORDS (log2_x_hi32, se, i0, i1);
|
|
log2_x_lo64 = (log2_x_hi - log2_x_hi32) + log2_x_lo;
|
|
long double y_hi32, y_lo32;
|
|
GET_LDOUBLE_WORDS (se, i0, i1, y);
|
|
i1 = 0;
|
|
SET_LDOUBLE_WORDS (y_hi32, se, i0, i1);
|
|
y_lo32 = y - y_hi32;
|
|
log2_res_hi = log2_x_hi32 * y_hi32;
|
|
log2_res_lo = log2_x_hi32 * y_lo32 + log2_x_lo64 * y;
|
|
|
|
/* Split the base-2 logarithm of the result into integer and
|
|
fractional parts. */
|
|
long double log2_res_int = __roundl (log2_res_hi);
|
|
long double log2_res_frac = log2_res_hi - log2_res_int + log2_res_lo;
|
|
/* If the integer part is very large, the computed fractional part
|
|
may be outside the valid range for f2xm1. */
|
|
if (fabsl (log2_res_int) > 16500)
|
|
log2_res_frac = 0;
|
|
|
|
/* Compute the final result. */
|
|
long double res;
|
|
asm ("f2xm1" : "=t" (res) : "0" (log2_res_frac));
|
|
res += 1.0L;
|
|
if (negate)
|
|
res = -res;
|
|
asm ("fscale" : "=t" (res) : "0" (res), "u" (log2_res_int));
|
|
math_check_force_underflow (res);
|
|
return res;
|
|
}
|
|
|
|
libm_hidden_def (__powl_helper)
|