41e8926aa4
This patch improves the performance of some math functions by adding the libc_fexxx variants of the inline functions, which handle both FPU rounding-mode and exception set/restore, and by using them in the libc_fexxx_ctx functions. It is based on the already coded fexxx family of functions for PPC with FPU. Here is a summary of the performance improvements due to this patch (measured on a POWER7 machine): Before: cos(): ITERS:9.5895e+07: TOTAL:5116.03Mcy, MAX:77.6cy, MIN:49.792cy, 18744 calls/Mcy exp(): ITERS:2.827e+07: TOTAL:5187.15Mcy, MAX:494.018cy, MIN:38.422cy, 5450.01 calls/Mcy pow(): ITERS:6.1705e+07: TOTAL:5144.26Mcy, MAX:171.95cy, MIN:29.935cy, 11994.9 calls/Mcy sin(): ITERS:8.6898e+07: TOTAL:5117.06Mcy, MAX:83.841cy, MIN:46.582cy, 16982 calls/Mcy tan(): ITERS:2.9473e+07: TOTAL:5115.39Mcy, MAX:191.017cy, MIN:172.352cy, 5761.63 calls/Mcy After: cos(): ITERS:2.05265e+08: TOTAL:5111.37Mcy, MAX:78.754cy, MIN:24.196cy, 40158.5 calls/Mcy exp(): ITERS:3.341e+07: TOTAL:5170.84Mcy, MAX:476.317cy, MIN:15.574cy, 6461.23 calls/Mcy pow(): ITERS:7.6153e+07: TOTAL:5129.1Mcy, MAX:147.5cy, MIN:30.916cy, 14847.2 calls/Mcy sin(): ITERS:1.58816e+08: TOTAL:5115.11Mcy, MAX:1490.39cy, MIN:22.341cy, 31048.4 calls/Mcy tan(): ITERS:3.4964e+07: TOTAL:5114.18Mcy, MAX:177.422cy, MIN:146.115cy, 6836.68 calls/Mcy
180 lines
3.9 KiB
C
180 lines
3.9 KiB
C
/* Private inline math functions for powerpc.
   Copyright (C) 2006-2013 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <http://www.gnu.org/licenses/>.  */
|
|
|
|
#ifndef _PPC_MATH_PRIVATE_H_
|
|
#define _PPC_MATH_PRIVATE_H_
|
|
|
|
#include <sysdep.h>
|
|
#include <ldsodefs.h>
|
|
#include <dl-procinfo.h>
|
|
#include <fenv_private.h>
|
|
#include_next <math_private.h>
|
|
|
|
# if __WORDSIZE == 64 || defined _ARCH_PWR4
|
|
# define __CPU_HAS_FSQRT 1
|
|
# else
|
|
# define __CPU_HAS_FSQRT ((GLRO(dl_hwcap) & PPC_FEATURE_64) != 0)
|
|
# endif
|
|
|
|
extern double __slow_ieee754_sqrt (double);
|
|
extern __always_inline double
|
|
__ieee754_sqrt (double __x)
|
|
{
|
|
double __z;
|
|
|
|
if (__CPU_HAS_FSQRT)
|
|
{
|
|
/* Volatile is required to prevent the compiler from moving the
|
|
fsqrt instruction above the branch. */
|
|
__asm __volatile ("fsqrt %0,%1" : "=f" (__z) : "f" (__x));
|
|
}
|
|
else
|
|
__z = __slow_ieee754_sqrt(__x);
|
|
|
|
return __z;
|
|
}
|
|
|
|
extern float __slow_ieee754_sqrtf (float);
|
|
extern __always_inline float
|
|
__ieee754_sqrtf (float __x)
|
|
{
|
|
float __z;
|
|
|
|
if (__CPU_HAS_FSQRT)
|
|
{
|
|
/* Volatile is required to prevent the compiler from moving the
|
|
fsqrts instruction above the branch. */
|
|
__asm __volatile ("fsqrts %0,%1" : "=f" (__z) : "f" (__x));
|
|
}
|
|
else
|
|
__z = __slow_ieee754_sqrtf(__x);
|
|
|
|
return __z;
|
|
}
|
|
|
|
#if defined _ARCH_PWR5X
|
|
|
|
# ifndef __round
/* Round X to an integral value with the frin instruction (Floating Round
   to Integer Nearest) and yield the result as a double.  X is expanded
   exactly once.  */
#  define __round(x) \
  ({ \
    double __res; \
    __asm __volatile (" frin %0,%1\n" : "=f" (__res) : "f" (x)); \
    __res; \
  })
# endif
|
|
# ifndef __roundf
/* Round X to an integral value with the frin instruction (Floating Round
   to Integer Nearest), then apply frsp to round the register to single
   precision; the result is a float.  X is expanded exactly once.  */
#  define __roundf(x) \
  ({ \
    float __res; \
    __asm __volatile (" frin %0,%1\n" \
                      " frsp %0,%0\n" \
                      : "=f" (__res) : "f" (x)); \
    __res; \
  })
# endif
|
|
|
|
# ifndef __trunc
/* Round X to an integral value with the friz instruction (Floating Round
   to Integer toward Zero) and yield the result as a double.  X is
   expanded exactly once.  */
#  define __trunc(x) \
  ({ \
    double __res; \
    __asm __volatile (" friz %0,%1\n" : "=f" (__res) : "f" (x)); \
    __res; \
  })
# endif
|
|
# ifndef __truncf
/* Round X to an integral value with the friz instruction (Floating Round
   to Integer toward Zero), then apply frsp to round the register to
   single precision; the result is a float.  X is expanded exactly
   once.  */
#  define __truncf(x) \
  ({ \
    float __res; \
    __asm __volatile (" friz %0,%1\n" \
                      " frsp %0,%0\n" \
                      : "=f" (__res) : "f" (x)); \
    __res; \
  })
# endif
|
|
|
|
# ifndef __ceil
/* Round X to an integral value with the frip instruction (Floating Round
   to Integer Plus, i.e. toward +infinity) and yield the result as a
   double.  X is expanded exactly once.  */
#  define __ceil(x) \
  ({ \
    double __res; \
    __asm __volatile (" frip %0,%1\n" : "=f" (__res) : "f" (x)); \
    __res; \
  })
# endif
|
|
# ifndef __ceilf
/* Round X to an integral value with the frip instruction (Floating Round
   to Integer Plus, i.e. toward +infinity), then apply frsp to round the
   register to single precision; the result is a float.  X is expanded
   exactly once.  */
#  define __ceilf(x) \
  ({ \
    float __res; \
    __asm __volatile (" frip %0,%1\n" \
                      " frsp %0,%0\n" \
                      : "=f" (__res) : "f" (x)); \
    __res; \
  })
# endif
|
|
|
|
# ifndef __floor
/* Round X to an integral value with the frim instruction (Floating Round
   to Integer Minus, i.e. toward -infinity) and yield the result as a
   double.  X is expanded exactly once.  */
#  define __floor(x) \
  ({ \
    double __res; \
    __asm __volatile (" frim %0,%1\n" : "=f" (__res) : "f" (x)); \
    __res; \
  })
# endif
|
|
# ifndef __floorf
/* Round X to an integral value with the frim instruction (Floating Round
   to Integer Minus, i.e. toward -infinity), then apply frsp to round the
   register to single precision; the result is a float.  X is expanded
   exactly once.  */
#  define __floorf(x) \
  ({ \
    float __res; \
    __asm __volatile (" frim %0,%1\n" \
                      " frsp %0,%0\n" \
                      : "=f" (__res) : "f" (x)); \
    __res; \
  })
# endif
|
|
|
|
#endif /* defined _ARCH_PWR5X */
|
|
|
|
|
|
#if defined _ARCH_PWR6
|
|
|
|
# ifndef __copysign
/* Combine the magnitude of X with the sign of Y using the fcpsgn
   instruction and yield the result as a double.  Y is passed as the
   first source operand (the sign donor) and X as the second.  Each
   argument is expanded exactly once.  */
#  define __copysign(x, y) \
  ({ \
    double __res; \
    __asm __volatile (" fcpsgn %0,%1,%2\n" \
                      : "=f" (__res) \
                      : "f" (y), "f" (x)); \
    __res; \
  })
# endif
|
|
# ifndef __copysignf
/* Combine the magnitude of X with the sign of Y using the fcpsgn
   instruction, then apply frsp to round the register to single
   precision; the result is a float.  Y is passed as the first source
   operand (the sign donor) and X as the second.  Each argument is
   expanded exactly once.  */
#  define __copysignf(x, y) \
  ({ \
    float __res; \
    __asm __volatile (" fcpsgn %0,%1,%2\n" \
                      " frsp %0,%0\n" \
                      : "=f" (__res) \
                      : "f" (y), "f" (x)); \
    __res; \
  })
# endif
|
|
|
|
#endif /* defined _ARCH_PWR6 */
|
|
|
|
#endif /* _PPC_MATH_PRIVATE_H_ */
|