Add SSE4.2 support for strcspn, strpbrk, and strspn on x86-64.
This commit is contained in:
parent
241e680320
commit
06e51c8f3d
16
ChangeLog
16
ChangeLog
@ -1,3 +1,19 @@
|
||||
2009-07-02 H.J. Lu <hongjiu.lu@intel.com>
|
||||
|
||||
* config.h.in (HAVE_SSE4_SUPPORT): New macro.
|
||||
* config.make.in (config-cflags-sse4): New variable.
|
||||
* configure.in: Substitute libc_cv_cc_sse4.
|
||||
* sysdeps/i386/configure.in: Set libc_cv_cc_sse4 and
|
||||
HAVE_SSE4_SUPPORT.
|
||||
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
|
||||
strcspn-c, strpbrk-c, strspn-c for string if gcc supports SSE4.
|
||||
* sysdeps/x86_64/multiarch/strcspn-c.c: New file.
|
||||
* sysdeps/x86_64/multiarch/strcspn.S: New file.
|
||||
* sysdeps/x86_64/multiarch/strpbrk-c.c: New file.
|
||||
* sysdeps/x86_64/multiarch/strpbrk.S: New file.
|
||||
* sysdeps/x86_64/multiarch/strspn-c.c: New file.
|
||||
* sysdeps/x86_64/multiarch/strspn.S: New file.
|
||||
|
||||
2009-06-30 H.J. Lu <hongjiu.lu@intel.com>
|
||||
|
||||
* elf/Makefile (distribute): Remove tst-audit.sh. Add
|
||||
|
@ -129,6 +129,9 @@
|
||||
/* Define if binutils support TLS handling. */
|
||||
#undef HAVE_TLS_SUPPORT
|
||||
|
||||
/* Define if gcc supports SSE4. */
|
||||
#undef HAVE_SSE4_SUPPORT
|
||||
|
||||
/* Define if the compiler's exception support is based on libunwind. */
|
||||
#undef HAVE_CC_WITH_LIBUNWIND
|
||||
|
||||
|
@ -34,6 +34,8 @@ config-sysdirs = @sysnames@
|
||||
cflags-cpu = @libc_cv_cc_submachine@
|
||||
asflags-cpu = @libc_cv_cc_submachine@
|
||||
|
||||
config-cflags-sse4 = @libc_cv_cc_sse4@
|
||||
|
||||
defines = @DEFINES@
|
||||
sysincludes = @SYSINCLUDES@
|
||||
c++-sysincludes = @CXX_SYSINCLUDES@
|
||||
|
2
configure
vendored
2
configure
vendored
@ -657,6 +657,7 @@ xcoff
|
||||
elf
|
||||
ldd_rewrite_script
|
||||
use_ldconfig
|
||||
libc_cv_cc_sse4
|
||||
libc_cv_cpp_asm_debuginfo
|
||||
libc_cv_forced_unwind
|
||||
libc_cv_rootsbindir
|
||||
@ -8744,6 +8745,7 @@ fi
|
||||
|
||||
|
||||
|
||||
|
||||
if test $elf = yes; then
|
||||
cat >>confdefs.h <<\_ACEOF
|
||||
#define HAVE_ELF 1
|
||||
|
@ -2259,6 +2259,7 @@ AC_SUBST(libc_cv_forced_unwind)
|
||||
|
||||
dnl sysdeps/CPU/configure.in checks set this via arch-specific asm tests
|
||||
AC_SUBST(libc_cv_cpp_asm_debuginfo)
|
||||
AC_SUBST(libc_cv_cc_sse4)
|
||||
|
||||
AC_SUBST(use_ldconfig)
|
||||
AC_SUBST(ldd_rewrite_script)
|
||||
|
71
sysdeps/i386/configure
vendored
71
sysdeps/i386/configure
vendored
@ -1,10 +1,42 @@
|
||||
as_nl='
|
||||
'
|
||||
export as_nl
|
||||
# Printing a long string crashes Solaris 7 /usr/bin/printf.
|
||||
as_echo='\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\\'
|
||||
as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo
|
||||
as_echo=$as_echo$as_echo$as_echo$as_echo$as_echo$as_echo
|
||||
if (test "X`printf %s $as_echo`" = "X$as_echo") 2>/dev/null; then
|
||||
as_echo='printf %s\n'
|
||||
as_echo_n='printf %s'
|
||||
else
|
||||
if test "X`(/usr/ucb/echo -n -n $as_echo) 2>/dev/null`" = "X-n $as_echo"; then
|
||||
as_echo_body='eval /usr/ucb/echo -n "$1$as_nl"'
|
||||
as_echo_n='/usr/ucb/echo -n'
|
||||
else
|
||||
as_echo_body='eval expr "X$1" : "X\\(.*\\)"'
|
||||
as_echo_n_body='eval
|
||||
arg=$1;
|
||||
case $arg in
|
||||
*"$as_nl"*)
|
||||
expr "X$arg" : "X\\(.*\\)$as_nl";
|
||||
arg=`expr "X$arg" : ".*$as_nl\\(.*\\)"`;;
|
||||
esac;
|
||||
expr "X$arg" : "X\\(.*\\)" | tr -d "$as_nl"
|
||||
'
|
||||
export as_echo_n_body
|
||||
as_echo_n='sh -c $as_echo_n_body as_echo'
|
||||
fi
|
||||
export as_echo_body
|
||||
as_echo='sh -c $as_echo_body as_echo'
|
||||
fi
|
||||
|
||||
# This file is generated from configure.in by Autoconf. DO NOT EDIT!
|
||||
# Local configure fragment for sysdeps/i386.
|
||||
|
||||
echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
|
||||
echo $ECHO_N "checking if -g produces usable source locations for assembler-with-cpp... $ECHO_C" >&6
|
||||
{ $as_echo "$as_me:$LINENO: checking if -g produces usable source locations for assembler-with-cpp" >&5
|
||||
$as_echo_n "checking if -g produces usable source locations for assembler-with-cpp... " >&6; }
|
||||
if test "${libc_cv_cpp_asm_debuginfo+set}" = set; then
|
||||
echo $ECHO_N "(cached) $ECHO_C" >&6
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
cat > conftest.S <<EOF
|
||||
#include "confdefs.h"
|
||||
@ -27,7 +59,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5'
|
||||
{ (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
|
||||
(eval $ac_try) 2>&5
|
||||
ac_status=$?
|
||||
echo "$as_me:$LINENO: \$? = $ac_status" >&5
|
||||
$as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
|
||||
(exit $ac_status); }; } && {
|
||||
ac_pattern='conftest\.S'
|
||||
{ ac_try='readelf --debug-dump=line conftest.o |
|
||||
@ -35,7 +67,7 @@ if { ac_try='${CC-cc} $CPPFLAGS $ASFLAGS -g -c conftest.S 1>&5'
|
||||
{ (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
|
||||
(eval $ac_try) 2>&5
|
||||
ac_status=$?
|
||||
echo "$as_me:$LINENO: \$? = $ac_status" >&5
|
||||
$as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
|
||||
(exit $ac_status); }; }
|
||||
}; then
|
||||
libc_cv_cpp_asm_debuginfo=yes
|
||||
@ -44,11 +76,36 @@ else
|
||||
fi
|
||||
rm -f conftest*
|
||||
fi
|
||||
echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5
|
||||
echo "${ECHO_T}$libc_cv_cpp_asm_debuginfo" >&6
|
||||
{ $as_echo "$as_me:$LINENO: result: $libc_cv_cpp_asm_debuginfo" >&5
|
||||
$as_echo "$libc_cv_cpp_asm_debuginfo" >&6; }
|
||||
if test $libc_cv_cpp_asm_debuginfo = yes; then
|
||||
cat >>confdefs.h <<\_ACEOF
|
||||
#define HAVE_CPP_ASM_DEBUGINFO 1
|
||||
_ACEOF
|
||||
|
||||
fi
|
||||
|
||||
{ $as_echo "$as_me:$LINENO: checking for SSE4 support" >&5
|
||||
$as_echo_n "checking for SSE4 support... " >&6; }
|
||||
if test "${libc_cv_cc_sse4+set}" = set; then
|
||||
$as_echo_n "(cached) " >&6
|
||||
else
|
||||
if { ac_try='${CC-cc} -msse4 -xc /dev/null -S -o /dev/null'
|
||||
{ (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
|
||||
(eval $ac_try) 2>&5
|
||||
ac_status=$?
|
||||
$as_echo "$as_me:$LINENO: \$? = $ac_status" >&5
|
||||
(exit $ac_status); }; }; then
|
||||
libc_cv_cc_sse4=yes
|
||||
else
|
||||
libc_cv_cc_sse4=no
|
||||
fi
|
||||
fi
|
||||
{ $as_echo "$as_me:$LINENO: result: $libc_cv_cc_sse4" >&5
|
||||
$as_echo "$libc_cv_cc_sse4" >&6; }
|
||||
if test $libc_cv_cc_sse4 = yes; then
|
||||
cat >>confdefs.h <<\_ACEOF
|
||||
#define HAVE_SSE4_SUPPORT 1
|
||||
_ACEOF
|
||||
|
||||
fi
|
||||
|
@ -33,3 +33,14 @@ rm -f conftest*])AC_SUBST(libc_cv_cpp_asm_debuginfo)
|
||||
if test $libc_cv_cpp_asm_debuginfo = yes; then
|
||||
AC_DEFINE(HAVE_CPP_ASM_DEBUGINFO)
|
||||
fi
|
||||
|
||||
dnl Check if -msse4 works.
|
||||
AC_CACHE_CHECK(for SSE4 support, libc_cv_cc_sse4, [dnl
|
||||
if AC_TRY_COMMAND([${CC-cc} -msse4 -xc /dev/null -S -o /dev/null]); then
|
||||
libc_cv_cc_sse4=yes
|
||||
else
|
||||
libc_cv_cc_sse4=no
|
||||
fi])
|
||||
if test $libc_cv_cc_sse4 = yes; then
|
||||
AC_DEFINE(HAVE_SSE4_SUPPORT)
|
||||
fi
|
||||
|
@ -5,4 +5,10 @@ endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
sysdep_routines += stpncpy-c strncpy-c strncmp-c
|
||||
ifeq (yes,$(config-cflags-sse4))
|
||||
sysdep_routines += strcspn-c strpbrk-c strspn-c
|
||||
CFLAGS-strcspn-c.c += -msse4
|
||||
CFLAGS-strpbrk-c.c += -msse4
|
||||
CFLAGS-strspn-c.c += -msse4
|
||||
endif
|
||||
endif
|
||||
|
331
sysdeps/x86_64/multiarch/strcspn-c.c
Normal file
331
sysdeps/x86_64/multiarch/strcspn-c.c
Normal file
@ -0,0 +1,331 @@
|
||||
/* strcspn with SSE4.2 intrinsics
|
||||
Copyright (C) 2009 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <nmmintrin.h>
|
||||
#include <string.h>
|
||||
|
||||
/* We use 0x2:
|
||||
_SIDD_SBYTE_OPS
|
||||
| _SIDD_CMP_EQUAL_ANY
|
||||
| _SIDD_POSITIVE_POLARITY
|
||||
| _SIDD_LEAST_SIGNIFICANT
|
||||
on pcmpistri to compare xmm/mem128
|
||||
|
||||
0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
X X X X X X X X X X X X X X X X
|
||||
|
||||
against xmm
|
||||
|
||||
0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
A A A A A A A A A A A A A A A A
|
||||
|
||||
to find out if the first 16byte data element has any byte A and
|
||||
the offset of the first byte. There are 3 cases:
|
||||
|
||||
1. The first 16byte data element has the byte A at the offset X.
|
||||
2. The first 16byte data element has EOS and doesn't have the byte A.
|
||||
3. The first 16byte data element is valid and doesn't have the byte A.
|
||||
|
||||
Here is the table of ECX, CFlag, ZFlag and SFlag for 3 cases:
|
||||
|
||||
case ECX CFlag ZFlag SFlag
|
||||
1 X 1 0/1 0
|
||||
2 16 0 1 0
|
||||
3 16 0 0 0
|
||||
|
||||
We exit from the loop for cases 1 and 2 with jbe which branches
|
||||
when either CFlag or ZFlag is 1. If CFlag == 1, ECX has the offset
|
||||
X for case 1. */
|
||||
|
||||
#ifndef STRCSPN_SSE2
|
||||
#define STRCSPN_SSE2 __strcspn_sse2
|
||||
#define STRCSPN_SSE42 __strcspn_sse42
|
||||
#endif
|
||||
|
||||
extern
|
||||
#ifdef USE_AS_STRPBRK
|
||||
char *
|
||||
#else
|
||||
size_t
|
||||
#endif
|
||||
STRCSPN_SSE2 (const char *, const char *);
|
||||
|
||||
/* Expand OP (N) for N = 1 .. 15, i.e. once for every non-zero offset a
   pointer can have inside a 16-byte block.  The SSE byte-shift
   intrinsics only accept immediate counts, so a run-time offset has to
   be dispatched through one switch case per value.  */
#define STRCSPN_FOR_EACH_OFFSET(OP) \
  OP (1) OP (2) OP (3) OP (4) OP (5) OP (6) OP (7) OP (8) \
  OP (9) OP (10) OP (11) OP (12) OP (13) OP (14) OP (15)

/* Return PTR when built as strpbrk and COUNT when built as strcspn.
   Only the selected argument is evaluated.  */
#ifdef USE_AS_STRPBRK
# define STRCSPN_RETURN(ptr, count) return (ptr)
#else
# define STRCSPN_RETURN(ptr, count) return (count)
#endif

/* SSE4.2 implementation of strcspn, or of strpbrk when USE_AS_STRPBRK
   is defined.

   S is the string to scan and A is the set of bytes to look for.  The
   strcspn flavour returns the length of the initial part of S that
   contains no byte from A; the strpbrk flavour returns a pointer to
   the first byte of S that occurs in A, or NULL if there is none.

   A has to fit into a single 16-byte vector: when it is longer than
   16 bytes the call is handed over to STRCSPN_SSE2.  Every vector load
   is 16-byte aligned; an unaligned start is handled by loading the
   enclosing aligned block and shifting the leading bytes out.

   Comparison mode 0x3a applied to a vector against itself yields the
   index of its NUL terminator, or 16 when it holds none.  Mode 0x2 is
   the "equal any" search described at the top of this file.  */
#ifdef USE_AS_STRPBRK
char *
#else
size_t
#endif
__attribute__ ((section (".text.sse4.2")))
STRCSPN_SSE42 (const char *s, const char *a)
{
  const char *block;    /* 16-byte aligned read position.  */
  __m128i set;          /* The bytes of A, starting at element 0.  */
  __m128i chunk;        /* The 16 bytes of S under examination.  */
  int skew;             /* Offset of A, later of S, inside its block.  */

  /* Nothing can match an empty set.  */
  if (*a == 0)
    STRCSPN_RETURN (NULL, strlen (s));

  skew = (int) ((size_t) a & 15);
  if (skew == 0)
    {
      /* A is aligned, load it directly.  */
      set = _mm_load_si128 ((__m128i *) a);

      /* No NUL terminator in the vector and a further byte behind it:
         A is longer than 16 bytes, don't use SSE4.2.  */
      if (_mm_cmpistri (set, set, 0x3a) == 16 && a[16] != 0)
        return STRCSPN_SSE2 (s, a);
    }
  else
    {
      __m128i low, high;
      int set_len, high_len;

      /* Load the aligned block holding the start of A and move the
         bytes in front of A out of the vector.  */
      block = (const char *) ((size_t) a & 0xfffffffffffffff0L);
      low = _mm_load_si128 ((__m128i *) block);

      switch (skew)
        {
#define STRCSPN_SHIFT_SET(n) \
        case n: set = _mm_srli_si128 (low, n); break;
          STRCSPN_FOR_EACH_OFFSET (STRCSPN_SHIFT_SET)
#undef STRCSPN_SHIFT_SET
        }

      /* Find where the NUL terminator is.  */
      set_len = _mm_cmpistri (set, set, 0x3a);
      if (set_len == 16 - skew)
        {
          /* There is no NUL terminator in the first block, so A runs
             on into the next one.  */
          high = _mm_load_si128 ((__m128i *) (block + 16));
          high_len = _mm_cmpistri (high, high, 0x3a);

          /* Don't use SSE4.2 if the length of A > 16.  */
          if (set_len + high_len > 16)
            return STRCSPN_SSE2 (s, a);

          /* Combine both blocks unless A ended exactly on the block
             boundary.  */
          if (high_len != 0)
            switch (skew)
              {
#define STRCSPN_MERGE_SET(n) \
              case n: set = _mm_alignr_epi8 (high, low, n); break;
                STRCSPN_FOR_EACH_OFFSET (STRCSPN_MERGE_SET)
#undef STRCSPN_MERGE_SET
              }
        }
    }

  skew = (int) ((size_t) s & 15);
  if (skew == 0)
    block = s;
  else
    {
      int hit, end;

      /* Check the partial leading block of S.  */
      block = (const char *) ((size_t) s & 0xfffffffffffffff0L);
      chunk = _mm_load_si128 ((__m128i *) block);

      switch (skew)
        {
#define STRCSPN_SHIFT_CHUNK(n) \
        case n: chunk = _mm_srli_si128 (chunk, n); break;
          STRCSPN_FOR_EACH_OFFSET (STRCSPN_SHIFT_CHUNK)
#undef STRCSPN_SHIFT_CHUNK
        }

      hit = _mm_cmpistri (set, chunk, 0x2);
      /* No need to check ZFlag since ZFlag is always 1.  */
      if (_mm_cmpistrc (set, chunk, 0x2))
        STRCSPN_RETURN ((char *) (s + hit), hit);

      /* Find where the NUL terminator is.  */
      end = _mm_cmpistri (chunk, chunk, 0x3a);
      if (end < 16 - skew)
        STRCSPN_RETURN (NULL, end);

      block += 16;
    }

  /* Scan S one aligned 16-byte block at a time.  */
  for (;;)
    {
      int hit, found, ended;

      chunk = _mm_load_si128 ((__m128i *) block);
      hit = _mm_cmpistri (set, chunk, 0x2);
      found = _mm_cmpistrc (set, chunk, 0x2);
      ended = _mm_cmpistrz (set, chunk, 0x2);

      /* CFlag: a byte of A occurs at index HIT.  */
      if (found)
        STRCSPN_RETURN ((char *) (block + hit),
                        (size_t) (block + hit - s));

      /* ZFlag: S ends inside this block without a match; the count is
         the position of its NUL terminator.  */
      if (ended)
        STRCSPN_RETURN (NULL,
                        (size_t) (block
                                  + _mm_cmpistri (chunk, chunk, 0x3a)
                                  - s));

      block += 16;
    }
}

#undef STRCSPN_RETURN
#undef STRCSPN_FOR_EACH_OFFSET
|
82
sysdeps/x86_64/multiarch/strcspn.S
Normal file
82
sysdeps/x86_64/multiarch/strcspn.S
Normal file
@ -0,0 +1,82 @@
|
||||
/* Multiple versions of strcspn
|
||||
Copyright (C) 2009 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#ifdef HAVE_SSE4_SUPPORT
|
||||
|
||||
#include <sysdep.h>
|
||||
#include <ifunc-defines.h>
|
||||
|
||||
#ifdef USE_AS_STRPBRK
|
||||
#define STRCSPN_SSE42 __strpbrk_sse42
|
||||
#define STRCSPN_SSE2 __strpbrk_sse2
|
||||
#define __GI_STRCSPN __GI_strpbrk
|
||||
#else
|
||||
#ifndef STRCSPN
|
||||
#define STRCSPN strcspn
|
||||
#define STRCSPN_SSE42 __strcspn_sse42
|
||||
#define STRCSPN_SSE2 __strcspn_sse2
|
||||
#define __GI_STRCSPN __GI_strcspn
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Define multiple versions only for the definition in libc. Don't
|
||||
define multiple versions for strpbrk in static library since we
|
||||
need strpbrk before the initialization has happened. */
|
||||
#if (defined SHARED || !defined USE_AS_STRPBRK) && !defined NOT_IN_libc
|
||||
.text
|
||||
/* Indirect-function resolver for STRCSPN (strcspn, or strpbrk when this
   file is built with USE_AS_STRPBRK).  It returns in %rax the address
   of the implementation to use: STRCSPN_SSE2 by default, STRCSPN_SSE42
   when the feature bit tested below is set.  */
ENTRY(STRCSPN)
|
||||
.type STRCSPN, @gnu_indirect_function
|
||||
/* Call __init_cpu_features only while the KIND field of __cpu_features
   is still zero (presumably meaning "not initialized yet" -- confirm
   against init-arch).  */
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
|
||||
jne 1f
|
||||
/* NOTE(review): the stack pointer is not adjusted before this call;
   confirm that __init_cpu_features tolerates the resulting stack
   alignment.  */
call __init_cpu_features
|
||||
/* Start with the SSE2 version as the answer.  */
1: leaq STRCSPN_SSE2(%rip), %rax
|
||||
/* Test bit 20 of the ECX word stored for COMMON_CPUID_INDEX_1
   (presumably the SSE4.2 feature flag of CPUID leaf 1 -- confirm
   against the CPUID documentation).  */
testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
|
||||
/* Bit clear: keep the SSE2 version.  */
jz 2f
|
||||
/* Bit set: answer with the SSE4.2 version instead.  */
leaq STRCSPN_SSE42(%rip), %rax
|
||||
2: ret
|
||||
END(STRCSPN)
|
||||
|
||||
# undef ENTRY
|
||||
# define ENTRY(name) \
|
||||
.type STRCSPN_SSE2, @function; \
|
||||
.globl STRCSPN_SSE2; \
|
||||
.align 16; \
|
||||
STRCSPN_SSE2: cfi_startproc; \
|
||||
CALL_MCOUNT
|
||||
# undef END
|
||||
# define END(name) \
|
||||
cfi_endproc; .size STRCSPN_SSE2, .-STRCSPN_SSE2
|
||||
# undef libc_hidden_builtin_def
|
||||
/* It doesn't make sense to send libc-internal strcspn calls through a PLT.
|
||||
The speedup we get from using SSE4.2 instructions is likely eaten away
|
||||
by the indirect call in the PLT. */
|
||||
# define libc_hidden_builtin_def(name) \
|
||||
.globl __GI_STRCSPN; __GI_STRCSPN = STRCSPN_SSE2
|
||||
#endif
|
||||
|
||||
#endif /* HAVE_SSE4_SUPPORT */
|
||||
|
||||
#ifdef USE_AS_STRPBRK
|
||||
#include "../strpbrk.S"
|
||||
#else
|
||||
#include "../strcspn.S"
|
||||
#endif
|
4
sysdeps/x86_64/multiarch/strpbrk-c.c
Normal file
4
sysdeps/x86_64/multiarch/strpbrk-c.c
Normal file
@ -0,0 +1,4 @@
|
||||
/* SSE4.2 strpbrk.  The implementation is shared with strcspn:
   strcspn-c.c is compiled with USE_AS_STRPBRK defined, which makes it
   return a char * (or NULL) instead of a size_t, and with the
   function names replaced by their strpbrk counterparts.  */
#define USE_AS_STRPBRK
|
||||
#define STRCSPN_SSE2 __strpbrk_sse2
|
||||
#define STRCSPN_SSE42 __strpbrk_sse42
|
||||
#include "strcspn-c.c"
|
3
sysdeps/x86_64/multiarch/strpbrk.S
Normal file
3
sysdeps/x86_64/multiarch/strpbrk.S
Normal file
@ -0,0 +1,3 @@
|
||||
/* Multiple versions of strpbrk.  The code is shared with strcspn.S:
   STRCSPN names the symbol that file defines, and USE_AS_STRPBRK makes
   it refer to __strpbrk_sse2, __strpbrk_sse42 and __GI_strpbrk and
   include ../strpbrk.S instead of ../strcspn.S.  */
#define STRCSPN strpbrk
|
||||
#define USE_AS_STRPBRK
|
||||
#include "strcspn.S"
|
287
sysdeps/x86_64/multiarch/strspn-c.c
Normal file
287
sysdeps/x86_64/multiarch/strspn-c.c
Normal file
@ -0,0 +1,287 @@
|
||||
/* strspn with SSE4.2 intrinsics
|
||||
Copyright (C) 2009 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <nmmintrin.h>
|
||||
#include <string.h>
|
||||
|
||||
/* We use 0x12:
|
||||
_SIDD_SBYTE_OPS
|
||||
| _SIDD_CMP_EQUAL_ANY
|
||||
| _SIDD_NEGATIVE_POLARITY
|
||||
| _SIDD_LEAST_SIGNIFICANT
|
||||
on pcmpistri to compare xmm/mem128
|
||||
|
||||
0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
X X X X X X X X X X X X X X X X
|
||||
|
||||
against xmm
|
||||
|
||||
0 1 2 3 4 5 6 7 8 9 A B C D E F
|
||||
A A A A A A A A A A A A A A A A
|
||||
|
||||
to find out if the first 16byte data element has any non-A byte and
|
||||
the offset of the first byte. There are 2 cases:
|
||||
|
||||
1. The first 16byte data element has the non-A byte, including
|
||||
EOS, at the offset X.
|
||||
2. The first 16byte data element is valid and doesn't have the non-A
|
||||
byte.
|
||||
|
||||
Here is the table of ECX, CFlag, ZFlag and SFlag for 2 cases:
|
||||
|
||||
case ECX CFlag ZFlag SFlag
|
||||
1 X 1 0/1 0
|
||||
2 16 0 0 0
|
||||
|
||||
We exit from the loop for case 1. */
|
||||
|
||||
extern size_t __strspn_sse2 (const char *, const char *);
|
||||
|
||||
/* Expand OP (N) for N = 1 .. 15, i.e. once for every non-zero offset a
   pointer can have inside a 16-byte block.  The SSE byte-shift
   intrinsics only accept immediate counts, so a run-time offset has to
   be dispatched through one switch case per value.  */
#define STRSPN_FOR_EACH_OFFSET(OP) \
  OP (1) OP (2) OP (3) OP (4) OP (5) OP (6) OP (7) OP (8) \
  OP (9) OP (10) OP (11) OP (12) OP (13) OP (14) OP (15)

/* SSE4.2 implementation of strspn.

   S is the string to scan and A is the set of accepted bytes.  The
   result is the length of the initial part of S that consists only of
   bytes from A.

   A has to fit into a single 16-byte vector: when it is longer than
   16 bytes the call is handed over to __strspn_sse2.  Every vector
   load is 16-byte aligned; an unaligned start is handled by loading
   the enclosing aligned block and shifting the leading bytes out.

   Comparison mode 0x3a applied to a vector against itself yields the
   index of its NUL terminator, or 16 when it holds none.  Mode 0x12 is
   the negated "equal any" search described at the top of this file.  */
size_t
__attribute__ ((section (".text.sse4.2")))
__strspn_sse42 (const char *s, const char *a)
{
  const char *block;    /* 16-byte aligned read position.  */
  __m128i set;          /* The bytes of A, starting at element 0.  */
  __m128i chunk;        /* The 16 bytes of S under examination.  */
  int skew;             /* Offset of A, later of S, inside its block.  */

  /* No byte is accepted by an empty set.  */
  if (*a == 0)
    return 0;

  skew = (int) ((size_t) a & 15);
  if (skew == 0)
    {
      /* A is aligned, load it directly.  */
      set = _mm_load_si128 ((__m128i *) a);

      /* No NUL terminator in the vector and a further byte behind it:
         A is longer than 16 bytes, don't use SSE4.2.  */
      if (_mm_cmpistri (set, set, 0x3a) == 16 && a[16] != 0)
        return __strspn_sse2 (s, a);
    }
  else
    {
      __m128i low, high;
      int set_len, high_len;

      /* Load the aligned block holding the start of A and move the
         bytes in front of A out of the vector.  */
      block = (const char *) ((size_t) a & 0xfffffffffffffff0L);
      low = _mm_load_si128 ((__m128i *) block);

      switch (skew)
        {
#define STRSPN_SHIFT_SET(n) \
        case n: set = _mm_srli_si128 (low, n); break;
          STRSPN_FOR_EACH_OFFSET (STRSPN_SHIFT_SET)
#undef STRSPN_SHIFT_SET
        }

      /* Find where the NUL terminator is.  */
      set_len = _mm_cmpistri (set, set, 0x3a);
      if (set_len == 16 - skew)
        {
          /* There is no NUL terminator in the first block, so A runs
             on into the next one.  */
          high = _mm_load_si128 ((__m128i *) (block + 16));
          high_len = _mm_cmpistri (high, high, 0x3a);

          /* Don't use SSE4.2 if the length of A > 16.  */
          if (set_len + high_len > 16)
            return __strspn_sse2 (s, a);

          /* Combine both blocks unless A ended exactly on the block
             boundary.  */
          if (high_len != 0)
            switch (skew)
              {
#define STRSPN_MERGE_SET(n) \
              case n: set = _mm_alignr_epi8 (high, low, n); break;
                STRSPN_FOR_EACH_OFFSET (STRSPN_MERGE_SET)
#undef STRSPN_MERGE_SET
              }
        }
    }

  skew = (int) ((size_t) s & 15);
  if (skew == 0)
    block = s;
  else
    {
      int span, end;

      /* Check the partial leading block of S.  */
      block = (const char *) ((size_t) s & 0xfffffffffffffff0L);
      chunk = _mm_load_si128 ((__m128i *) block);

      switch (skew)
        {
#define STRSPN_SHIFT_CHUNK(n) \
        case n: chunk = _mm_srli_si128 (chunk, n); break;
          STRSPN_FOR_EACH_OFFSET (STRSPN_SHIFT_CHUNK)
#undef STRSPN_SHIFT_CHUNK
        }

      span = _mm_cmpistri (set, chunk, 0x12);
      /* No need to check CFlag since it is always 1.  */
      if (span < 16 - skew)
        return span;

      /* Find where the NUL terminator is.  */
      end = _mm_cmpistri (chunk, chunk, 0x3a);
      if (end < 16 - skew)
        return span;

      block += 16;
    }

  /* Scan S one aligned 16-byte block at a time.  CFlag is set as soon
     as a block holds a byte that is not in A, the end of S included;
     the index of that byte gives the result.  */
  for (;;)
    {
      int span;

      chunk = _mm_load_si128 ((__m128i *) block);
      span = _mm_cmpistri (set, chunk, 0x12);
      if (_mm_cmpistrc (set, chunk, 0x12))
        return (size_t) (block + span - s);

      block += 16;
    }
}

#undef STRSPN_FOR_EACH_OFFSET
|
63
sysdeps/x86_64/multiarch/strspn.S
Normal file
63
sysdeps/x86_64/multiarch/strspn.S
Normal file
@ -0,0 +1,63 @@
|
||||
/* Multiple versions of strspn
|
||||
Copyright (C) 2009 Free Software Foundation, Inc.
|
||||
Contributed by Intel Corporation.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 2.1 of the License, or (at your option) any later version.
|
||||
|
||||
The GNU C Library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with the GNU C Library; if not, write to the Free
|
||||
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||
02111-1307 USA. */
|
||||
|
||||
#include <config.h>
|
||||
|
||||
#ifdef HAVE_SSE4_SUPPORT
|
||||
|
||||
#include <sysdep.h>
|
||||
#include <ifunc-defines.h>
|
||||
|
||||
/* Define multiple versions only for the definition in libc. */
|
||||
#ifndef NOT_IN_libc
|
||||
.text
|
||||
/* Indirect-function resolver for strspn.  It returns in %rax the
   address of the implementation to use: __strspn_sse2 by default,
   __strspn_sse42 when the feature bit tested below is set.  */
ENTRY(strspn)
|
||||
.type strspn, @gnu_indirect_function
|
||||
/* Call __init_cpu_features only while the KIND field of __cpu_features
   is still zero (presumably meaning "not initialized yet" -- confirm
   against init-arch).  */
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
|
||||
jne 1f
|
||||
/* NOTE(review): the stack pointer is not adjusted before this call;
   confirm that __init_cpu_features tolerates the resulting stack
   alignment.  */
call __init_cpu_features
|
||||
/* Start with the SSE2 version as the answer.  */
1: leaq __strspn_sse2(%rip), %rax
|
||||
/* Test bit 20 of the ECX word stored for COMMON_CPUID_INDEX_1
   (presumably the SSE4.2 feature flag of CPUID leaf 1 -- confirm
   against the CPUID documentation).  */
testl $(1<<20), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
|
||||
/* Bit clear: keep the SSE2 version.  */
jz 2f
|
||||
/* Bit set: answer with the SSE4.2 version instead.  */
leaq __strspn_sse42(%rip), %rax
|
||||
2: ret
|
||||
END(strspn)
|
||||
|
||||
# undef ENTRY
|
||||
# define ENTRY(name) \
|
||||
.type __strspn_sse2, @function; \
|
||||
.globl __strspn_sse2; \
|
||||
.align 16; \
|
||||
__strspn_sse2: cfi_startproc; \
|
||||
CALL_MCOUNT
|
||||
# undef END
|
||||
# define END(name) \
|
||||
cfi_endproc; .size __strspn_sse2, .-__strspn_sse2
|
||||
# undef libc_hidden_builtin_def
|
||||
/* It doesn't make sense to send libc-internal strspn calls through a PLT.
|
||||
The speedup we get from using SSE4.2 instructions is likely eaten away
|
||||
by the indirect call in the PLT. */
|
||||
# define libc_hidden_builtin_def(name) \
|
||||
.globl __GI_strspn; __GI_strspn = __strspn_sse2
|
||||
#endif
|
||||
|
||||
#endif /* HAVE_SSE4_SUPPORT */
|
||||
|
||||
#include "../strspn.S"
|
Loading…
x
Reference in New Issue
Block a user