388 lines
12 KiB
C
388 lines
12 KiB
C
/* strstr with SSE4.2 intrinsics
|
|
Copyright (C) 2009, 2010 Free Software Foundation, Inc.
|
|
Contributed by Intel Corporation.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, write to the Free
|
|
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
|
02111-1307 USA. */
|
|
|
|
#include <nmmintrin.h>
|
|
#include "varshift.h"
|
|
|
|
/* Name under which the search function in this file is defined.  It
   defaults to __strstr_sse42 unless STRSTR_SSE42 was already defined
   before this point.  */
#ifndef STRSTR_SSE42
|
|
# define STRSTR_SSE42 __strstr_sse42
|
|
#endif
|
|
|
|
/* LOADBYTE (C) yields the byte value that is actually compared and
   CMPBYTE (C1, C2) tests two bytes for equality.  When
   USE_AS_STRCASESTR is defined both go through tolower; otherwise the
   bytes are used unchanged.  */
#ifdef USE_AS_STRCASESTR
|
|
# include <ctype.h>
|
|
# include <locale/localeinfo.h>
|
|
|
|
# define LOADBYTE(C) tolower (C)
|
|
# define CMPBYTE(C1, C2) (tolower (C1) == tolower (C2))
|
|
#else
|
|
# define LOADBYTE(C) (C)
|
|
# define CMPBYTE(C1, C2) ((C1) == (C2))
|
|
#endif
|
|
|
|
/* We use 0xc ordered-compare:
|
|
_SIDD_UBYTE_OPS
|
|
| _SIDD_CMP_EQUAL_ORDERED
|
|
| _SIDD_LEAST_SIGNIFICANT
|
|
on pcmpistri to do the scanning and string comparison requirements of
|
|
sub-string match. In the scanning phase, we process Cflag and ECX
|
|
index to locate the first fragment match; once the first fragment
|
|
match position has been identified, we do comparison of subsequent
|
|
string fragments until we can conclude false or true match; when
|
|
concluding a false match, we may need to repeat scanning process
|
|
from next relevant offset in the target string.
|
|
|
|
In the scanning phase we have 4 cases:
|
|
case ECX CFlag ZFlag SFlag
|
|
1 16 0 0 0
|
|
2a 16 0 0 1
|
|
2b 16 0 1 0
|
|
2c 16 0 1 1
|
|
|
|
1. No ordered-comparison match, both 16B fragments are valid, so
|
|
continue to next fragment.
|
|
2. No ordered-comparison match, there is EOS in either fragment,
|
|
2a. Zflg = 0, Sflg = 1, we continue
|
|
2b. Zflg = 1, Sflg = 0, we conclude no match and return.
|
|
2c. Zflg = 1, Sflg = 1, length determines match or no match
|
|
|
|
In the string comparison phase, the 1st fragment match is fixed up
|
|
to produce ECX = 0. Subsequent fragment compare of nonzero index
|
|
and no match conclude a false match.
|
|
|
|
case ECX CFlag ZFlag SFlag
|
|
3 X 1 0 0/1
|
|
4a 0 1 0 0
|
|
4b 0 1 0 1
|
|
4c 0 < X 1 0 0/1
|
|
5 16 0 1 0
|
|
|
|
3. An initial ordered-comparison fragment match, we fix up to do
|
|
subsequent string comparison
|
|
4a. Continuation of fragment comparison of a string compare.
|
|
4b. EOS reached in the reference string, we conclude true match and
|
|
return
|
|
4c. String compare failed if index is nonzero, we need to go back to
|
|
scanning
|
|
5. failed string compare, go back to scanning
|
|
*/
|
|
|
|
/* Simple replacement of movdqu to address 4KB boundary cross issue.
|
|
If EOS occurs within less than 16B before 4KB boundary, we don't
|
|
cross to next page. */
|
|
|
|
static inline __m128i
|
|
__m128i_strloadu (const unsigned char * p)
|
|
{
|
|
int offset = ((size_t) p & (16 - 1));
|
|
|
|
if (offset && (int) ((size_t) p & 0xfff) > 0xff0)
|
|
{
|
|
__m128i a = _mm_load_si128 ((__m128i *) (p - offset));
|
|
__m128i zero = _mm_setzero_si128 ();
|
|
int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, zero));
|
|
if ((bmsk >> offset) != 0)
|
|
return __m128i_shift_right (a, offset);
|
|
}
|
|
return _mm_loadu_si128 ((__m128i *) p);
|
|
}
|
|
|
|
#if defined USE_AS_STRCASESTR && !defined STRCASESTR_NONASCII

/* Byte constants replicated over a 64-bit lane: UCLOW is 'A' - 1,
   UCHIGH is 'Z' + 1 and LCQWORD is the 0x20 bit that distinguishes
   ASCII lower case from upper case.  */
# define UCLOW 0x4040404040404040ULL
# define UCHIGH 0x5b5b5b5b5b5b5b5bULL
# define LCQWORD 0x2020202020202020ULL

/* Like __m128i_strloadu, but with ASCII upper-case letters folded to
   lower case (POSIX/C locale behaviour).

   P is the address to load from.  RANGEUC and U2LDELTA are accepted to
   keep the call signature but are not referenced by this body.
   Returns the loaded fragment with every byte in 'A'..'Z' OR-ed with
   0x20; all other bytes are unchanged.  */
static inline __m128i
__m128i_strloadu_tolower (const unsigned char *p, __m128i rangeuc,
			  __m128i u2ldelta)
{
  __m128i chunk = __m128i_strloadu (p);

  /* 0xff in each byte lane that is <= 'Z' (signed compare against
     'Z' + 1).  */
  __m128i not_above_z = _mm_cmpgt_epi8 (_mm_set1_epi64x (UCHIGH), chunk);
  /* 0xff in each byte lane that is >= 'A' (signed compare against
     'A' - 1).  */
  __m128i not_below_a = _mm_cmpgt_epi8 (chunk, _mm_set1_epi64x (UCLOW));
  /* Lanes holding an upper-case ASCII letter.  */
  __m128i is_upper = _mm_and_si128 (not_above_z, not_below_a);
  /* The 0x20 bit, present only in the upper-case lanes.  */
  __m128i case_bit = _mm_and_si128 (is_upper, _mm_set1_epi64x (LCQWORD));

  return _mm_or_si128 (chunk, case_bit);
}

#endif
|
|
|
|
/* Calculate the Knuth-Morris-Pratt (KMP) overlap for a fully populated
   16B vector.

   S2 is the first 16 bytes loaded from the reference string of a
   strstr call.  The KMP algorithm is not used if the reference string
   is shorter than 16B.

   Returns the number of bytes the search may skip ahead after a failed
   compare:
     16      byte 0 does not occur again in bytes 1..15;
     1       bytes 1..15 are all equal to byte 0;
     3       byte 1 equals byte 0 but some later byte differs;
     k + 1   otherwise, where k + 1 is the index of the first later
	     byte equal to byte 0.  */
static int
__inline__ __attribute__ ((__always_inline__))
KMP16Bovrlap (__m128i s2)
{
  /* Broadcast byte 0 of s2 into all 16 lanes of A.  */
  __m128i b = _mm_unpacklo_epi8 (s2, s2);
  __m128i a = _mm_unpacklo_epi8 (b, b);
  a = _mm_shuffle_epi32 (a, 0);
  /* Shift s2 down by one byte so that lane i holds original byte i + 1
     (lane 15 becomes zero).  */
  b = _mm_srli_si128 (s2, sizeof (char));
  /* Bit i is set when byte i + 1 of s2 equals byte 0.  */
  int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (b, a));

  if (!bmsk)
    return 16;
  if (bmsk == 0x7fff)
    return 1;

  /* bmsk is known to be non-zero here, so the bit scan is well defined.
     The previous code ran a bsf instruction before the zero check,
     where its result is undefined.  */
  int k1 = __builtin_ctz (bmsk);
  if (!k1)
    {
      /* There are at least two distinct chars in s2.  If bytes 0 and 1
	 are identical and the distinct value lies farther down, we can
	 deduce that the next byte offset to restart a full compare is
	 no earlier than byte 3.  */
      return 3;
    }

  /* Byte 1 is not degenerated to byte 0.  */
  return k1 + 1;
}
|
|
|
|
/* Search the NUL-terminated string S1 for the first occurrence of the
   NUL-terminated string S2.

   Returns a pointer into S1 at the start of the match, S1 itself when
   S2 is empty, or NULL when S2 does not occur.

   The search compares 16-byte fragments with the _mm_cmpistr*
   intrinsics; cmp_c, cmp_z and cmp_s below hold the carry, zero and
   sign flag results and cmp holds the index result (see the case
   tables in the comment near the top of this file).  When
   USE_AS_STRCASESTR is defined, bytes go through LOADBYTE, CMPBYTE and
   the lower-casing fragment loader before they are compared.  */
char *
|
|
__attribute__ ((section (".text.sse4.2")))
|
|
STRSTR_SSE42 (const unsigned char *s1, const unsigned char *s2)
|
|
{
|
|
/* p1 is a macro alias for the parameter s1, so every update of p1
   below advances s1 itself.  NOTE(review): neither p1 nor strloadu is
   #undef'd anywhere in the visible source.  */
#define p1 s1
|
|
const unsigned char *p2 = s2;
|
|
|
|
/* Trivial cases; this whole group is compiled out when
   STRCASESTR_NONASCII is defined.  */
#ifndef STRCASESTR_NONASCII
|
|
/* An empty s2 matches at the start of s1.  */
if (__builtin_expect (p2[0] == '\0', 0))
|
|
return (char *) p1;
|
|
|
|
/* s2 is non-empty here, so an empty s1 cannot contain it.  */
if (__builtin_expect (p1[0] == '\0', 0))
|
|
return NULL;
|
|
|
|
/* Check if p1 length is 1 byte long. */
|
|
if (__builtin_expect (p1[1] == '\0', 0))
|
|
/* A one-byte s1 matches only a one-byte s2 holding the same byte
   (as judged by CMPBYTE).  */
return p2[1] == '\0' && CMPBYTE (p1[0], p2[0]) ? (char *) p1 : NULL;
|
|
#endif
|
|
|
|
/* Choose the fragment loader behind the name strloadu.  */
#ifdef USE_AS_STRCASESTR
|
|
# ifndef STRCASESTR_NONASCII
|
|
/* Hand over to __strcasestr_sse42_nonascii when the locale word
   _NL_CTYPE_NONASCII_CASE is non-zero (presumably: the locale has case
   mappings outside ASCII -- confirm against localeinfo.h).  */
if (__builtin_expect (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_NONASCII_CASE)
|
|
!= 0, 0))
|
|
return __strcasestr_sse42_nonascii (s1, s2);
|
|
|
|
/* These two constants are only forwarded to __m128i_strloadu_tolower
   through the strloadu macro defined below.  */
const __m128i rangeuc = _mm_set_epi64x (0x0, 0x5a41);
|
|
const __m128i u2ldelta = _mm_set1_epi64x (0xe0e0e0e0e0e0e0e0);
|
|
# define strloadu(p) __m128i_strloadu_tolower (p, rangeuc, u2ldelta)
|
|
# else
|
|
# define strloadu __m128i_strloadu_tolower
|
|
# endif
|
|
#else
|
|
# define strloadu __m128i_strloadu
|
|
#endif
|
|
|
|
/* p1 > 1 byte long. Load up to 16 bytes of fragment. */
|
|
__m128i frag1 = strloadu (p1);
|
|
|
|
__m128i frag2;
|
|
if (p2[1] != '\0')
|
|
/* p2 is > 1 byte long. */
|
|
frag2 = strloadu (p2);
|
|
else
|
|
/* s2 is exactly one byte: build frag2 from that byte (through
   LOADBYTE) with all other lanes zero instead of loading 16 bytes.  */
frag2 = _mm_insert_epi8 (_mm_setzero_si128 (), LOADBYTE (p2[0]), 0);
|
|
|
|
/* Unsigned bytes, equal order, does frag2 has null? */
|
|
int cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
|
int cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
|
int cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
|
int cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
|
|
/* The sign flag (frag2 holds a terminator) and the carry flag (a
   candidate match at index cmp) are both set.  */
if (cmp_s & cmp_c)
|
|
{
|
|
/* len becomes the index of the first zero byte in frag2, i.e. the
   number of s2 bytes held in the fragment.  */
int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (frag2,
|
|
_mm_setzero_si128 ()));
|
|
int len;
|
|
__asm ("bsfl %[bmsk], %[len]"
|
|
: [len] "=r" (len) : [bmsk] "r" (bmsk));
|
|
p1 += cmp;
|
|
/* The len bytes of s2 starting at index cmp fit inside the 16-byte
   fragment that was compared, so the candidate is returned as the
   match.  */
if ((len + cmp) <= 16)
|
|
return (char *) p1;
|
|
|
|
/* Otherwise reload s1 at the candidate position and compare again.  */
/* Load up to 16 bytes of fragment. */
|
|
frag1 = strloadu (p1);
|
|
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
|
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
|
|
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
|
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
|
if ((len + cmp) <= 16)
|
|
return (char *) p1 + cmp;
|
|
}
|
|
|
|
/* frag2 holds a terminator, so s2 is shorter than 16 bytes and is
   handled entirely by this branch (every path below returns).  */
if (cmp_s)
|
|
{
|
|
/* Adjust addr for 16B alignment in ensuing loop. */
|
|
/* Keep stepping p1 by the reported index until frag1 holds a
   terminator (cmp_z) or a match at index 0 is reported.  */
while (!cmp_z)
|
|
{
|
|
p1 += cmp;
|
|
/* Load up to 16 bytes of fragment. */
|
|
frag1 = strloadu (p1);
|
|
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
|
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
|
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
|
/* Because s2 < 16 bytes and we adjusted p1 by non-zero cmp
|
|
once already, this time cmp will be zero and we can exit. */
|
|
if ((!cmp) & cmp_c)
|
|
break;
|
|
}
|
|
|
|
if (!cmp_c)
|
|
return NULL;
|
|
|
|
/* Since s2 is less than 16 bytes, cmp_c is definitive
|
|
determination of full match. */
|
|
return (char *) p1 + cmp;
|
|
}
|
|
|
|
/* General case, s2 is at least 16 bytes or more.
|
|
First, the common case of false-match at first byte of p2. */
|
|
/* pt records where in s1 the candidate match currently being verified
   starts; kmp_fwd caches the KMP16Bovrlap result (0 means not yet
   computed).  */
const unsigned char *pt = NULL;
|
|
int kmp_fwd = 0;
|
|
/* Scanning phase; re-entered through the goto at the end of the
   function after a candidate turned out to be a false match.  */
re_trace:
|
|
while (!cmp_c)
|
|
{
|
|
/* frag1 has null. */
|
|
if (cmp_z)
|
|
return NULL;
|
|
|
|
/* frag 1 has no null, advance 16 bytes. */
|
|
p1 += 16;
|
|
/* Load up to 16 bytes of fragment. */
|
|
frag1 = strloadu (p1);
|
|
/* Unsigned bytes, equal order, is there a partial match? */
|
|
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
|
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
|
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
|
}
|
|
|
|
/* Next, handle initial positive match as first byte of p2. We have
|
|
a partial fragment match, make full determination until we reached
|
|
end of s2. */
|
|
if (!cmp)
|
|
{
|
|
/* Match reported at index 0 while frag1 also holds a terminator:
   p1 is returned as the match.  */
if (cmp_z)
|
|
return (char *) p1;
|
|
|
|
/* Remember the candidate start and move both strings on to their
   next 16-byte fragment.  */
pt = p1;
|
|
p1 += 16;
|
|
p2 += 16;
|
|
/* Load up to 16 bytes of fragment. */
|
|
frag2 = strloadu (p2);
|
|
}
|
|
else
|
|
{
|
|
/* Adjust 16B alignment. */
|
|
/* The candidate starts cmp bytes into the fragment; restart the
   compare from there against the first fragment of s2.  */
p1 += cmp;
|
|
pt = p1;
|
|
}
|
|
|
|
/* Load up to 16 bytes of fragment. */
|
|
frag1 = strloadu (p1);
|
|
|
|
/* Unsigned bytes, equal order, does frag2 has null? */
|
|
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
|
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
|
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
|
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
|
|
/* String-compare phase: keep going fragment by fragment while the
   index is 0 and neither the zero flag nor the sign flag is set.  */
while (!(cmp | cmp_z | cmp_s))
|
|
{
|
|
p1 += 16;
|
|
p2 += 16;
|
|
/* Load up to 16 bytes of fragment. */
|
|
frag2 = strloadu (p2);
|
|
/* Load up to 16 bytes of fragment. */
|
|
frag1 = strloadu (p1);
|
|
/* Unsigned bytes, equal order, does frag2 has null? */
|
|
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
|
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
|
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
|
cmp_s = _mm_cmpistrs (frag2, frag1, 0x0c);
|
|
}
|
|
|
|
/* Full determination yielded a false result, retrace s1 to next
|
|
starting position.
|
|
Zflg 1 0 1 0/1
|
|
Sflg 0 1 1 0/1
|
|
cmp na 0 0 >0
|
|
action done done continue continue if s2 < s1
|
|
false match retrace s1 else false
|
|
*/
|
|
|
|
/* Sign flag set with index 0: the candidate recorded in pt is
   returned as the match.  */
if (cmp_s & !cmp)
|
|
return (char *) pt;
|
|
if (cmp_z)
|
|
{
|
|
/* Zero flag set without the sign flag: no match.  */
if (!cmp_s)
|
|
return NULL;
|
|
|
|
/* Handle both zero and sign flag set and s1 is shorter in
|
|
length. */
|
|
/* len and len1 become the index of the first zero byte in frag2 and
   frag1 respectively.  */
__m128i zero = _mm_setzero_si128 ();
|
|
int bmsk = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag2));
|
|
int bmsk1 = _mm_movemask_epi8 (_mm_cmpeq_epi8 (zero, frag1));
|
|
int len;
|
|
int len1;
|
|
__asm ("bsfl %[bmsk], %[len]"
|
|
: [len] "=r" (len) : [bmsk] "r" (bmsk));
|
|
__asm ("bsfl %[bmsk1], %[len1]"
|
|
: [len1] "=r" (len1) : [bmsk1] "r" (bmsk1));
|
|
/* The rest of s2 is at least as long as the rest of s1, so no later
   match is possible.  */
if (len >= len1)
|
|
return NULL;
|
|
}
|
|
else if (!cmp)
|
|
return (char *) pt;
|
|
|
|
/* Otherwise, we have to retrace and continue. Default of multiple
|
|
paths that need to retrace from next byte in s1. */
|
|
p2 = s2;
|
|
frag2 = strloadu (p2);
|
|
|
|
/* Compute the skip distance once, from the first fragment of s2.  */
if (!kmp_fwd)
|
|
kmp_fwd = KMP16Bovrlap (frag2);
|
|
|
|
/* KMP algorithm predicted overlap needs to be corrected for
|
|
partial fragment compare. */
|
|
/* Restart from the candidate start plus the smaller of kmp_fwd and
   cmp.  */
p1 = pt + (kmp_fwd > cmp ? cmp : kmp_fwd);
|
|
|
|
/* Since s2 is at least 16 bytes long, we're certain there is no
|
|
match. */
|
|
if (p1[0] == '\0')
|
|
return NULL;
|
|
|
|
/* Load up to 16 bytes of fragment. */
|
|
frag1 = strloadu (p1);
|
|
|
|
/* Unsigned bytes, equal order, is there a partial match? */
|
|
cmp_c = _mm_cmpistrc (frag2, frag1, 0x0c);
|
|
cmp = _mm_cmpistri (frag2, frag1, 0x0c);
|
|
cmp_z = _mm_cmpistrz (frag2, frag1, 0x0c);
|
|
goto re_trace;
|
|
}
|