Ulrich Drepper d6485c981b Align functions to 16-byte boundary.
Some of the new multi-arch string functions for x86-64 were
not aligned to 16-byte boundaries, possibly creating unnecessary
cache line misses and delays.
2009-07-03 03:01:57 -07:00

1919 lines
39 KiB
ArmAsm

/* strcpy with SSSE3
Copyright (C) 2009 Free Software Foundation, Inc.
Contributed by Intel Corporation.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.
The GNU C Library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public
License along with the GNU C Library; if not, write to the Free
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
02111-1307 USA. */
#include <sysdep.h>
#include <ifunc-defines.h>
/* Symbol-name selection.  USE_AS_STPCPY and USE_AS_STRNCPY choose which of
   the four name sets below is used.  When neither is defined and STRCPY
   has not been defined already, the dispatcher symbol defaults to strcpy;
   for the other variants STRCPY is not defined here.  */
#if !defined (USE_AS_STPCPY) && !defined (USE_AS_STRNCPY)
# ifndef STRCPY
# define STRCPY strcpy
# endif
#endif
/* STRCPY_SSSE3 names the implementation defined in this file and
   STRCPY_SSE2 the one the dispatcher starts from.  The __GI_* names are
   not referenced in this part of the file -- presumably they are used
   further down; confirm there.  */
#ifdef USE_AS_STPCPY
# ifdef USE_AS_STRNCPY
# define STRCPY_SSSE3 __stpncpy_ssse3
# define STRCPY_SSE2 __stpncpy_sse2
# define __GI_STRCPY __GI_stpncpy
# else
# define STRCPY_SSSE3 __stpcpy_ssse3
# define STRCPY_SSE2 __stpcpy_sse2
# define __GI_STRCPY __GI_stpcpy
# define __GI___STRCPY __GI___stpcpy
# endif
#else
# ifdef USE_AS_STRNCPY
# define STRCPY_SSSE3 __strncpy_ssse3
# define STRCPY_SSE2 __strncpy_sse2
# define __GI_STRCPY __GI_strncpy
# else
# define STRCPY_SSSE3 __strcpy_ssse3
# define STRCPY_SSE2 __strcpy_sse2
# define __GI_STRCPY __GI_strcpy
# endif
#endif
/* LABEL(l) wraps every local label used in this file; by default it
   forwards to L(l), which is not defined here (presumably by sysdep.h).  */
#ifndef LABEL
#define LABEL(l) L(l)
#endif
/* Define multiple versions only for the definition in libc. */
#ifndef NOT_IN_libc
.text
/* IFUNC resolver for STRCPY.
   Out: %rax = address of the implementation to bind.
   Starts from STRCPY_SSE2 and switches to STRCPY_SSSE3 when bit 9 of the
   cached CPUID leaf-1 %ecx word is set, unless the kind/family/model
   checks below all match (the Atom exclusion).  */
ENTRY(STRCPY)
.type STRCPY, @gnu_indirect_function
/* A zero "kind" field is treated as "__cpu_features not filled in yet".
   NOTE(review): the call is made without adjusting %rsp, so the callee
   starts with a stack alignment that differs from what the SysV ABI
   specifies at a call site -- confirm __init_cpu_features tolerates it.  */
cmpl $0, __cpu_features+KIND_OFFSET(%rip)
jne 1f
call __init_cpu_features
1: leaq STRCPY_SSE2(%rip), %rax
/* Bit 9 of CPUID.1:ECX clear: keep the SSE2 version.  */
testl $(1<<9), __cpu_features+CPUID_OFFSET+COMMON_CPUID_INDEX_1*CPUID_SIZE+CPUID_ECX_OFFSET(%rip)
jz 3f
/* Avoid SSSE3 strcpy on Atom since it is slow. */
/* Matches kind == 1, family == 6, model == 28 (kind 1 is presumably the
   Intel value -- confirm against the cpu_features definitions).  */
cmpl $1, __cpu_features+KIND_OFFSET(%rip)
jne 2f
cmpl $6, __cpu_features+FAMILY_OFFSET(%rip)
jne 2f
cmpl $28, __cpu_features+MODEL_OFFSET(%rip)
jz 3f
2: leaq STRCPY_SSSE3(%rip), %rax
3: ret
END(STRCPY)
.section .text.ssse3,"ax",@progbits
/*
 * STRCPY_SSSE3 -- entry point of the SSSE3 copy routine (AT&T syntax).
 * In:   %rdi = destination, %rsi = source; with USE_AS_STRNCPY,
 *       %rdx = byte count.
 * Out:  %rax = the original %rdi, unless an exit path replaces it (the
 *       USE_AS_STPCPY tail code does).
 * Register roles established here and relied on by the copy loops:
 *   %r8    remaining byte count (USE_AS_STRNCPY only)
 *   %r9    offset of the source inside its 16-byte chunk
 *   %r10   16 - %r9
 *   %xmm0  all-zero comparand that also receives the NUL-search result
 *   %edx   byte mask of NUL positions from the last search
 */
STRCPY_SSSE3:
cfi_startproc
CALL_MCOUNT
/*
* This implementation uses SSE to copy up to 16 bytes at a time.
*/
#ifdef USE_AS_STRNCPY
test %rdx, %rdx
jz LABEL(strncpy_exitz) /* count of zero: nothing to copy */
mov %rdx, %r8
#else
xor %edx, %edx /* overwritten by pmovmskb below before it is read */
#endif
mov %esi, %ecx
and $0xfffffffffffffff0, %rsi /* force rsi to 16-byte alignment */
and $15, %ecx /* rcx = offset of the source inside its chunk */
mov %rdi, %rax /* store return value */
pxor %xmm0, %xmm0 /* clear %xmm0 */
pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx */
shr %cl, %edx /* drop the mask bits of bytes that precede the source start */
test %edx, %edx /* edx must be 0 if there is no null char from rsi+%rcx */
jnz LABEL(less16bytes)
#ifdef USE_AS_STRNCPY
lea -16(%r8,%rcx), %r11
/* The count is unsigned.  A count above 16 can never satisfy
   r8 + rcx <= 16, and it must not reach the signed test below: a count
   of 2^63 or more would look negative there and be sent to less16bytes
   with an empty NUL mask.  */
cmp $16, %r8
ja LABEL(strncpy_count_above_16)
test %r11, %r11
jle LABEL(less16bytes) /* if r8 + rcx <= 16, branch to less16bytes. */
LABEL(strncpy_count_above_16):
#endif
mov %rcx, %r9
or %edi, %ecx
and $15, %ecx
lea -16(%r9), %r10 /* lea leaves the flags from the and above intact */
jz LABEL(ashr_0) /* ecx must be 0 if both rsi and rdi are 16-byte aligned */
neg %r10 /* r10 = 16 - r9: bytes left in the first source chunk, used by unaligned_exit */
pxor %xmm0, %xmm0 /* clear %xmm0, may be polluted by unaligned operation */
pcmpeqb 16(%rsi), %xmm0 /* compare the next 16 bytes with %xmm0 for equality, try to find null char */
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(less32bytes)
/*
* at least 16 bytes are available to fill destination rdi
*/
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(less32bytes_strncpy_truncation)
#endif
/* Copy the first 16 source bytes (starting at rsi + r9) with two 8-byte
   moves; neither pointer needs to be aligned for these.  */
mov (%rsi, %r9), %rdx
mov %rdx, (%rdi)
mov 8(%rsi, %r9), %rdx
mov %rdx, 8(%rdi)
/*
* The destination rdi can now be aligned to 16 bytes; re-calculate rsi to
* jump to the corresponding case
* rcx is offset of rsi
* rax is offset of rdi
*/
and $0xfffffffffffffff0, %rdi /* force rdi to 16-byte alignment */
mov %rax, %rdx /* rax stores the original rdi */
xor %rdi, %rdx /* rdx = original rdi & 15 */
#ifdef USE_AS_STRNCPY
add %rdx, %r8 /* the next store starts rdx bytes before the end of the first 16 copied */
#endif
add $16, %rdi /* next 16 bytes for rdi */
sub %rdx, %r9
lea 16(%r9, %rsi), %rsi /* re-calculate rsi by (16 - rdx) + rcx */
mov %esi, %ecx /* store offset of rsi */
and $0xfffffffffffffff0, %rsi /* force rsi to 16-byte alignment */
and $15, %ecx /* ecx must be 0 if rdx is equal to rcx */
jz LABEL(ashr_0)
lea -16(%rcx), %r10
mov %rcx, %r9
neg %r10 /* r10 = 16 - new source offset */
/* Dispatch on the source offset: unaligned_table holds 32-bit offsets
   relative to the table itself.  */
lea LABEL(unaligned_table)(%rip), %r11
movslq (%r11, %rcx,4), %rcx
lea (%r11, %rcx), %rcx
jmp *%rcx
/*
* The following cases will be handled by ashr_0 & ashr_0_start
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* 0 0 0 ashr_0
* n(1~15) n(1~15) 0 ashr_0_start
*
* Aligned copy: both %rsi and %rdi are accessed with movdqa, %rcx is 0 on
* entry and %xmm0 is expected to be all-zero (every earlier NUL search
* came back empty).
*/
.p2align 5
LABEL(ashr_0):
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_aligned) /* 16 or fewer bytes left to copy */
#endif
movdqa (%rsi), %xmm1 /* fetch first 16 bytes from rsi */
movdqa %xmm1, (%rdi) /* store first 16 bytes into rdi */
add $16, %rsi
add $16, %rdi
pcmpeqb (%rsi), %xmm0 /* compare 16 bytes in (%rsi) and %xmm0 for equality, try to find null char */
pmovmskb %xmm0, %edx /* move each byte mask of %xmm0 to edx*/
test %edx, %edx /* edx must be 0 if there is no null char in rsi*/
jnz LABEL(aligned_16bytes)
/* Main loop, unrolled four times.  %rsi and %rdi stay fixed; %rcx is the
   running byte index.  Each step copies the chunk that was just found
   NUL-free and then searches the following chunk.  */
LABEL(ashr_0_loop):
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_aligned)
#endif
movdqa (%rsi, %rcx), %xmm1
movdqa %xmm1, (%rdi, %rcx)
add $16, %rcx
pcmpeqb (%rsi, %rcx), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(aligned_exit)
/* Step 2.  */
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_aligned)
#endif
movdqa (%rsi, %rcx), %xmm1
movdqa %xmm1, (%rdi, %rcx)
add $16, %rcx
pcmpeqb (%rsi, %rcx), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(aligned_exit)
/* Step 3.  */
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_aligned)
#endif
movdqa (%rsi, %rcx), %xmm1
movdqa %xmm1, (%rdi, %rcx)
add $16, %rcx
pcmpeqb (%rsi, %rcx), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(aligned_exit)
/* Step 4: loops back while no NUL has been seen.  */
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_aligned)
#endif
movdqa (%rsi, %rcx), %xmm1
movdqa %xmm1, (%rdi, %rcx)
add $16, %rcx
pcmpeqb (%rsi, %rcx), %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jz LABEL(ashr_0_loop)
jmp LABEL(aligned_exit)
.p2align 4
/*
* The following cases will be handled by ashr_15
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(15) n - 15 15 ((16 - (n - 15) + n) % 16) ashr_15
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 15, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_15):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_15_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 15: the tail of the
   current chunk joined with the head of the next one.  */
palignr $15, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $15, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_15_use_ssse3)
/*
* The following cases will be handled by ashr_14
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(14~15) n - 14 14 ((16 - (n - 14) + n) % 16) ashr_14
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 14, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_14):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_14_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 14: the tail of the
   current chunk joined with the head of the next one.  */
palignr $14, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $14, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_14_use_ssse3)
/*
* The following cases will be handled by ashr_13
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(13~15) n - 13 13 ((16 - (n - 13) + n) % 16) ashr_13
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 13, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_13):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_13_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 13: the tail of the
   current chunk joined with the head of the next one.  */
palignr $13, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $13, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_13_use_ssse3)
/*
* The following cases will be handled by ashr_12
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(12~15) n - 12 12 ((16 - (n - 12) + n) % 16) ashr_12
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 12, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_12):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_12_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 12: the tail of the
   current chunk joined with the head of the next one.  */
palignr $12, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $12, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_12_use_ssse3)
/*
* The following cases will be handled by ashr_11
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(11~15) n - 11 11 ((16 - (n - 11) + n) % 16) ashr_11
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 11, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_11):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_11_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 11: the tail of the
   current chunk joined with the head of the next one.  */
palignr $11, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $11, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_11_use_ssse3)
/*
* The following cases will be handled by ashr_10
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(10~15) n - 10 10 ((16 - (n - 10) + n) % 16) ashr_10
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 10, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_10):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_10_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 10: the tail of the
   current chunk joined with the head of the next one.  */
palignr $10, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $10, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_10_use_ssse3)
/*
* The following cases will be handled by ashr_9
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(9~15) n - 9 9 ((16 - (n - 9) + n) % 16) ashr_9
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 9, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_9):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_9_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 9: the tail of the
   current chunk joined with the head of the next one.  */
palignr $9, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $9, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_9_use_ssse3)
/*
* The following cases will be handled by ashr_8
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(8~15) n - 8 8 ((16 - (n - 8) + n) % 16) ashr_8
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 8, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_8):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_8_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 8: the tail of the
   current chunk joined with the head of the next one.  */
palignr $8, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $8, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_8_use_ssse3)
/*
* The following cases will be handled by ashr_7
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(7~15) n - 7 7 ((16 - (n - 7) + n) % 16) ashr_7
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 7, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_7):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_7_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 7: the tail of the
   current chunk joined with the head of the next one.  */
palignr $7, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $7, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_7_use_ssse3)
/*
* The following cases will be handled by ashr_6
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(6~15) n - 6 6 ((16 - (n - 6) + n) % 16) ashr_6
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 6, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_6):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_6_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 6: the tail of the
   current chunk joined with the head of the next one.  */
palignr $6, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $6, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_6_use_ssse3)
/*
* The following cases will be handled by ashr_5
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(5~15) n - 5 5 ((16 - (n - 5) + n) % 16) ashr_5
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 5, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_5):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_5_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 5: the tail of the
   current chunk joined with the head of the next one.  */
palignr $5, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $5, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_5_use_ssse3)
/*
*
* The following cases will be handled by ashr_4
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(4~15) n - 4 4 ((16 - (n - 4) + n) % 16) ashr_4
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 4, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_4):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_4_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 4: the tail of the
   current chunk joined with the head of the next one.  */
palignr $4, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $4, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_4_use_ssse3)
/*
*
* The following cases will be handled by ashr_3
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(3~15) n - 3 3 ((16 - (n - 3) + n) % 16) ashr_3
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 3, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_3):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_3_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 3: the tail of the
   current chunk joined with the head of the next one.  */
palignr $3, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $3, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_3_use_ssse3)
/*
*
* The following cases will be handled by ashr_2
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(2~15) n - 2 2 ((16 - (n - 2) + n) % 16) ashr_2
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 2, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_2):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_2_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 2: the tail of the
   current chunk joined with the head of the next one.  */
palignr $2, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $2, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_2_use_ssse3)
/*
*
* The following cases will be handled by ashr_1
* rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case
* n(1~15) n - 1 1 ((16 - (n - 1) + n) % 16) ashr_1
*
* Based on the above, there is no null byte from (%r9 + rsi) to the end
* of that 16-byte chunk.
* Presumably entered through unaligned_table with %rsi and %rdi 16-byte
* aligned, %r9 = 1, %r10 = 16 - %r9 and %xmm0 all-zero -- confirm against
* the dispatch code.
*/
.p2align 4
LABEL(ashr_1):
xor %ecx, %ecx /* rcx = byte index shared by source and destination */
#ifdef USE_AS_STRNCPY
/* Leave if the remaining count ends inside the current source chunk.  */
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
.p2align 4
LABEL(ashr_1_use_ssse3):
/* Step 1 of the 2x unrolled loop: search the next aligned chunk for NUL.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
/* xmm3 = 16 source bytes starting at rsi + rcx + 1: the tail of the
   current chunk joined with the head of the next one.  */
palignr $1, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
/* Step 2: same as step 1.  */
movdqa 16(%rsi, %rcx), %xmm3
pcmpeqb %xmm3, %xmm0
pmovmskb %xmm0, %edx
test %edx, %edx
jnz LABEL(unaligned_exit)
#ifdef USE_AS_STRNCPY
sub $16, %r8
jbe LABEL(strncpy_truncation_unaligned)
#endif
palignr $1, (%rsi, %rcx), %xmm3
movdqa %xmm3, (%rdi, %rcx)
add $16, %rcx
#ifdef USE_AS_STRNCPY
cmp %r10, %r8
jbe LABEL(unaligned_exit)
#endif
jmp LABEL(ashr_1_use_ssse3)
/* Common exit path.  The labels below form a fall-through chain; each
   entry point skips the fix-ups it does not need.  On arrival %edx holds
   the NUL byte mask of the last search and %rcx the byte index reached.
   The chain ends by jumping to the tail_N routine selected from
   tail_table.  */
.p2align 4
LABEL(less32bytes):
xor %ecx, %ecx /* nothing copied yet: byte index 0 */
LABEL(unaligned_exit):
add %r9, %rsi /* r9 stores original offset of rsi*/
/* The mask in edx describes the aligned chunk after the current one, so
   shift it left by r10 (= 16 - source offset) to make the bit index
   relative to rsi + r9.  %cl is needed for the shift count, so rcx is
   parked in r9 meanwhile.  */
mov %rcx, %r9
mov %r10, %rcx
shl %cl, %edx /* after shl, calculate the exact number to be filled*/
mov %r9, %rcx
.p2align 4
LABEL(aligned_exit):
add %rcx, %rdi /*locate exact address for rdi */
LABEL(less16bytes):
add %rcx, %rsi /*locate exact address for rsi */
LABEL(aligned_16bytes):
#ifdef USE_AS_STRNCPY
/* When 32 or fewer bytes remain, also set bit (r8 - 1) in the mask so
   that the bsf below stops at the count limit if that comes before the
   first NUL.  */
mov $1, %r9d
lea -1(%r8), %rcx
shl %cl, %r9d
cmp $32, %r8
ja LABEL(strncpy_tail)
or %r9d, %edx
LABEL(strncpy_tail):
#endif
/* NOTE(review): bsf leaves %rcx undefined when %rdx is zero, so every
   path into here has to arrive with at least one mask bit set.  */
bsf %rdx, %rcx /*If a least significant 1 bit in %rdx is found, its bit index is stored in %rcx*/
/* tail_table holds 32-bit offsets relative to the table itself.  */
lea LABEL(tail_table)(%rip), %r11
movslq (%r11, %rcx,4), %rcx
lea (%r11, %rcx), %rcx
jmp *%rcx
#ifdef USE_AS_STRNCPY
/* Count-limited exits: reached when a "sub $16, %r8" in a copy loop found
   16 or fewer bytes left.  Like the common exit path, the labels are a
   fall-through chain.  */
.p2align 4
LABEL(less32bytes_strncpy_truncation):
xor %ecx, %ecx /* nothing copied yet: byte index 0 */
LABEL(strncpy_truncation_unaligned):
add %r9, %rsi /* add the source offset back onto the aligned base */
LABEL(strncpy_truncation_aligned):
add %rcx, %rdi
add %rcx, %rsi
add $16, %r8 /* undo the subtraction: r8 = bytes still to copy (1..16) */
lea -1(%r8), %rcx /* tail_table index = bytes to copy - 1 */
lea LABEL(tail_table)(%rip), %r11
movslq (%r11, %rcx,4), %rcx
lea (%r11, %rcx), %rcx
jmp *%rcx
/* Count of zero: return the destination pointer without touching memory.  */
.p2align 4
LABEL(strncpy_exitz):
mov %rdi, %rax
ret
#endif
#ifdef USE_AS_STRNCPY
/*
 * strncpy_fill_tail: zero-fill what is left of the destination after the
 * string has been copied.
 * In:   %rdi = start of the tail just written, %cl = bytes written there,
 *       %r8 = bytes still to fill (non-zero), %rax = return value so far.
 * Out:  %rax = return value (adjusted below for USE_AS_STPCPY).
 * Uses "rep stosq" for whole quadwords and a byte loop for the rest.
 */
.p2align 4
LABEL(strncpy_fill_tail):
mov %rax, %rdx /* keep the return value; rax is needed as the fill byte */
movzx %cl, %rax
mov %r8, %rcx
add %rax, %rdi /* rdi = first byte to fill */
xor %eax, %eax
/* Quadword count.  The shift has to be 64-bit: with %ecx the upper half
   of the count was discarded and fills of 4 GiB or more were cut short.  */
shr $3, %rcx
jz LABEL(strncpy_fill_less_8)
rep stosq
LABEL(strncpy_fill_less_8):
mov %r8, %rcx
and $7, %ecx /* 0..7 trailing bytes */
jz LABEL(strncpy_fill_return)
LABEL(strncpy_fill_less_7):
sub $1, %ecx
mov %al, (%rdi, %rcx) /* mov leaves the flags from the sub intact */
jnz LABEL(strncpy_fill_less_7)
LABEL(strncpy_fill_return):
#ifdef USE_AS_STPCPY
/* rdx points at the last byte copied; advance it by one unless that byte
   is the NUL (cmpb sets the carry only when the byte is 0).  */
cmpb $1, (%rdx)
sbb $-1, %rdx
#endif
mov %rdx, %rax
ret
#endif
/* tail_0: copy the final 1 byte from (%rsi) to (%rdi).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_0):
mov (%rsi), %cl
mov %cl, (%rdi)
#ifdef USE_AS_STPCPY
mov %rdi, %rax
#endif
#ifdef USE_AS_STRNCPY
mov $1, %cl
sub $1, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_1: copy the final 2 bytes from (%rsi) to (%rdi) with one 16-bit
   move.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_1):
mov (%rsi), %cx
mov %cx, (%rdi)
#ifdef USE_AS_STPCPY
lea 1(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $2, %cl
sub $2, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_2: copy the final 3 bytes from (%rsi) to (%rdi) with two
   overlapping 16-bit moves (offsets 0 and 1).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_2):
mov (%rsi), %cx
mov %cx, (%rdi)
mov 1(%rsi), %cx
mov %cx, 1(%rdi)
#ifdef USE_AS_STPCPY
lea 2(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $3, %cl
sub $3, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_3: copy the final 4 bytes from (%rsi) to (%rdi) with one 32-bit
   move.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_3):
mov (%rsi), %ecx
mov %ecx, (%rdi)
#ifdef USE_AS_STPCPY
lea 3(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $4, %cl
sub $4, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_4: copy the final 5 bytes from (%rsi) to (%rdi) with two
   overlapping 32-bit moves (offsets 0 and 1).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_4):
mov (%rsi), %ecx
mov %ecx, (%rdi)
mov 1(%rsi), %edx
mov %edx, 1(%rdi)
#ifdef USE_AS_STPCPY
lea 4(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $5, %cl
sub $5, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_5: copy the final 6 bytes from (%rsi) to (%rdi) with two
   overlapping 32-bit moves (offsets 0 and 2).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_5):
mov (%rsi), %ecx
mov %ecx, (%rdi)
mov 2(%rsi), %edx
mov %edx, 2(%rdi)
#ifdef USE_AS_STPCPY
lea 5(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $6, %cl
sub $6, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_6: copy the final 7 bytes from (%rsi) to (%rdi) with two
   overlapping 32-bit moves (offsets 0 and 3).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_6):
mov (%rsi), %ecx
mov %ecx, (%rdi)
mov 3(%rsi), %edx
mov %edx,3(%rdi)
#ifdef USE_AS_STPCPY
lea 6(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $7, %cl
sub $7, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_7: copy the final 8 bytes from (%rsi) to (%rdi) with one 64-bit
   move.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_7):
mov (%rsi), %rcx
mov %rcx, (%rdi)
#ifdef USE_AS_STPCPY
lea 7(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $8, %cl
sub $8, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_8: copy the final 9 bytes from (%rsi) to (%rdi): a 64-bit move at
   offset 0 and an overlapping 32-bit move at offset 5.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_8):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 5(%rsi), %edx
mov %edx, 5(%rdi)
#ifdef USE_AS_STPCPY
lea 8(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $9, %cl
sub $9, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_9: copy the final 10 bytes from (%rsi) to (%rdi): a 64-bit move at
   offset 0 and an overlapping 32-bit move at offset 6.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_9):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 6(%rsi), %edx
mov %edx, 6(%rdi)
#ifdef USE_AS_STPCPY
lea 9(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $10, %cl
sub $10, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_10: copy the final 11 bytes from (%rsi) to (%rdi): a 64-bit move
   at offset 0 and an overlapping 32-bit move at offset 7.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_10):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 7(%rsi), %edx
mov %edx, 7(%rdi)
#ifdef USE_AS_STPCPY
lea 10(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $11, %cl
sub $11, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_11: copy the final 12 bytes from (%rsi) to (%rdi): a 64-bit move
   at offset 0 and a 32-bit move at offset 8.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_11):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %edx
mov %edx, 8(%rdi)
#ifdef USE_AS_STPCPY
lea 11(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $12, %cl
sub $12, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_12: copy the final 13 bytes from (%rsi) to (%rdi) with two
   overlapping 64-bit moves (offsets 0 and 5).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_12):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 5(%rsi), %rcx
mov %rcx, 5(%rdi)
#ifdef USE_AS_STPCPY
lea 12(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $13, %cl
sub $13, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_13: copy the final 14 bytes from (%rsi) to (%rdi) with two
   overlapping 64-bit moves (offsets 0 and 6).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_13):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 6(%rsi), %rcx
mov %rcx, 6(%rdi)
#ifdef USE_AS_STPCPY
lea 13(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $14, %cl
sub $14, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_14: copy the final 15 bytes from (%rsi) to (%rdi) with two
   overlapping 64-bit moves (offsets 0 and 7).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_14):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 7(%rsi), %rcx
mov %rcx, 7(%rdi)
#ifdef USE_AS_STPCPY
lea 14(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $15, %cl
sub $15, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_15: copy the final 16 bytes from (%rsi) to (%rdi) with two 64-bit
   moves (offsets 0 and 8).
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.
   Aligned to 16 bytes like every other tail_N entry point.  */
.p2align 4
LABEL(tail_15):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
#ifdef USE_AS_STPCPY
lea 15(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $16, %cl
sub $16, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_16: copy the final 17 bytes from (%rsi) to (%rdi): 64-bit moves at
   offsets 0 and 8 plus one byte at offset 16.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_16):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %cl
mov %cl, 16(%rdi)
#ifdef USE_AS_STPCPY
lea 16(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $17, %cl
sub $17, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_17: copy the final 18 bytes from (%rsi) to (%rdi): 64-bit moves at
   offsets 0 and 8 plus a 16-bit move at offset 16.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_17):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %cx
mov %cx, 16(%rdi)
#ifdef USE_AS_STPCPY
lea 17(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $18, %cl
sub $18, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_18: copy the final 19 bytes from (%rsi) to (%rdi): 64-bit moves at
   offsets 0 and 8 plus an overlapping 32-bit move at offset 15.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_18):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 15(%rsi), %ecx
mov %ecx,15(%rdi)
#ifdef USE_AS_STPCPY
lea 18(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $19, %cl
sub $19, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_19: copy the final 20 bytes from (%rsi) to (%rdi): 64-bit moves at
   offsets 0 and 8 plus a 32-bit move at offset 16.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_19):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %ecx
mov %ecx, 16(%rdi)
#ifdef USE_AS_STPCPY
lea 19(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $20, %cl
sub $20, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_20: copy the final 21 bytes from (%rsi) to (%rdi): 64-bit moves at
   offsets 0 and 8 plus an overlapping 64-bit move at offset 13.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_20):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 13(%rsi), %rcx
mov %rcx, 13(%rdi)
#ifdef USE_AS_STPCPY
lea 20(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $21, %cl
sub $21, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_21: copy the final 22 bytes from (%rsi) to (%rdi): 64-bit moves at
   offsets 0 and 8 plus an overlapping 64-bit move at offset 14.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_21):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 14(%rsi), %rcx
mov %rcx, 14(%rdi)
#ifdef USE_AS_STPCPY
lea 21(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $22, %cl
sub $22, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_22: copy the final 23 bytes from (%rsi) to (%rdi): 64-bit moves at
   offsets 0 and 8 plus an overlapping 64-bit move at offset 15.
   USE_AS_STPCPY: %rax = address of the last byte written.
   USE_AS_STRNCPY: %cl = bytes written here, %r8 -= that many; a non-zero
   remainder is zero-filled by strncpy_fill_tail.  With both defined, %rax
   is advanced by one when the last byte written is not NUL.  */
.p2align 4
LABEL(tail_22):
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 15(%rsi), %rcx
mov %rcx, 15(%rdi)
#ifdef USE_AS_STPCPY
lea 22(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
mov $23, %cl
sub $23, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_23: copy the final 24 bytes (offsets 0..23) from (%rsi) to (%rdi)
   as three 64-bit moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 23; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_23):
/* Bytes 0..7 and 8..15.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
/* Bytes 16..23; %rcx is reused now that its first value is stored.  */
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 23, the address of the last byte just copied.  */
lea 23(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (24); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $24, %cl
sub $24, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_24: copy the final 25 bytes (offsets 0..24) from (%rsi) to (%rdi)
   with fixed-size moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 24; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_24):
/* Bytes 0..7, 8..15 and 16..23.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
/* Bytes 21..24 with one 32-bit move; it overlaps the previous store by
   three bytes so that the tail is covered without smaller moves.  */
mov 21(%rsi), %edx
mov %edx, 21(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 24, the address of the last byte just copied.  */
lea 24(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (25); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $25, %cl
sub $25, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_25: copy the final 26 bytes (offsets 0..25) from (%rsi) to (%rdi)
   with fixed-size moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 25; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_25):
/* Bytes 0..7, 8..15 and 16..23.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
/* Bytes 22..25 with one 32-bit move; it overlaps the previous store by
   two bytes so that the tail is covered without smaller moves.  */
mov 22(%rsi), %edx
mov %edx, 22(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 25, the address of the last byte just copied.  */
lea 25(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (26); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $26, %cl
sub $26, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_26: copy the final 27 bytes (offsets 0..26) from (%rsi) to (%rdi)
   with fixed-size moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 26; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_26):
/* Bytes 0..7, 8..15 and 16..23.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
/* Bytes 23..26 with one 32-bit move; it overlaps the previous store by
   one byte so that the tail is covered without smaller moves.  */
mov 23(%rsi), %edx
mov %edx, 23(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 26, the address of the last byte just copied.  */
lea 26(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (27); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $27, %cl
sub $27, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_27: copy the final 28 bytes (offsets 0..27) from (%rsi) to (%rdi)
   with fixed-size moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 27; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_27):
/* Bytes 0..7, 8..15 and 16..23.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
/* Bytes 24..27 with one 32-bit move (no overlap needed for this size).  */
mov 24(%rsi), %edx
mov %edx, 24(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 27, the address of the last byte just copied.  */
lea 27(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (28); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $28, %cl
sub $28, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_28: copy the final 29 bytes (offsets 0..28) from (%rsi) to (%rdi)
   with fixed-size moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 28; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_28):
/* Bytes 0..7, 8..15 and 16..23.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
/* Bytes 21..28 with one 64-bit move; it overlaps the previous store by
   three bytes so that the tail is covered without smaller moves.  */
mov 21(%rsi), %rdx
mov %rdx, 21(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 28, the address of the last byte just copied.  */
lea 28(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (29); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $29, %cl
sub $29, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_29: copy the final 30 bytes (offsets 0..29) from (%rsi) to (%rdi)
   with fixed-size moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 29; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_29):
/* Bytes 0..7, 8..15 and 16..23.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
/* Bytes 22..29 with one 64-bit move; it overlaps the previous store by
   two bytes so that the tail is covered without smaller moves.  */
mov 22(%rsi), %rdx
mov %rdx, 22(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 29, the address of the last byte just copied.  */
lea 29(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (30); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $30, %cl
sub $30, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_30: copy the final 31 bytes (offsets 0..30) from (%rsi) to (%rdi)
   with fixed-size moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 30; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_30):
/* Bytes 0..7, 8..15 and 16..23.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
/* Bytes 23..30 with one 64-bit move; it overlaps the previous store by
   one byte so that the tail is covered without smaller moves.  */
mov 23(%rsi), %rdx
mov %rdx, 23(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 30, the address of the last byte just copied.  */
lea 30(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (31); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $31, %cl
sub $31, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* tail_31: copy the final 32 bytes (offsets 0..31) from (%rsi) to (%rdi)
   as four 64-bit moves.  Presumably reached through tail_table when the
   terminating NUL is at offset 31; the dispatch code is outside this
   chunk.  */
.p2align 4
LABEL(tail_31):
/* Bytes 0..7, 8..15 and 16..23.  */
mov (%rsi), %rcx
mov %rcx, (%rdi)
mov 8(%rsi), %rdx
mov %rdx, 8(%rdi)
mov 16(%rsi), %rcx
mov %rcx, 16(%rdi)
/* Bytes 24..31 (no overlap needed for this size).  */
mov 24(%rsi), %rdx
mov %rdx, 24(%rdi)
#ifdef USE_AS_STPCPY
/* %rax = destination + 31, the address of the last byte just copied.  */
lea 31(%rdi), %rax
#endif
#ifdef USE_AS_STRNCPY
/* %cl = number of bytes written (32); the same amount is subtracted from
   %r8.  NOTE(review): %r8 looks like the remaining length limit -- confirm
   against the entry code.  A non-zero result goes to strncpy_fill_tail.  */
mov $32, %cl
sub $32, %r8
jnz LABEL(strncpy_fill_tail)
#ifdef USE_AS_STPCPY
/* cmpb sets CF only when the byte at (%rax) is zero; sbb then leaves %rax
   unchanged in that case and adds one to it otherwise.  */
cmpb $1, (%rax)
sbb $-1, %rax
#endif
#endif
ret
/* End of the SSSE3 implementation: close the CFI region and record the
   size of STRCPY_SSSE3 for the symbol table.  */
cfi_endproc
.size STRCPY_SSSE3, .-STRCPY_SSSE3
/* Dispatch table for the tail_0 .. tail_31 copy routines.  Entry N holds
   the 32-bit offset of tail_N relative to the start of the table, so the
   table itself contains no absolute addresses.  The code that indexes it
   is outside this chunk.

   The alignment directive is issued after the section switch so that it
   aligns the table in .rodata.ssse3; placed before the switch it would
   only pad the end of the preceding section.  */
.section .rodata.ssse3,"a",@progbits
.p2align 4
LABEL(tail_table):
.int LABEL(tail_0) - LABEL(tail_table)
.int LABEL(tail_1) - LABEL(tail_table)
.int LABEL(tail_2) - LABEL(tail_table)
.int LABEL(tail_3) - LABEL(tail_table)
.int LABEL(tail_4) - LABEL(tail_table)
.int LABEL(tail_5) - LABEL(tail_table)
.int LABEL(tail_6) - LABEL(tail_table)
.int LABEL(tail_7) - LABEL(tail_table)
.int LABEL(tail_8) - LABEL(tail_table)
.int LABEL(tail_9) - LABEL(tail_table)
.int LABEL(tail_10) - LABEL(tail_table)
.int LABEL(tail_11) - LABEL(tail_table)
.int LABEL(tail_12) - LABEL(tail_table)
.int LABEL(tail_13) - LABEL(tail_table)
.int LABEL(tail_14) - LABEL(tail_table)
.int LABEL(tail_15) - LABEL(tail_table)
.int LABEL(tail_16) - LABEL(tail_table)
.int LABEL(tail_17) - LABEL(tail_table)
.int LABEL(tail_18) - LABEL(tail_table)
.int LABEL(tail_19) - LABEL(tail_table)
.int LABEL(tail_20) - LABEL(tail_table)
.int LABEL(tail_21) - LABEL(tail_table)
.int LABEL(tail_22) - LABEL(tail_table)
.int LABEL(tail_23) - LABEL(tail_table)
.int LABEL(tail_24) - LABEL(tail_table)
.int LABEL(tail_25) - LABEL(tail_table)
.int LABEL(tail_26) - LABEL(tail_table)
.int LABEL(tail_27) - LABEL(tail_table)
.int LABEL(tail_28) - LABEL(tail_table)
.int LABEL(tail_29) - LABEL(tail_table)
.int LABEL(tail_30) - LABEL(tail_table)
.int LABEL(tail_31) - LABEL(tail_table)
/* Dispatch table for the ashr_0 .. ashr_15 code paths.  Entry N holds the
   32-bit offset of ashr_N relative to the start of the table, 16 entries
   in all, aligned to 16 bytes.  NOTE(review): presumably indexed by the
   relative misalignment (0..15) of source and destination -- the code
   that reads the table is outside this chunk, confirm there.  */
.p2align 4
LABEL(unaligned_table):
.int LABEL(ashr_0) - LABEL(unaligned_table)
.int LABEL(ashr_1) - LABEL(unaligned_table)
.int LABEL(ashr_2) - LABEL(unaligned_table)
.int LABEL(ashr_3) - LABEL(unaligned_table)
.int LABEL(ashr_4) - LABEL(unaligned_table)
.int LABEL(ashr_5) - LABEL(unaligned_table)
.int LABEL(ashr_6) - LABEL(unaligned_table)
.int LABEL(ashr_7) - LABEL(unaligned_table)
.int LABEL(ashr_8) - LABEL(unaligned_table)
.int LABEL(ashr_9) - LABEL(unaligned_table)
.int LABEL(ashr_10) - LABEL(unaligned_table)
.int LABEL(ashr_11) - LABEL(unaligned_table)
.int LABEL(ashr_12) - LABEL(unaligned_table)
.int LABEL(ashr_13) - LABEL(unaligned_table)
.int LABEL(ashr_14) - LABEL(unaligned_table)
.int LABEL(ashr_15) - LABEL(unaligned_table)
/* Redefine the function-framing macros before the generic implementation
   is included below.  Both ENTRY and END ignore their NAME argument and
   always use STRCPY_SSE2, so whatever function the included file defines
   is emitted under that symbol.  ENTRY aligns the entry point to 16 bytes
   and does not emit a .globl directive.  */
# undef ENTRY
# define ENTRY(name) \
.type STRCPY_SSE2, @function; \
.align 16; \
STRCPY_SSE2: cfi_startproc; \
CALL_MCOUNT
# undef END
# define END(name) \
cfi_endproc; .size STRCPY_SSE2, .-STRCPY_SSE2
# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strcpy calls through a PLT.
The speedup we get from using SSSE3 instructions is likely eaten away
by the indirect call in the PLT.  The hidden alias __GI_STRCPY is
therefore bound directly to the SSE2 version.  */
# define libc_hidden_builtin_def(name) \
.globl __GI_STRCPY; __GI_STRCPY = STRCPY_SSE2
/* Same direct binding for the alias __GI___STRCPY.  At the top of this
   file that name is only mapped to a real symbol for the stpcpy
   (non-strncpy) build.  */
# undef libc_hidden_def
# define libc_hidden_def(name) \
.globl __GI___STRCPY; __GI___STRCPY = STRCPY_SSE2
/* Closes a conditional opened before this chunk (presumably the
   NOT_IN_libc guard at the top of the file -- TODO confirm).  */
#endif
/* Pull in the generic implementation, except for the strncpy variants.
   NOTE(review): with USE_AS_STRNCPY the SSE2 fallback has to come from
   another source file -- confirm.  */
#ifndef USE_AS_STRNCPY
#include "../strcpy.S"
#endif