4f41c682f3
Replace with !IS_IN (libc). This completes the transition from the IS_IN/NOT_IN macros to the IN_MODULE macro set. The generated code is unchanged on x86_64. * stdlib/isomac.c (fmt): Replace NOT_IN_libc with IN_MODULE. (get_null_defines): Adjust. * sunrpc/Makefile: Adjust comment. * Makerules (CPPFLAGS-nonlib): Remove NOT_IN_libc. * elf/Makefile (CPPFLAGS-sotruss-lib): Likewise. (CFLAGS-interp.c): Likewise. (CFLAGS-ldconfig.c): Likewise. (CPPFLAGS-.os): Likewise. * elf/rtld-Rules (rtld-CPPFLAGS): Likewise. * extra-lib.mk (CPPFLAGS-$(lib)): Likewise. * extra-modules.mk (extra-modules.mk): Likewise. * iconv/Makefile (CPPFLAGS-iconvprogs): Likewise. * locale/Makefile (CPPFLAGS-locale_programs): Likewise. * malloc/Makefile (CPPFLAGS-memusagestat): Likewise. * nscd/Makefile (CPPFLAGS-nscd): Likewise. * nss/Makefile (CPPFLAGS-nss_test1): Likewise. * stdlib/Makefile (CFLAGS-tst-putenvmod.c): Likewise. * sysdeps/gnu/Makefile ($(objpfx)errlist-compat.c): Likewise. * sysdeps/unix/sysv/linux/Makefile (CPPFLAGS-lddlibc4): Likewise. * iconvdata/Makefile (CPPFLAGS): Likewise. (cpp-srcs-left): Add libof for all iconvdata routines. * bits/stdio-lock.h: Replace NOT_IN_libc with IS_IN. * include/assert.h: Likewise. * include/ctype.h: Likewise. * include/errno.h: Likewise. * include/libc-symbols.h: Likewise. * include/math.h: Likewise. * include/netdb.h: Likewise. * include/resolv.h: Likewise. * include/stdio.h: Likewise. * include/stdlib.h: Likewise. * include/string.h: Likewise. * include/sys/stat.h: Likewise. * include/wctype.h: Likewise. * intl/l10nflist.c: Likewise. * libidn/idn-stub.c: Likewise. * libio/libioP.h: Likewise. * nptl/libc_multiple_threads.c: Likewise. * nptl/pthreadP.h: Likewise. * posix/regex_internal.h: Likewise. * resolv/res_hconf.c: Likewise. * sysdeps/arm/armv7/multiarch/memcpy.S: Likewise. * sysdeps/arm/memmove.S: Likewise. * sysdeps/arm/sysdep.h: Likewise. * sysdeps/generic/_itoa.h: Likewise. * sysdeps/generic/symbol-hacks.h: Likewise. 
* sysdeps/gnu/errlist.awk: Likewise. * sysdeps/gnu/errlist.c: Likewise. * sysdeps/i386/i586/memcpy.S: Likewise. * sysdeps/i386/i586/memset.S: Likewise. * sysdeps/i386/i686/memcpy.S: Likewise. * sysdeps/i386/i686/memmove.S: Likewise. * sysdeps/i386/i686/mempcpy.S: Likewise. * sysdeps/i386/i686/memset.S: Likewise. * sysdeps/i386/i686/multiarch/bcopy.S: Likewise. * sysdeps/i386/i686/multiarch/bzero.S: Likewise. * sysdeps/i386/i686/multiarch/memchr-sse2-bsf.S: Likewise. * sysdeps/i386/i686/multiarch/memchr-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/memchr.S: Likewise. * sysdeps/i386/i686/multiarch/memcmp-sse4.S: Likewise. * sysdeps/i386/i686/multiarch/memcmp-ssse3.S: Likewise. * sysdeps/i386/i686/multiarch/memcmp.S: Likewise. * sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S: Likewise. * sysdeps/i386/i686/multiarch/memcpy-ssse3.S: Likewise. * sysdeps/i386/i686/multiarch/memcpy.S: Likewise. * sysdeps/i386/i686/multiarch/memcpy_chk.S: Likewise. * sysdeps/i386/i686/multiarch/memmove.S: Likewise. * sysdeps/i386/i686/multiarch/memmove_chk.S: Likewise. * sysdeps/i386/i686/multiarch/mempcpy.S: Likewise. * sysdeps/i386/i686/multiarch/mempcpy_chk.S: Likewise. * sysdeps/i386/i686/multiarch/memrchr-c.c: Likewise. * sysdeps/i386/i686/multiarch/memrchr-sse2-bsf.S: Likewise. * sysdeps/i386/i686/multiarch/memrchr-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/memrchr.S: Likewise. * sysdeps/i386/i686/multiarch/memset-sse2-rep.S: Likewise. * sysdeps/i386/i686/multiarch/memset-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/memset.S: Likewise. * sysdeps/i386/i686/multiarch/memset_chk.S: Likewise. * sysdeps/i386/i686/multiarch/rawmemchr.S: Likewise. * sysdeps/i386/i686/multiarch/strcat-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/strcat-ssse3.S: Likewise. * sysdeps/i386/i686/multiarch/strcat.S: Likewise. * sysdeps/i386/i686/multiarch/strchr-sse2-bsf.S: Likewise. * sysdeps/i386/i686/multiarch/strchr-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/strchr.S: Likewise. 
* sysdeps/i386/i686/multiarch/strcmp-sse4.S: Likewise. * sysdeps/i386/i686/multiarch/strcmp-ssse3.S: Likewise. * sysdeps/i386/i686/multiarch/strcmp.S: Likewise. * sysdeps/i386/i686/multiarch/strcpy-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/strcpy-ssse3.S: Likewise. * sysdeps/i386/i686/multiarch/strcpy.S: Likewise. * sysdeps/i386/i686/multiarch/strcspn.S: Likewise. * sysdeps/i386/i686/multiarch/strlen-sse2-bsf.S: Likewise. * sysdeps/i386/i686/multiarch/strlen-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/strlen.S: Likewise. * sysdeps/i386/i686/multiarch/strnlen.S: Likewise. * sysdeps/i386/i686/multiarch/strrchr-sse2-bsf.S: Likewise. * sysdeps/i386/i686/multiarch/strrchr-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/strrchr.S: Likewise. * sysdeps/i386/i686/multiarch/strspn.S: Likewise. * sysdeps/i386/i686/multiarch/wcschr-c.c: Likewise. * sysdeps/i386/i686/multiarch/wcschr-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/wcschr.S: Likewise. * sysdeps/i386/i686/multiarch/wcscmp-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/wcscmp.S: Likewise. * sysdeps/i386/i686/multiarch/wcscpy-c.c: Likewise. * sysdeps/i386/i686/multiarch/wcscpy-ssse3.S: Likewise. * sysdeps/i386/i686/multiarch/wcscpy.S: Likewise. * sysdeps/i386/i686/multiarch/wcslen-c.c: Likewise. * sysdeps/i386/i686/multiarch/wcslen-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/wcslen.S: Likewise. * sysdeps/i386/i686/multiarch/wcsrchr-c.c: Likewise. * sysdeps/i386/i686/multiarch/wcsrchr-sse2.S: Likewise. * sysdeps/i386/i686/multiarch/wcsrchr.S: Likewise. * sysdeps/i386/i686/multiarch/wmemcmp-c.c: Likewise. * sysdeps/i386/i686/multiarch/wmemcmp.S: Likewise. * sysdeps/ia64/fpu/libm-symbols.h: Likewise. * sysdeps/nptl/bits/libc-lock.h: Likewise. * sysdeps/nptl/bits/libc-lockP.h: Likewise. * sysdeps/nptl/bits/stdio-lock.h: Likewise. * sysdeps/posix/closedir.c: Likewise. * sysdeps/posix/opendir.c: Likewise. * sysdeps/posix/readdir.c: Likewise. * sysdeps/posix/rewinddir.c: Likewise. 
* sysdeps/powerpc/novmx-sigjmp.c: Likewise. * sysdeps/powerpc/powerpc32/__longjmp.S: Likewise. * sysdeps/powerpc/powerpc32/bsd-_setjmp.S: Likewise. * sysdeps/powerpc/powerpc32/fpu/__longjmp.S: Likewise. * sysdeps/powerpc/powerpc32/fpu/setjmp.S: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/bzero.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memchr.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memcmp-ppc32.S: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memcmp.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memcpy-ppc32.S: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memcpy.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memmove.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/mempcpy.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memrchr-ppc32.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memrchr.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memset-ppc32.S: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/memset.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/rawmemchr.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strcasecmp.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strcasecmp_l.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strchr.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strchrnul.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strlen-ppc32.S: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strlen.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strncase.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strncase_l.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strncmp-ppc32.S: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strncmp.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/strnlen.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/wcschr-ppc32.c: Likewise. 
* sysdeps/powerpc/powerpc32/power4/multiarch/wcschr.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/wcscpy-ppc32.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/wcscpy.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/wcsrchr-ppc32.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/wcsrchr.c: Likewise. * sysdeps/powerpc/powerpc32/power4/multiarch/wordcopy.c: Likewise. * sysdeps/powerpc/powerpc32/power6/memset.S: Likewise. * sysdeps/powerpc/powerpc32/setjmp.S: Likewise. * sysdeps/powerpc/powerpc64/__longjmp.S: Likewise. * sysdeps/powerpc/powerpc64/multiarch/bzero.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memchr.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memcmp-ppc64.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memcmp.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memcpy-ppc64.S: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memcpy.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memmove-ppc64.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memmove.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/mempcpy.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memrchr.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memset-ppc64.S: Likewise. * sysdeps/powerpc/powerpc64/multiarch/memset.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/stpcpy-ppc64.S: Likewise. * sysdeps/powerpc/powerpc64/multiarch/stpcpy.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/stpncpy.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strcat.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strchr.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strchrnul.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strcmp-ppc64.S: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strcmp.c: Likewise. 
* sysdeps/powerpc/powerpc64/multiarch/strcpy-ppc64.S: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strcpy.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strcspn.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strlen-ppc64.S: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strlen.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strncase.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strncase_l.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strncat.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strncmp-ppc64.S: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strncmp.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strncpy-ppc64.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strncpy.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strnlen.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strpbrk.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strrchr-ppc64.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strrchr.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strspn-ppc64.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/strspn.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/wcschr.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/wcscpy.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/wcsrchr.c: Likewise. * sysdeps/powerpc/powerpc64/multiarch/wordcopy.c: Likewise. * sysdeps/powerpc/powerpc64/setjmp.S: Likewise. * sysdeps/s390/s390-32/multiarch/ifunc-resolve.c: Likewise. * sysdeps/s390/s390-32/multiarch/memcmp.S: Likewise. * sysdeps/s390/s390-32/multiarch/memcpy.S: Likewise. * sysdeps/s390/s390-32/multiarch/memset.S: Likewise. * sysdeps/s390/s390-64/multiarch/ifunc-resolve.c: Likewise. * sysdeps/s390/s390-64/multiarch/memcmp.S: Likewise. * sysdeps/s390/s390-64/multiarch/memcpy.S: Likewise. * sysdeps/s390/s390-64/multiarch/memset.S: Likewise. * sysdeps/sparc/sparc64/multiarch/memcpy-niagara1.S: Likewise. * sysdeps/sparc/sparc64/multiarch/memcpy-niagara2.S: Likewise. 
* sysdeps/sparc/sparc64/multiarch/memcpy-niagara4.S: Likewise. * sysdeps/sparc/sparc64/multiarch/memcpy-ultra3.S: Likewise. * sysdeps/sparc/sparc64/multiarch/memcpy.S: Likewise. * sysdeps/sparc/sparc64/multiarch/memset-niagara1.S: Likewise. * sysdeps/sparc/sparc64/multiarch/memset-niagara4.S: Likewise. * sysdeps/sparc/sparc64/multiarch/memset.S: Likewise. * sysdeps/unix/alpha/sysdep.S: Likewise. * sysdeps/unix/alpha/sysdep.h: Likewise. * sysdeps/unix/make-syscalls.sh: Likewise. * sysdeps/unix/sysv/linux/aarch64/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/aarch64/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/alpha/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/alpha/vfork.S: Likewise. * sysdeps/unix/sysv/linux/arm/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/arm/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/getpid.c: Likewise. * sysdeps/unix/sysv/linux/hppa/nptl/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/hppa/nptl/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/i386/i486/lowlevellock.S: Likewise. * sysdeps/unix/sysv/linux/i386/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/i386/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/i386/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/ia64/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/ia64/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/ia64/sysdep.S: Likewise. * sysdeps/unix/sysv/linux/ia64/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/lowlevellock-futex.h: Likewise. * sysdeps/unix/sysv/linux/m68k/bits/m68k-vdso.h: Likewise. * sysdeps/unix/sysv/linux/m68k/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/m68k/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/microblaze/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/microblaze/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/mips/mips64/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/mips/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/not-cancel.h: Likewise. 
* sysdeps/unix/sysv/linux/powerpc/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/powerpc/powerpc32/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/powerpc/powerpc64/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/s390/longjmp_chk.c: Likewise. * sysdeps/unix/sysv/linux/s390/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/s390/s390-32/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/s390/s390-32/sysdep.S: Likewise. * sysdeps/unix/sysv/linux/s390/s390-32/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/s390/s390-32/vfork.S: Likewise. * sysdeps/unix/sysv/linux/s390/s390-64/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/s390/s390-64/sysdep.S: Likewise. * sysdeps/unix/sysv/linux/s390/s390-64/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/s390/s390-64/vfork.S: Likewise. * sysdeps/unix/sysv/linux/sh/lowlevellock.S: Likewise. * sysdeps/unix/sysv/linux/sh/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/sh/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/sh/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/sh/vfork.S: Likewise. * sysdeps/unix/sysv/linux/sparc/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/sparc/sparc32/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/sparc/sparc32/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/sparc/sparc64/brk.S: Likewise. * sysdeps/unix/sysv/linux/sparc/sparc64/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/sparc/sparc64/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/tile/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/tile/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/tile/sysdep.h: Likewise. * sysdeps/unix/sysv/linux/tile/waitpid.S: Likewise. * sysdeps/unix/sysv/linux/x86_64/lowlevellock.S: Likewise. * sysdeps/unix/sysv/linux/x86_64/lowlevellock.h: Likewise. * sysdeps/unix/sysv/linux/x86_64/sysdep-cancel.h: Likewise. * sysdeps/unix/sysv/linux/x86_64/sysdep.h: Likewise. * sysdeps/wordsize-32/symbol-hacks.h: Likewise. * sysdeps/x86_64/memcpy.S: Likewise. 
* sysdeps/x86_64/memmove.c: Likewise. * sysdeps/x86_64/memset.S: Likewise. * sysdeps/x86_64/multiarch/init-arch.h: Likewise. * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. * sysdeps/x86_64/multiarch/memcmp.S: Likewise. * sysdeps/x86_64/multiarch/memcpy-avx-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Likewise. * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. * sysdeps/x86_64/multiarch/memcpy.S: Likewise. * sysdeps/x86_64/multiarch/memcpy_chk.S: Likewise. * sysdeps/x86_64/multiarch/memmove.c: Likewise. * sysdeps/x86_64/multiarch/mempcpy.S: Likewise. * sysdeps/x86_64/multiarch/mempcpy_chk.S: Likewise. * sysdeps/x86_64/multiarch/memset-avx2.S: Likewise. * sysdeps/x86_64/multiarch/memset.S: Likewise. * sysdeps/x86_64/multiarch/memset_chk.S: Likewise. * sysdeps/x86_64/multiarch/strcat-sse2-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/strcat-ssse3.S: Likewise. * sysdeps/x86_64/multiarch/strcat.S: Likewise. * sysdeps/x86_64/multiarch/strchr-sse2-no-bsf.S: Likewise. * sysdeps/x86_64/multiarch/strchr.S: Likewise. * sysdeps/x86_64/multiarch/strcmp-ssse3.S: Likewise. * sysdeps/x86_64/multiarch/strcmp.S: Likewise. * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise. * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. * sysdeps/x86_64/multiarch/strcpy.S: Likewise. * sysdeps/x86_64/multiarch/strcspn.S: Likewise. * sysdeps/x86_64/multiarch/strspn.S: Likewise. * sysdeps/x86_64/multiarch/wcscpy-c.c: Likewise. * sysdeps/x86_64/multiarch/wcscpy-ssse3.S: Likewise. * sysdeps/x86_64/multiarch/wcscpy.S: Likewise. * sysdeps/x86_64/multiarch/wmemcmp-c.c: Likewise. * sysdeps/x86_64/multiarch/wmemcmp.S: Likewise. * sysdeps/x86_64/strcmp.S: Likewise.
3163 lines
68 KiB
ArmAsm
3163 lines
68 KiB
ArmAsm
/* memcpy with SSSE3
|
|
Copyright (C) 2010-2014 Free Software Foundation, Inc.
|
|
Contributed by Intel Corporation.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#if IS_IN (libc) \
|
|
&& (defined SHARED \
|
|
|| defined USE_AS_MEMMOVE \
|
|
|| !defined USE_MULTIARCH)
|
|
|
|
# include <sysdep.h>
|
|
# include "asm-syntax.h"
|
|
|
|
/* Default symbol names for the two entry points assembled from this
   file.  They are only set when MEMCPY has not been defined before
   this point.  Note that MEMCPY_CHK is defaulted together with MEMCPY:
   whoever pre-defines MEMCPY must also define MEMCPY_CHK, unless
   USE_AS_BCOPY is defined (the MEMCPY_CHK entry is then omitted).  */
# ifndef MEMCPY
|
|
# define MEMCPY __memcpy_ssse3
|
|
# define MEMCPY_CHK __memcpy_chk_ssse3
|
|
# endif
|
|
|
|
/* Stack offsets of the three arguments, used as displacements from
   %esp (e.g. LEN(%esp)).  PARMS, defined further down, is the offset
   of the first argument.  With USE_AS_BCOPY the first argument is the
   source and the second the destination; otherwise the destination
   comes first.  The length is the third argument in both cases.

   The expansions are deliberately unparenthesized sums (PARMS+4,
   PARMS+4+4), so they are only meaningful as a displacement in an
   address operand.  */
# ifdef USE_AS_BCOPY
|
|
# define SRC PARMS
|
|
# define DEST SRC+4
|
|
# define LEN DEST+4
|
|
# else
|
|
# define DEST PARMS
|
|
# define SRC DEST+4
|
|
# define LEN SRC+4
|
|
# endif
|
|
|
|
/* Unwind annotation matching a 4-byte push of REG: the CFA offset
   grows by 4 and REG is recorded as saved at offset 0.  This emits no
   machine instruction by itself; cfi_adjust_cfa_offset and
   cfi_rel_offset are not defined in this file (presumably they come
   from <sysdep.h> -- confirm).  */
# define CFI_PUSH(REG) \
|
|
cfi_adjust_cfa_offset (4); \
|
|
cfi_rel_offset (REG, 0)
|
|
|
|
/* Unwind annotation matching a 4-byte pop of REG: the CFA offset
   shrinks by 4 and REG is marked as restored.  */
# define CFI_POP(REG) \
|
|
cfi_adjust_cfa_offset (-4); \
|
|
cfi_restore (REG)
|
|
|
|
/* Push or pop REG with pushl/popl and emit the matching unwind
   annotation in the same step, so that the two cannot get out of
   sync.  */
# define PUSH(REG) pushl REG; CFI_PUSH (REG)
|
|
# define POP(REG) popl REG; CFI_POP (REG)
|
|
|
|
/* Entry/exit sequences and jump-table dispatch, in two flavours.

   SHARED: %ebx is used as a scratch/PIC register, so ENTRANCE pushes
   it and RETURN_END pops it before the ret.  Because of that extra
   push the first argument is at 8(%esp) instead of 4(%esp), hence
   PARMS is 8.  Jump tables hold offsets relative to the table itself
   (JMPTBL (I, B) is I - B).

   Non-SHARED: nothing is pushed on entry, PARMS is 4 and jump tables
   hold absolute addresses (JMPTBL (I, B) is I).  */
# ifdef SHARED
|
|
# define PARMS 8 /* Preserve EBX. */
|
|
# define ENTRANCE PUSH (%ebx);
|
|
# define RETURN_END POP (%ebx); ret
|
|
/* RETURN is RETURN_END followed by CFI_PUSH (%ebx).  The annotation
   after the ret emits no instruction; presumably it re-establishes the
   "EBX is pushed" unwind state for the code assembled after this
   return point -- confirm.  */
# define RETURN RETURN_END; CFI_PUSH (%ebx)
|
|
# define JMPTBL(I, B) I - B
|
|
|
|
/* Load an entry in a jump table into EBX and branch to it. TABLE is a
|
|
jump table with relative offsets. INDEX is a register that contains the
|
|
index into the jump table. SCALE is the scale of INDEX. */
|
|
|
|
/* Clobbers %ebx; INDEX must therefore not be %ebx.  */
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
|
|
/* We first load PC into EBX. */ \
|
|
SETUP_PIC_REG(bx); \
|
|
/* Get the address of the jump table. */ \
|
|
addl $(TABLE - .), %ebx; \
|
|
/* Get the entry and convert the relative offset to the \
|
|
absolute address. */ \
|
|
addl (%ebx, INDEX, SCALE), %ebx; \
|
|
/* We loaded the jump table. Go. */ \
|
|
jmp *%ebx
|
|
# else
|
|
|
|
# define PARMS 4
|
|
# define ENTRANCE
|
|
# define RETURN_END ret
|
|
# define RETURN RETURN_END
|
|
# define JMPTBL(I, B) I
|
|
|
|
/* Branch to an entry in a jump table. TABLE is a jump table with
|
|
absolute offsets. INDEX is a register that contains the index into the
|
|
jump table. SCALE is the scale of INDEX. */
|
|
|
|
# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
|
|
jmp *TABLE(, INDEX, SCALE)
|
|
# endif
|
|
|
|
.section .text.ssse3,"ax",@progbits
|
|
/* Checked entry point; not assembled when USE_AS_BCOPY is defined.

   Nothing has been pushed yet, so 12(%esp) and 16(%esp) are the third
   and fourth stack arguments (presumably the copy length and the size
   of the destination object, as for __memcpy_chk -- confirm against
   the caller).  If the fourth argument is below the third (unsigned
   comparison) control transfers to __chk_fail.

   There is no ret: when the check passes, execution falls through
   END (MEMCPY_CHK) into the code that follows, which is
   ENTRY (MEMCPY).  The two entries must therefore stay adjacent.  */
# if !defined USE_AS_BCOPY
|
|
ENTRY (MEMCPY_CHK)
|
|
movl 12(%esp), %eax
|
|
/* Computes 16(%esp) - %eax; jb is taken when that borrows.  */
cmpl %eax, 16(%esp)
|
|
jb HIDDEN_JUMPTARGET (__chk_fail)
|
|
END (MEMCPY_CHK)
|
|
# endif
|
|
/* Main entry point.  After ENTRANCE the registers are set up as
     %ecx = length, %eax = source, %edx = destination
   and stay in these roles for the rest of the routine.  */
ENTRY (MEMCPY)
|
|
ENTRANCE
|
|
movl LEN(%esp), %ecx
|
|
movl SRC(%esp), %eax
|
|
movl DEST(%esp), %edx
|
|
|
|
/* memmove only: choose the copy direction.
     dest <  src            -> L(copy_forward)
     dest == src            -> L(fwd_write_0bytes) (outside this excerpt)
     dest >  src, len <  32 -> L(bk_write_less32bytes_2) (outside this
                               excerpt)
     dest >  src, len >= 32 -> L(memmove_bwd), which checks for overlap.
   All comparisons are unsigned.  */
# ifdef USE_AS_MEMMOVE
|
|
cmp %eax, %edx
|
|
jb L(copy_forward)
|
|
je L(fwd_write_0bytes)
|
|
cmp $32, %ecx
|
|
jae L(memmove_bwd)
|
|
jmp L(bk_write_less32bytes_2)
|
|
|
|
.p2align 4
|
|
/* dest > src and len >= 32.  %eax temporarily becomes src + len; if
   dest is below that, the regions overlap and the copy must run
   backward.  The reload of %eax sits between the cmp and the jb on
   purpose: movl does not modify the flags.  Otherwise fall through to
   the forward copy.  */
L(memmove_bwd):
|
|
add %ecx, %eax
|
|
cmp %eax, %edx
|
|
movl SRC(%esp), %eax
|
|
jb L(copy_backward)
|
|
|
|
L(copy_forward):
|
|
# endif
|
|
/* Lengths of 48 bytes and more take the SSE path.  */
cmp $48, %ecx
|
|
jae L(48bytesormore)
|
|
|
|
/* Fewer than 48 bytes (the "less32bytes" in the label name does not
   match the threshold used above).  */
L(fwd_write_less32bytes):
|
|
/* memcpy/bcopy only: compare the low bytes of source and destination
   and use the backward table when the source's low byte is smaller.
   NOTE(review): only the low 8 bits of the pointers are compared; the
   rationale is not visible in this excerpt.  */
# ifndef USE_AS_MEMMOVE
|
|
cmp %dl, %al
|
|
jb L(bk_write)
|
|
# endif
|
|
/* Advance both pointers to one past the end of their buffers, then
   dispatch on the byte count.  L(table_48bytes_fwd) is outside this
   excerpt; presumably its entries address the data with negative
   offsets from these end pointers.  */
add %ecx, %edx
|
|
add %ecx, %eax
|
|
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
|
|
# ifndef USE_AS_MEMMOVE
|
|
.p2align 4
|
|
/* Backward small copy: the pointers are left at the start of the
   buffers and the byte count in %ecx selects the table entry.  */
L(bk_write):
|
|
BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
|
|
# endif
|
|
|
|
.p2align 4
|
|
/* Forward copy of 48 bytes or more: deal with the first 16 bytes,
   align the destination to 16 bytes and pick a copy strategy.

   On exit from this block %edi has been pushed (so stack arguments
   are 4 bytes further from %esp), %edx is 16-byte aligned, and %eax
   and %ecx have been adjusted by the number of bytes skipped.  */
L(48bytesormore):
|
|
/* memcpy/bcopy: copy the first 16 bytes right away as two 8-byte
   halves.  memmove: only load them into %xmm0; no store is issued in
   this block, the destinations reached from here perform it.  */
# ifndef USE_AS_MEMMOVE
|
|
movlpd (%eax), %xmm0
|
|
movlpd 8(%eax), %xmm1
|
|
movlpd %xmm0, (%edx)
|
|
movlpd %xmm1, 8(%edx)
|
|
# else
|
|
movdqu (%eax), %xmm0
|
|
# endif
|
|
PUSH (%edi)
|
|
/* %edx = first 16-byte boundary strictly above dest (1..16 bytes
   ahead; those bytes are covered by the first 16 handled above).
   %edi = old dest - new dest, i.e. minus the number of bytes skipped.
   Adding it to %ecx shortens the length; subtracting it from %eax
   advances the source by the same amount.  */
movl %edx, %edi
|
|
and $-16, %edx
|
|
add $16, %edx
|
|
sub %edx, %edi
|
|
add %edi, %ecx
|
|
sub %edi, %eax
|
|
|
|
/* Compare the remaining length with the shared-cache threshold:
   a build-time constant when SHARED_CACHE_SIZE_HALF is defined,
   otherwise the variable __x86_shared_cache_size_half (addressed
   through %ebx as GOT base in the SHARED build, which clobbers
   %ebx).  The flags set here are consumed by the jae below.  */
# ifdef SHARED_CACHE_SIZE_HALF
|
|
cmp $SHARED_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_shared_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_shared_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
|
|
/* mov leaves the flags of the cmp above intact.  Length at or above
   the threshold (unsigned) goes to L(large_page).  Otherwise dispatch
   on the low four bits of the adjusted source address: zero means
   source and destination are both 16-byte aligned (L(shl_0)); any
   other value indexes L(shl_table), which is outside this excerpt.  */
mov %eax, %edi
|
|
jae L(large_page)
|
|
and $0xf, %edi
|
|
jz L(shl_0)
|
|
BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4)
|
|
|
|
.p2align 4
|
|
/* Source and destination are both 16-byte aligned here (reached from
   L(48bytesormore) when the low four bits of %eax are zero), so
   aligned movdqa loads and stores can be used throughout.
   %ecx = bytes left, %edi is still pushed on the stack.  */
L(shl_0):
|
|
/* memmove: store the first 16 source bytes, which L(48bytesormore)
   loaded into %xmm0, to the original destination.  The extra +4 skips
   the %edi pushed there.  */
# ifdef USE_AS_MEMMOVE
|
|
movl DEST+4(%esp), %edi
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* %edi becomes the running byte offset.  More than 127 bytes are
   handled by L(shl_0_gobble).  Otherwise bias %ecx by -32 so that each
   "sub $32" below borrows exactly when fewer than 32 bytes would
   remain after the copy it belongs to.  */
xor %edi, %edi
|
|
cmp $127, %ecx
|
|
ja L(shl_0_gobble)
|
|
lea -32(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Despite the name nothing in this block branches back to this
   label: it is a straight-line sequence of up to four 32-byte
   copies.  In each step the movdqa stores and the lea leave the flags
   of the sub untouched, so the jb tests the borrow of that sub.  The
   fourth step has no branch and falls through to L(shl_0_end).  */
L(shl_0_loop):
|
|
movdqa (%eax, %edi), %xmm0
|
|
movdqa 16(%eax, %edi), %xmm1
|
|
sub $32, %ecx
|
|
movdqa %xmm0, (%edx, %edi)
|
|
movdqa %xmm1, 16(%edx, %edi)
|
|
lea 32(%edi), %edi
|
|
jb L(shl_0_end)
|
|
|
|
movdqa (%eax, %edi), %xmm0
|
|
movdqa 16(%eax, %edi), %xmm1
|
|
sub $32, %ecx
|
|
movdqa %xmm0, (%edx, %edi)
|
|
movdqa %xmm1, 16(%edx, %edi)
|
|
lea 32(%edi), %edi
|
|
jb L(shl_0_end)
|
|
|
|
movdqa (%eax, %edi), %xmm0
|
|
movdqa 16(%eax, %edi), %xmm1
|
|
sub $32, %ecx
|
|
movdqa %xmm0, (%edx, %edi)
|
|
movdqa %xmm1, 16(%edx, %edi)
|
|
lea 32(%edi), %edi
|
|
jb L(shl_0_end)
|
|
|
|
movdqa (%eax, %edi), %xmm0
|
|
movdqa 16(%eax, %edi), %xmm1
|
|
sub $32, %ecx
|
|
movdqa %xmm0, (%edx, %edi)
|
|
movdqa %xmm1, 16(%edx, %edi)
|
|
lea 32(%edi), %edi
|
|
|
|
/* Undo the -32 bias so that %ecx is the number of tail bytes, move
   both pointers past the copied data plus that tail, restore the
   caller's %edi and let the jump table finish the tail.  */
L(shl_0_end):
|
|
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
add %edi, %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
|
|
|
|
/* Not reached by fall-through (the macro above ends in a jmp) and
   emits no instruction.  Presumably it re-declares %edi as pushed for
   the code that follows, which is entered with %edi still on the
   stack -- confirm.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
L(shl_0_gobble):
|
|
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
POP (%edi)
|
|
lea -128(%ecx), %ecx
|
|
jae L(shl_0_gobble_mem_loop)
|
|
|
|
.p2align 4
|
|
L(shl_0_gobble_cache_loop):
|
|
movdqa (%eax), %xmm0
|
|
movdqa 0x10(%eax), %xmm1
|
|
movdqa 0x20(%eax), %xmm2
|
|
movdqa 0x30(%eax), %xmm3
|
|
movdqa 0x40(%eax), %xmm4
|
|
movdqa 0x50(%eax), %xmm5
|
|
movdqa 0x60(%eax), %xmm6
|
|
movdqa 0x70(%eax), %xmm7
|
|
lea 0x80(%eax), %eax
|
|
sub $128, %ecx
|
|
movdqa %xmm0, (%edx)
|
|
movdqa %xmm1, 0x10(%edx)
|
|
movdqa %xmm2, 0x20(%edx)
|
|
movdqa %xmm3, 0x30(%edx)
|
|
movdqa %xmm4, 0x40(%edx)
|
|
movdqa %xmm5, 0x50(%edx)
|
|
movdqa %xmm6, 0x60(%edx)
|
|
movdqa %xmm7, 0x70(%edx)
|
|
lea 0x80(%edx), %edx
|
|
|
|
jae L(shl_0_gobble_cache_loop)
|
|
cmp $-0x40, %ecx
|
|
lea 0x80(%ecx), %ecx
|
|
jl L(shl_0_cache_less_64bytes)
|
|
|
|
movdqa (%eax), %xmm0
|
|
sub $0x40, %ecx
|
|
movdqa 0x10(%eax), %xmm1
|
|
movdqa %xmm0, (%edx)
|
|
movdqa %xmm1, 0x10(%edx)
|
|
movdqa 0x20(%eax), %xmm0
|
|
movdqa 0x30(%eax), %xmm1
|
|
add $0x40, %eax
|
|
movdqa %xmm0, 0x20(%edx)
|
|
movdqa %xmm1, 0x30(%edx)
|
|
add $0x40, %edx
|
|
|
|
L(shl_0_cache_less_64bytes):
|
|
cmp $0x20, %ecx
|
|
jb L(shl_0_cache_less_32bytes)
|
|
movdqa (%eax), %xmm0
|
|
sub $0x20, %ecx
|
|
movdqa 0x10(%eax), %xmm1
|
|
add $0x20, %eax
|
|
movdqa %xmm0, (%edx)
|
|
movdqa %xmm1, 0x10(%edx)
|
|
add $0x20, %edx
|
|
|
|
L(shl_0_cache_less_32bytes):
|
|
cmp $0x10, %ecx
|
|
jb L(shl_0_cache_less_16bytes)
|
|
sub $0x10, %ecx
|
|
movdqa (%eax), %xmm0
|
|
add $0x10, %eax
|
|
movdqa %xmm0, (%edx)
|
|
add $0x10, %edx
|
|
|
|
L(shl_0_cache_less_16bytes):
|
|
add %ecx, %edx
|
|
add %ecx, %eax
|
|
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
.p2align 4
|
|
L(shl_0_gobble_mem_loop):
|
|
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x280(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
|
|
movdqa (%eax), %xmm0
|
|
movdqa 0x10(%eax), %xmm1
|
|
movdqa 0x20(%eax), %xmm2
|
|
movdqa 0x30(%eax), %xmm3
|
|
movdqa 0x40(%eax), %xmm4
|
|
movdqa 0x50(%eax), %xmm5
|
|
movdqa 0x60(%eax), %xmm6
|
|
movdqa 0x70(%eax), %xmm7
|
|
lea 0x80(%eax), %eax
|
|
sub $0x80, %ecx
|
|
movdqa %xmm0, (%edx)
|
|
movdqa %xmm1, 0x10(%edx)
|
|
movdqa %xmm2, 0x20(%edx)
|
|
movdqa %xmm3, 0x30(%edx)
|
|
movdqa %xmm4, 0x40(%edx)
|
|
movdqa %xmm5, 0x50(%edx)
|
|
movdqa %xmm6, 0x60(%edx)
|
|
movdqa %xmm7, 0x70(%edx)
|
|
lea 0x80(%edx), %edx
|
|
|
|
jae L(shl_0_gobble_mem_loop)
|
|
cmp $-0x40, %ecx
|
|
lea 0x80(%ecx), %ecx
|
|
jl L(shl_0_mem_less_64bytes)
|
|
|
|
movdqa (%eax), %xmm0
|
|
sub $0x40, %ecx
|
|
movdqa 0x10(%eax), %xmm1
|
|
|
|
movdqa %xmm0, (%edx)
|
|
movdqa %xmm1, 0x10(%edx)
|
|
|
|
movdqa 0x20(%eax), %xmm0
|
|
movdqa 0x30(%eax), %xmm1
|
|
add $0x40, %eax
|
|
|
|
movdqa %xmm0, 0x20(%edx)
|
|
movdqa %xmm1, 0x30(%edx)
|
|
add $0x40, %edx
|
|
|
|
L(shl_0_mem_less_64bytes):
|
|
cmp $0x20, %ecx
|
|
jb L(shl_0_mem_less_32bytes)
|
|
movdqa (%eax), %xmm0
|
|
sub $0x20, %ecx
|
|
movdqa 0x10(%eax), %xmm1
|
|
add $0x20, %eax
|
|
movdqa %xmm0, (%edx)
|
|
movdqa %xmm1, 0x10(%edx)
|
|
add $0x20, %edx
|
|
|
|
L(shl_0_mem_less_32bytes):
|
|
cmp $0x10, %ecx
|
|
jb L(shl_0_mem_less_16bytes)
|
|
sub $0x10, %ecx
|
|
movdqa (%eax), %xmm0
|
|
add $0x10, %eax
|
|
movdqa %xmm0, (%edx)
|
|
add $0x10, %edx
|
|
|
|
L(shl_0_mem_less_16bytes):
|
|
add %ecx, %edx
|
|
add %ecx, %eax
|
|
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd_align), %ecx, 4)
|
|
|
|
.p2align 4
|
|
L(shl_1):
|
|
# ifndef USE_AS_MEMMOVE
|
|
movaps -1(%eax), %xmm1
|
|
# else
|
|
movl DEST+4(%esp), %edi
|
|
movaps -1(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_1_no_prefetch)
|
|
|
|
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
L(Shl1LoopStart):
|
|
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 15(%eax), %xmm2
|
|
movaps 31(%eax), %xmm3
|
|
movaps 47(%eax), %xmm4
|
|
movaps 63(%eax), %xmm5
|
|
movaps %xmm5, %xmm7
|
|
palignr $1, %xmm4, %xmm5
|
|
palignr $1, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $1, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $1, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
ja L(Shl1LoopStart)
|
|
|
|
L(Shl1LoopLeave):
|
|
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 15(%eax), %xmm2
|
|
movaps 31(%eax), %xmm3
|
|
palignr $1, %xmm2, %xmm3
|
|
palignr $1, %xmm1, %xmm2
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
L(sh_1_no_prefetch):
|
|
lea -32(%ecx), %ecx
|
|
lea -1(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_1_no_prefetch_loop):
|
|
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $1, %xmm2, %xmm3
|
|
palignr $1, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
jb L(sh_1_end_no_prefetch_loop)
|
|
|
|
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $1, %xmm2, %xmm3
|
|
palignr $1, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
jae L(sh_1_no_prefetch_loop)
|
|
|
|
L(sh_1_end_no_prefetch_loop):
|
|
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 1(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
L(shl_2):
|
|
# ifndef USE_AS_MEMMOVE
|
|
movaps -2(%eax), %xmm1
|
|
# else
|
|
movl DEST+4(%esp), %edi
|
|
movaps -2(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_2_no_prefetch)
|
|
|
|
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
L(Shl2LoopStart):
|
|
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 14(%eax), %xmm2
|
|
movaps 30(%eax), %xmm3
|
|
movaps 46(%eax), %xmm4
|
|
movaps 62(%eax), %xmm5
|
|
movaps %xmm5, %xmm7
|
|
palignr $2, %xmm4, %xmm5
|
|
palignr $2, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $2, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $2, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
ja L(Shl2LoopStart)
|
|
|
|
L(Shl2LoopLeave):
|
|
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 14(%eax), %xmm2
|
|
movaps 30(%eax), %xmm3
|
|
palignr $2, %xmm2, %xmm3
|
|
palignr $2, %xmm1, %xmm2
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
L(sh_2_no_prefetch):
|
|
lea -32(%ecx), %ecx
|
|
lea -2(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_2_no_prefetch_loop):
|
|
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $2, %xmm2, %xmm3
|
|
palignr $2, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
jb L(sh_2_end_no_prefetch_loop)
|
|
|
|
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $2, %xmm2, %xmm3
|
|
palignr $2, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
jae L(sh_2_no_prefetch_loop)
|
|
|
|
L(sh_2_end_no_prefetch_loop):
|
|
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 2(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_3): forward copy for a source that is 3 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to %eax - 3;
   palignr $3 splices neighbouring chunks into the 16 bytes stored
   (aligned) through %edx.  %ecx holds the byte count and %xmm1 the
   previously loaded chunk.
   NOTE(review): the alignment of %eax - 3 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_3):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 3.  */
movaps -3(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -3(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_3_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl3LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 13(%eax), %xmm2
|
|
movaps 29(%eax), %xmm3
|
|
movaps 45(%eax), %xmm4
|
|
movaps 61(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $3, %xmm4, %xmm5
|
|
palignr $3, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $3, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $3, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl3LoopStart)
|
|
|
|
L(Shl3LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 13(%eax), %xmm2
|
|
movaps 29(%eax), %xmm3
|
|
palignr $3, %xmm2, %xmm3
|
|
palignr $3, %xmm1, %xmm2
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_3_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 3 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -3(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_3_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $3, %xmm2, %xmm3
|
|
palignr $3, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_3_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $3, %xmm2, %xmm3
|
|
palignr $3, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_3_no_prefetch_loop)
|
|
|
|
L(sh_3_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 3 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 3(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_4): forward copy for a source that is 4 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to %eax - 4;
   palignr $4 splices neighbouring chunks into the 16 bytes stored
   (aligned) through %edx.  %ecx holds the byte count and %xmm1 the
   previously loaded chunk.
   NOTE(review): the alignment of %eax - 4 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_4):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 4.  */
movaps -4(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -4(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_4_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl4LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 12(%eax), %xmm2
|
|
movaps 28(%eax), %xmm3
|
|
movaps 44(%eax), %xmm4
|
|
movaps 60(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $4, %xmm4, %xmm5
|
|
palignr $4, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $4, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $4, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl4LoopStart)
|
|
|
|
L(Shl4LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 12(%eax), %xmm2
|
|
movaps 28(%eax), %xmm3
|
|
palignr $4, %xmm2, %xmm3
|
|
palignr $4, %xmm1, %xmm2
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_4_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 4 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -4(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_4_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $4, %xmm2, %xmm3
|
|
palignr $4, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_4_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $4, %xmm2, %xmm3
|
|
palignr $4, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_4_no_prefetch_loop)
|
|
|
|
L(sh_4_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 4 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 4(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_5): forward copy for a source that is 5 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to %eax - 5;
   palignr $5 splices neighbouring chunks into the 16 bytes stored
   (aligned) through %edx.  %ecx holds the byte count and %xmm1 the
   previously loaded chunk.
   NOTE(review): the alignment of %eax - 5 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_5):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 5.  */
movaps -5(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -5(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_5_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl5LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 11(%eax), %xmm2
|
|
movaps 27(%eax), %xmm3
|
|
movaps 43(%eax), %xmm4
|
|
movaps 59(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $5, %xmm4, %xmm5
|
|
palignr $5, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $5, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $5, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl5LoopStart)
|
|
|
|
L(Shl5LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 11(%eax), %xmm2
|
|
movaps 27(%eax), %xmm3
|
|
palignr $5, %xmm2, %xmm3
|
|
palignr $5, %xmm1, %xmm2
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_5_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 5 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -5(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_5_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $5, %xmm2, %xmm3
|
|
palignr $5, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_5_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $5, %xmm2, %xmm3
|
|
palignr $5, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_5_no_prefetch_loop)
|
|
|
|
L(sh_5_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 5 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 5(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_6): forward copy for a source that is 6 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to %eax - 6;
   palignr $6 splices neighbouring chunks into the 16 bytes stored
   (aligned) through %edx.  %ecx holds the byte count and %xmm1 the
   previously loaded chunk.
   NOTE(review): the alignment of %eax - 6 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_6):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 6.  */
movaps -6(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -6(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_6_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl6LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 10(%eax), %xmm2
|
|
movaps 26(%eax), %xmm3
|
|
movaps 42(%eax), %xmm4
|
|
movaps 58(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $6, %xmm4, %xmm5
|
|
palignr $6, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $6, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $6, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl6LoopStart)
|
|
|
|
L(Shl6LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 10(%eax), %xmm2
|
|
movaps 26(%eax), %xmm3
|
|
palignr $6, %xmm2, %xmm3
|
|
palignr $6, %xmm1, %xmm2
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_6_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 6 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -6(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_6_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $6, %xmm2, %xmm3
|
|
palignr $6, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_6_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $6, %xmm2, %xmm3
|
|
palignr $6, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_6_no_prefetch_loop)
|
|
|
|
L(sh_6_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 6 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 6(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_7): forward copy for a source that is 7 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to %eax - 7;
   palignr $7 splices neighbouring chunks into the 16 bytes stored
   (aligned) through %edx.  %ecx holds the byte count and %xmm1 the
   previously loaded chunk.
   NOTE(review): the alignment of %eax - 7 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_7):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 7.  */
movaps -7(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -7(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_7_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl7LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 9(%eax), %xmm2
|
|
movaps 25(%eax), %xmm3
|
|
movaps 41(%eax), %xmm4
|
|
movaps 57(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $7, %xmm4, %xmm5
|
|
palignr $7, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $7, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $7, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl7LoopStart)
|
|
|
|
L(Shl7LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 9(%eax), %xmm2
|
|
movaps 25(%eax), %xmm3
|
|
palignr $7, %xmm2, %xmm3
|
|
palignr $7, %xmm1, %xmm2
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_7_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 7 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -7(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_7_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $7, %xmm2, %xmm3
|
|
palignr $7, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_7_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $7, %xmm2, %xmm3
|
|
palignr $7, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_7_no_prefetch_loop)
|
|
|
|
L(sh_7_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 7 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 7(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_8): forward copy for a source that is 8 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to %eax - 8;
   palignr $8 splices neighbouring chunks into the 16 bytes stored
   (aligned) through %edx.  %ecx holds the byte count and %xmm1 the
   previously loaded chunk.
   NOTE(review): the alignment of %eax - 8 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_8):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 8.  */
movaps -8(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -8(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_8_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl8LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 8(%eax), %xmm2
|
|
movaps 24(%eax), %xmm3
|
|
movaps 40(%eax), %xmm4
|
|
movaps 56(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $8, %xmm4, %xmm5
|
|
palignr $8, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $8, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $8, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl8LoopStart)
|
|
|
|
/* NOTE(review): this label is spelled differently from its siblings
   (L(Shl7LoopLeave), L(Shl9LoopLeave)); it is reached by fall-through
   here and is left as is because references outside this section
   cannot be ruled out.  */
L(LoopLeave8):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 8(%eax), %xmm2
|
|
movaps 24(%eax), %xmm3
|
|
palignr $8, %xmm2, %xmm3
|
|
palignr $8, %xmm1, %xmm2
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_8_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 8 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -8(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_8_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $8, %xmm2, %xmm3
|
|
palignr $8, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_8_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $8, %xmm2, %xmm3
|
|
palignr $8, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_8_no_prefetch_loop)
|
|
|
|
L(sh_8_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 8 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 8(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_9): forward copy for a source that is 9 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to %eax - 9;
   palignr $9 splices neighbouring chunks into the 16 bytes stored
   (aligned) through %edx.  %ecx holds the byte count and %xmm1 the
   previously loaded chunk.
   NOTE(review): the alignment of %eax - 9 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_9):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 9.  */
movaps -9(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -9(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_9_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl9LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 7(%eax), %xmm2
|
|
movaps 23(%eax), %xmm3
|
|
movaps 39(%eax), %xmm4
|
|
movaps 55(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $9, %xmm4, %xmm5
|
|
palignr $9, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $9, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $9, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl9LoopStart)
|
|
|
|
L(Shl9LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 7(%eax), %xmm2
|
|
movaps 23(%eax), %xmm3
|
|
palignr $9, %xmm2, %xmm3
|
|
palignr $9, %xmm1, %xmm2
|
|
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_9_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 9 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -9(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_9_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $9, %xmm2, %xmm3
|
|
palignr $9, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_9_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $9, %xmm2, %xmm3
|
|
palignr $9, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_9_no_prefetch_loop)
|
|
|
|
L(sh_9_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 9 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 9(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_10): forward copy for a source that is 10 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to
   %eax - 10; palignr $10 splices neighbouring chunks into the 16 bytes
   stored (aligned) through %edx.  %ecx holds the byte count and %xmm1
   the previously loaded chunk.
   NOTE(review): the alignment of %eax - 10 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_10):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 10.  */
movaps -10(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -10(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_10_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl10LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 6(%eax), %xmm2
|
|
movaps 22(%eax), %xmm3
|
|
movaps 38(%eax), %xmm4
|
|
movaps 54(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $10, %xmm4, %xmm5
|
|
palignr $10, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $10, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $10, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl10LoopStart)
|
|
|
|
L(Shl10LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 6(%eax), %xmm2
|
|
movaps 22(%eax), %xmm3
|
|
palignr $10, %xmm2, %xmm3
|
|
palignr $10, %xmm1, %xmm2
|
|
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_10_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 10 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -10(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_10_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $10, %xmm2, %xmm3
|
|
palignr $10, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_10_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $10, %xmm2, %xmm3
|
|
palignr $10, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_10_no_prefetch_loop)
|
|
|
|
L(sh_10_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 10 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 10(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_11): forward copy for a source that is 11 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to
   %eax - 11; palignr $11 splices neighbouring chunks into the 16 bytes
   stored (aligned) through %edx.  %ecx holds the byte count and %xmm1
   the previously loaded chunk.
   NOTE(review): the alignment of %eax - 11 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_11):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 11.  */
movaps -11(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -11(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_11_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl11LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 5(%eax), %xmm2
|
|
movaps 21(%eax), %xmm3
|
|
movaps 37(%eax), %xmm4
|
|
movaps 53(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $11, %xmm4, %xmm5
|
|
palignr $11, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $11, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $11, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl11LoopStart)
|
|
|
|
L(Shl11LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 5(%eax), %xmm2
|
|
movaps 21(%eax), %xmm3
|
|
palignr $11, %xmm2, %xmm3
|
|
palignr $11, %xmm1, %xmm2
|
|
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_11_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 11 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -11(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_11_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $11, %xmm2, %xmm3
|
|
palignr $11, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_11_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $11, %xmm2, %xmm3
|
|
palignr $11, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_11_no_prefetch_loop)
|
|
|
|
L(sh_11_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 11 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 11(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_12): forward copy for a source that is 12 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to
   %eax - 12; palignr $12 splices neighbouring chunks into the 16 bytes
   stored (aligned) through %edx.  %ecx holds the byte count and %xmm1
   the previously loaded chunk.
   NOTE(review): the alignment of %eax - 12 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_12):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 12.  */
movaps -12(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -12(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_12_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl12LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 4(%eax), %xmm2
|
|
movaps 20(%eax), %xmm3
|
|
movaps 36(%eax), %xmm4
|
|
movaps 52(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $12, %xmm4, %xmm5
|
|
palignr $12, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $12, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $12, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl12LoopStart)
|
|
|
|
L(Shl12LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 4(%eax), %xmm2
|
|
movaps 20(%eax), %xmm3
|
|
palignr $12, %xmm2, %xmm3
|
|
palignr $12, %xmm1, %xmm2
|
|
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_12_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 12 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -12(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_12_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $12, %xmm2, %xmm3
|
|
palignr $12, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_12_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $12, %xmm2, %xmm3
|
|
palignr $12, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_12_no_prefetch_loop)
|
|
|
|
L(sh_12_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 12 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 12(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_13): forward copy for a source that is 13 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to
   %eax - 13; palignr $13 splices neighbouring chunks into the 16 bytes
   stored (aligned) through %edx.  %ecx holds the byte count and %xmm1
   the previously loaded chunk.
   NOTE(review): the alignment of %eax - 13 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_13):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 13.  */
movaps -13(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -13(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_13_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl13LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 3(%eax), %xmm2
|
|
movaps 19(%eax), %xmm3
|
|
movaps 35(%eax), %xmm4
|
|
movaps 51(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $13, %xmm4, %xmm5
|
|
palignr $13, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $13, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $13, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl13LoopStart)
|
|
|
|
L(Shl13LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 3(%eax), %xmm2
|
|
movaps 19(%eax), %xmm3
|
|
palignr $13, %xmm2, %xmm3
|
|
palignr $13, %xmm1, %xmm2
|
|
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_13_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 13 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -13(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_13_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $13, %xmm2, %xmm3
|
|
palignr $13, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_13_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $13, %xmm2, %xmm3
|
|
palignr $13, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_13_no_prefetch_loop)
|
|
|
|
L(sh_13_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 13 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 13(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_14): forward copy for a source that is 14 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to
   %eax - 14; palignr $14 splices neighbouring chunks into the 16 bytes
   stored (aligned) through %edx.  %ecx holds the byte count and %xmm1
   the previously loaded chunk.
   NOTE(review): the alignment of %eax - 14 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_14):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 14.  */
movaps -14(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -14(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_14_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl14LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 2(%eax), %xmm2
|
|
movaps 18(%eax), %xmm3
|
|
movaps 34(%eax), %xmm4
|
|
movaps 50(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $14, %xmm4, %xmm5
|
|
palignr $14, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $14, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $14, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl14LoopStart)
|
|
|
|
L(Shl14LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 2(%eax), %xmm2
|
|
movaps 18(%eax), %xmm3
|
|
palignr $14, %xmm2, %xmm3
|
|
palignr $14, %xmm1, %xmm2
|
|
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_14_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 14 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -14(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_14_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $14, %xmm2, %xmm3
|
|
palignr $14, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_14_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $14, %xmm2, %xmm3
|
|
palignr $14, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_14_no_prefetch_loop)
|
|
|
|
L(sh_14_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 14 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 14(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* L(shl_15): forward copy for a source that is 15 bytes off 16-byte
   alignment.  Loads are aligned 16-byte accesses relative to
   %eax - 15; palignr $15 splices neighbouring chunks into the 16 bytes
   stored (aligned) through %edx.  %ecx holds the byte count and %xmm1
   the previously loaded chunk.
   NOTE(review): the alignment of %eax - 15 and of %edx is set up
   outside this section; movaps/movdqa fault on unaligned memory.  */
L(shl_15):
|
|
# ifndef USE_AS_MEMMOVE
|
|
/* Prime %xmm1 with the aligned chunk that starts at %eax - 15.  */
movaps -15(%eax), %xmm1
|
|
# else
|
|
/* memmove build: additionally store %xmm0 unaligned at the address
   read from DEST+4(%esp).  NOTE(review): %xmm0 is loaded before this
   section, presumably with the first source bytes -- confirm.  */
movl DEST+4(%esp), %edi
|
|
movaps -15(%eax), %xmm1
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
/* Counts below half the data cache size (unsigned compare) use the
   loop without prefetch.  */
# ifdef DATA_CACHE_SIZE_HALF
|
|
cmp $DATA_CACHE_SIZE_HALF, %ecx
|
|
# else
|
|
# ifdef SHARED
|
|
SETUP_PIC_REG(bx)
|
|
add $_GLOBAL_OFFSET_TABLE_, %ebx
|
|
cmp __x86_data_cache_size_half@GOTOFF(%ebx), %ecx
|
|
# else
|
|
cmp __x86_data_cache_size_half, %ecx
|
|
# endif
|
|
# endif
|
|
jb L(sh_15_no_prefetch)
|
|
|
|
/* Pre-bias the count by the 64 bytes one loop iteration copies.  */
lea -64(%ecx), %ecx
|
|
|
|
.p2align 4
|
|
/* Prefetching loop, 64 bytes per iteration.  */
L(Shl15LoopStart):
|
|
/* Touch both streams 0x1c0 bytes ahead.  */
prefetcht0 0x1c0(%eax)
|
|
prefetcht0 0x1c0(%edx)
|
|
movaps 1(%eax), %xmm2
|
|
movaps 17(%eax), %xmm3
|
|
movaps 33(%eax), %xmm4
|
|
movaps 49(%eax), %xmm5
|
|
/* Keep an unshifted copy of the last chunk; it becomes %xmm1 for the
   next iteration.  */
movaps %xmm5, %xmm7
|
|
palignr $15, %xmm4, %xmm5
|
|
palignr $15, %xmm3, %xmm4
|
|
movaps %xmm5, 48(%edx)
|
|
palignr $15, %xmm2, %xmm3
|
|
lea 64(%eax), %eax
|
|
palignr $15, %xmm1, %xmm2
|
|
movaps %xmm4, 32(%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
movaps %xmm7, %xmm1
|
|
movaps %xmm2, (%edx)
|
|
lea 64(%edx), %edx
|
|
sub $64, %ecx
|
|
/* Repeat while the subtraction neither borrowed nor reached zero.  */
ja L(Shl15LoopStart)
|
|
|
|
L(Shl15LoopLeave):
|
|
/* Give back 32 of the 64-byte bias.  A result <= 0 (signed) goes to
   L(shl_end_0); otherwise one more 32-byte pair is copied here.  */
add $32, %ecx
|
|
jle L(shl_end_0)
|
|
|
|
movaps 1(%eax), %xmm2
|
|
movaps 17(%eax), %xmm3
|
|
palignr $15, %xmm2, %xmm3
|
|
palignr $15, %xmm1, %xmm2
|
|
|
|
movaps %xmm2, (%edx)
|
|
movaps %xmm3, 16(%edx)
|
|
/* Step both pointers over the 32 bytes just copied plus the count left
   in %ecx, then restore %edi and dispatch on %ecx through
   L(table_48bytes_fwd) (macros defined outside this section).  */
lea 32(%edx, %ecx), %edx
|
|
lea 32(%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably re-declares %edi as saved for the unwind
   info of the code below, which is reached from before the POP --
   confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
/* Loop without prefetch: 32 bytes per step, unrolled twice.  */
L(sh_15_no_prefetch):
|
|
/* Pre-bias the count by 32, rebase %eax to %eax - 15 and clear %edi,
   which serves as the running offset for both pointers.  */
lea -32(%ecx), %ecx
|
|
lea -15(%eax), %eax
|
|
xor %edi, %edi
|
|
|
|
.p2align 4
|
|
L(sh_15_no_prefetch_loop):
|
|
/* First half: previous chunk in %xmm1, unshifted copy of the newest
   chunk saved in %xmm4.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm4
|
|
palignr $15, %xmm2, %xmm3
|
|
palignr $15, %xmm1, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Flags are still those of the sub above (lea, movdqa and palignr do
   not write them): leave the loop on borrow.  */
jb L(sh_15_end_no_prefetch_loop)
|
|
|
|
/* Second half: same step with %xmm4 as the previous chunk and %xmm1
   receiving the unshifted copy for the next round.  */
movdqa 16(%eax, %edi), %xmm2
|
|
sub $32, %ecx
|
|
movdqa 32(%eax, %edi), %xmm3
|
|
movdqa %xmm3, %xmm1
|
|
palignr $15, %xmm2, %xmm3
|
|
palignr $15, %xmm4, %xmm2
|
|
lea 32(%edi), %edi
|
|
movdqa %xmm2, -32(%edx, %edi)
|
|
movdqa %xmm3, -16(%edx, %edi)
|
|
/* Again testing the sub above: loop while it did not borrow.  */
jae L(sh_15_no_prefetch_loop)
|
|
|
|
L(sh_15_end_no_prefetch_loop):
|
|
/* Drop the 32-byte bias, fold the offset in %edi plus the remaining
   count into both pointers (the extra 15 undoes the rebase of %eax),
   then restore %edi and dispatch on %ecx.  */
lea 32(%ecx), %ecx
|
|
add %ecx, %edi
|
|
add %edi, %edx
|
|
lea 15(%edi, %eax), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
/* NOTE(review): presumably keeps the unwind info consistent for the
   code that follows -- confirm against the CFI_PUSH macro.  */
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
L(shl_end_0):
|
|
lea 32(%ecx), %ecx
|
|
lea (%edx, %ecx), %edx
|
|
lea (%eax, %ecx), %eax
|
|
POP (%edi)
|
|
BRANCH_TO_JMPTBL_ENTRY(L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
.p2align 4
|
|
L(fwd_write_44bytes):
|
|
movq -44(%eax), %xmm0
|
|
movq %xmm0, -44(%edx)
|
|
L(fwd_write_36bytes):
|
|
movq -36(%eax), %xmm0
|
|
movq %xmm0, -36(%edx)
|
|
L(fwd_write_28bytes):
|
|
movq -28(%eax), %xmm0
|
|
movq %xmm0, -28(%edx)
|
|
L(fwd_write_20bytes):
|
|
movq -20(%eax), %xmm0
|
|
movq %xmm0, -20(%edx)
|
|
L(fwd_write_12bytes):
|
|
movq -12(%eax), %xmm0
|
|
movq %xmm0, -12(%edx)
|
|
L(fwd_write_4bytes):
|
|
movl -4(%eax), %ecx
|
|
movl %ecx, -4(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_40bytes):
|
|
movq -40(%eax), %xmm0
|
|
movq %xmm0, -40(%edx)
|
|
L(fwd_write_32bytes):
|
|
movq -32(%eax), %xmm0
|
|
movq %xmm0, -32(%edx)
|
|
L(fwd_write_24bytes):
|
|
movq -24(%eax), %xmm0
|
|
movq %xmm0, -24(%edx)
|
|
L(fwd_write_16bytes):
|
|
movq -16(%eax), %xmm0
|
|
movq %xmm0, -16(%edx)
|
|
L(fwd_write_8bytes):
|
|
movq -8(%eax), %xmm0
|
|
movq %xmm0, -8(%edx)
|
|
L(fwd_write_0bytes):
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_5bytes):
|
|
movl -5(%eax), %ecx
|
|
movl -4(%eax), %eax
|
|
movl %ecx, -5(%edx)
|
|
movl %eax, -4(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_45bytes):
|
|
movq -45(%eax), %xmm0
|
|
movq %xmm0, -45(%edx)
|
|
L(fwd_write_37bytes):
|
|
movq -37(%eax), %xmm0
|
|
movq %xmm0, -37(%edx)
|
|
L(fwd_write_29bytes):
|
|
movq -29(%eax), %xmm0
|
|
movq %xmm0, -29(%edx)
|
|
L(fwd_write_21bytes):
|
|
movq -21(%eax), %xmm0
|
|
movq %xmm0, -21(%edx)
|
|
L(fwd_write_13bytes):
|
|
movq -13(%eax), %xmm0
|
|
movq %xmm0, -13(%edx)
|
|
movl -5(%eax), %ecx
|
|
movl %ecx, -5(%edx)
|
|
movzbl -1(%eax), %ecx
|
|
movb %cl, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_41bytes):
|
|
movq -41(%eax), %xmm0
|
|
movq %xmm0, -41(%edx)
|
|
L(fwd_write_33bytes):
|
|
movq -33(%eax), %xmm0
|
|
movq %xmm0, -33(%edx)
|
|
L(fwd_write_25bytes):
|
|
movq -25(%eax), %xmm0
|
|
movq %xmm0, -25(%edx)
|
|
L(fwd_write_17bytes):
|
|
movq -17(%eax), %xmm0
|
|
movq %xmm0, -17(%edx)
|
|
L(fwd_write_9bytes):
|
|
movq -9(%eax), %xmm0
|
|
movq %xmm0, -9(%edx)
|
|
L(fwd_write_1bytes):
|
|
movzbl -1(%eax), %ecx
|
|
movb %cl, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_46bytes):
|
|
movq -46(%eax), %xmm0
|
|
movq %xmm0, -46(%edx)
|
|
L(fwd_write_38bytes):
|
|
movq -38(%eax), %xmm0
|
|
movq %xmm0, -38(%edx)
|
|
L(fwd_write_30bytes):
|
|
movq -30(%eax), %xmm0
|
|
movq %xmm0, -30(%edx)
|
|
L(fwd_write_22bytes):
|
|
movq -22(%eax), %xmm0
|
|
movq %xmm0, -22(%edx)
|
|
L(fwd_write_14bytes):
|
|
movq -14(%eax), %xmm0
|
|
movq %xmm0, -14(%edx)
|
|
L(fwd_write_6bytes):
|
|
movl -6(%eax), %ecx
|
|
movl %ecx, -6(%edx)
|
|
movzwl -2(%eax), %ecx
|
|
movw %cx, -2(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_42bytes):
|
|
movq -42(%eax), %xmm0
|
|
movq %xmm0, -42(%edx)
|
|
L(fwd_write_34bytes):
|
|
movq -34(%eax), %xmm0
|
|
movq %xmm0, -34(%edx)
|
|
L(fwd_write_26bytes):
|
|
movq -26(%eax), %xmm0
|
|
movq %xmm0, -26(%edx)
|
|
L(fwd_write_18bytes):
|
|
movq -18(%eax), %xmm0
|
|
movq %xmm0, -18(%edx)
|
|
L(fwd_write_10bytes):
|
|
movq -10(%eax), %xmm0
|
|
movq %xmm0, -10(%edx)
|
|
L(fwd_write_2bytes):
|
|
movzwl -2(%eax), %ecx
|
|
movw %cx, -2(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_47bytes):
|
|
movq -47(%eax), %xmm0
|
|
movq %xmm0, -47(%edx)
|
|
L(fwd_write_39bytes):
|
|
movq -39(%eax), %xmm0
|
|
movq %xmm0, -39(%edx)
|
|
L(fwd_write_31bytes):
|
|
movq -31(%eax), %xmm0
|
|
movq %xmm0, -31(%edx)
|
|
L(fwd_write_23bytes):
|
|
movq -23(%eax), %xmm0
|
|
movq %xmm0, -23(%edx)
|
|
L(fwd_write_15bytes):
|
|
movq -15(%eax), %xmm0
|
|
movq %xmm0, -15(%edx)
|
|
L(fwd_write_7bytes):
|
|
movl -7(%eax), %ecx
|
|
movl %ecx, -7(%edx)
|
|
movzwl -3(%eax), %ecx
|
|
movzbl -1(%eax), %eax
|
|
movw %cx, -3(%edx)
|
|
movb %al, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_43bytes):
|
|
movq -43(%eax), %xmm0
|
|
movq %xmm0, -43(%edx)
|
|
L(fwd_write_35bytes):
|
|
movq -35(%eax), %xmm0
|
|
movq %xmm0, -35(%edx)
|
|
L(fwd_write_27bytes):
|
|
movq -27(%eax), %xmm0
|
|
movq %xmm0, -27(%edx)
|
|
L(fwd_write_19bytes):
|
|
movq -19(%eax), %xmm0
|
|
movq %xmm0, -19(%edx)
|
|
L(fwd_write_11bytes):
|
|
movq -11(%eax), %xmm0
|
|
movq %xmm0, -11(%edx)
|
|
L(fwd_write_3bytes):
|
|
movzwl -3(%eax), %ecx
|
|
movzbl -1(%eax), %eax
|
|
movw %cx, -3(%edx)
|
|
movb %al, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_40bytes_align):
|
|
movdqa -40(%eax), %xmm0
|
|
movdqa %xmm0, -40(%edx)
|
|
L(fwd_write_24bytes_align):
|
|
movdqa -24(%eax), %xmm0
|
|
movdqa %xmm0, -24(%edx)
|
|
L(fwd_write_8bytes_align):
|
|
movq -8(%eax), %xmm0
|
|
movq %xmm0, -8(%edx)
|
|
L(fwd_write_0bytes_align):
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_32bytes_align):
|
|
movdqa -32(%eax), %xmm0
|
|
movdqa %xmm0, -32(%edx)
|
|
L(fwd_write_16bytes_align):
|
|
movdqa -16(%eax), %xmm0
|
|
movdqa %xmm0, -16(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_5bytes_align):
|
|
movl -5(%eax), %ecx
|
|
movl -4(%eax), %eax
|
|
movl %ecx, -5(%edx)
|
|
movl %eax, -4(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_45bytes_align):
|
|
movdqa -45(%eax), %xmm0
|
|
movdqa %xmm0, -45(%edx)
|
|
L(fwd_write_29bytes_align):
|
|
movdqa -29(%eax), %xmm0
|
|
movdqa %xmm0, -29(%edx)
|
|
L(fwd_write_13bytes_align):
|
|
movq -13(%eax), %xmm0
|
|
movq %xmm0, -13(%edx)
|
|
movl -5(%eax), %ecx
|
|
movl %ecx, -5(%edx)
|
|
movzbl -1(%eax), %ecx
|
|
movb %cl, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_37bytes_align):
|
|
movdqa -37(%eax), %xmm0
|
|
movdqa %xmm0, -37(%edx)
|
|
L(fwd_write_21bytes_align):
|
|
movdqa -21(%eax), %xmm0
|
|
movdqa %xmm0, -21(%edx)
|
|
movl -5(%eax), %ecx
|
|
movl %ecx, -5(%edx)
|
|
movzbl -1(%eax), %ecx
|
|
movb %cl, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_41bytes_align):
|
|
movdqa -41(%eax), %xmm0
|
|
movdqa %xmm0, -41(%edx)
|
|
L(fwd_write_25bytes_align):
|
|
movdqa -25(%eax), %xmm0
|
|
movdqa %xmm0, -25(%edx)
|
|
L(fwd_write_9bytes_align):
|
|
movq -9(%eax), %xmm0
|
|
movq %xmm0, -9(%edx)
|
|
L(fwd_write_1bytes_align):
|
|
movzbl -1(%eax), %ecx
|
|
movb %cl, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_33bytes_align):
|
|
movdqa -33(%eax), %xmm0
|
|
movdqa %xmm0, -33(%edx)
|
|
L(fwd_write_17bytes_align):
|
|
movdqa -17(%eax), %xmm0
|
|
movdqa %xmm0, -17(%edx)
|
|
movzbl -1(%eax), %ecx
|
|
movb %cl, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_46bytes_align):
|
|
movdqa -46(%eax), %xmm0
|
|
movdqa %xmm0, -46(%edx)
|
|
L(fwd_write_30bytes_align):
|
|
movdqa -30(%eax), %xmm0
|
|
movdqa %xmm0, -30(%edx)
|
|
L(fwd_write_14bytes_align):
|
|
movq -14(%eax), %xmm0
|
|
movq %xmm0, -14(%edx)
|
|
L(fwd_write_6bytes_align):
|
|
movl -6(%eax), %ecx
|
|
movl %ecx, -6(%edx)
|
|
movzwl -2(%eax), %ecx
|
|
movw %cx, -2(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_38bytes_align):
|
|
movdqa -38(%eax), %xmm0
|
|
movdqa %xmm0, -38(%edx)
|
|
L(fwd_write_22bytes_align):
|
|
movdqa -22(%eax), %xmm0
|
|
movdqa %xmm0, -22(%edx)
|
|
movl -6(%eax), %ecx
|
|
movl %ecx, -6(%edx)
|
|
movzwl -2(%eax), %ecx
|
|
movw %cx, -2(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_42bytes_align):
|
|
movdqa -42(%eax), %xmm0
|
|
movdqa %xmm0, -42(%edx)
|
|
L(fwd_write_26bytes_align):
|
|
movdqa -26(%eax), %xmm0
|
|
movdqa %xmm0, -26(%edx)
|
|
L(fwd_write_10bytes_align):
|
|
movq -10(%eax), %xmm0
|
|
movq %xmm0, -10(%edx)
|
|
L(fwd_write_2bytes_align):
|
|
movzwl -2(%eax), %ecx
|
|
movw %cx, -2(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_34bytes_align):
|
|
movdqa -34(%eax), %xmm0
|
|
movdqa %xmm0, -34(%edx)
|
|
L(fwd_write_18bytes_align):
|
|
movdqa -18(%eax), %xmm0
|
|
movdqa %xmm0, -18(%edx)
|
|
movzwl -2(%eax), %ecx
|
|
movw %cx, -2(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_47bytes_align):
|
|
movdqa -47(%eax), %xmm0
|
|
movdqa %xmm0, -47(%edx)
|
|
L(fwd_write_31bytes_align):
|
|
movdqa -31(%eax), %xmm0
|
|
movdqa %xmm0, -31(%edx)
|
|
L(fwd_write_15bytes_align):
|
|
movq -15(%eax), %xmm0
|
|
movq %xmm0, -15(%edx)
|
|
L(fwd_write_7bytes_align):
|
|
movl -7(%eax), %ecx
|
|
movl %ecx, -7(%edx)
|
|
movzwl -3(%eax), %ecx
|
|
movzbl -1(%eax), %eax
|
|
movw %cx, -3(%edx)
|
|
movb %al, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_39bytes_align):
|
|
movdqa -39(%eax), %xmm0
|
|
movdqa %xmm0, -39(%edx)
|
|
L(fwd_write_23bytes_align):
|
|
movdqa -23(%eax), %xmm0
|
|
movdqa %xmm0, -23(%edx)
|
|
movl -7(%eax), %ecx
|
|
movl %ecx, -7(%edx)
|
|
movzwl -3(%eax), %ecx
|
|
movzbl -1(%eax), %eax
|
|
movw %cx, -3(%edx)
|
|
movb %al, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_43bytes_align):
|
|
movdqa -43(%eax), %xmm0
|
|
movdqa %xmm0, -43(%edx)
|
|
L(fwd_write_27bytes_align):
|
|
movdqa -27(%eax), %xmm0
|
|
movdqa %xmm0, -27(%edx)
|
|
L(fwd_write_11bytes_align):
|
|
movq -11(%eax), %xmm0
|
|
movq %xmm0, -11(%edx)
|
|
L(fwd_write_3bytes_align):
|
|
movzwl -3(%eax), %ecx
|
|
movzbl -1(%eax), %eax
|
|
movw %cx, -3(%edx)
|
|
movb %al, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_35bytes_align):
|
|
movdqa -35(%eax), %xmm0
|
|
movdqa %xmm0, -35(%edx)
|
|
L(fwd_write_19bytes_align):
|
|
movdqa -19(%eax), %xmm0
|
|
movdqa %xmm0, -19(%edx)
|
|
movzwl -3(%eax), %ecx
|
|
movzbl -1(%eax), %eax
|
|
movw %cx, -3(%edx)
|
|
movb %al, -1(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_44bytes_align):
|
|
movdqa -44(%eax), %xmm0
|
|
movdqa %xmm0, -44(%edx)
|
|
L(fwd_write_28bytes_align):
|
|
movdqa -28(%eax), %xmm0
|
|
movdqa %xmm0, -28(%edx)
|
|
L(fwd_write_12bytes_align):
|
|
movq -12(%eax), %xmm0
|
|
movq %xmm0, -12(%edx)
|
|
L(fwd_write_4bytes_align):
|
|
movl -4(%eax), %ecx
|
|
movl %ecx, -4(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(fwd_write_36bytes_align):
|
|
movdqa -36(%eax), %xmm0
|
|
movdqa %xmm0, -36(%edx)
|
|
L(fwd_write_20bytes_align):
|
|
movdqa -20(%eax), %xmm0
|
|
movdqa %xmm0, -20(%edx)
|
|
movl -4(%eax), %ecx
|
|
movl %ecx, -4(%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl %edx, %eax
|
|
# else
|
|
movl DEST(%esp), %eax
|
|
# endif
|
|
# endif
|
|
RETURN_END
|
|
|
|
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
L(large_page):
|
|
movdqu (%eax), %xmm1
|
|
# ifdef USE_AS_MEMMOVE
|
|
movl DEST+4(%esp), %edi
|
|
movdqu %xmm0, (%edi)
|
|
# endif
|
|
lea 16(%eax), %eax
|
|
movntdq %xmm1, (%edx)
|
|
lea 16(%edx), %edx
|
|
lea -0x90(%ecx), %ecx
|
|
POP (%edi)
|
|
|
|
.p2align 4
|
|
L(large_page_loop):
|
|
movdqu (%eax), %xmm0
|
|
movdqu 0x10(%eax), %xmm1
|
|
movdqu 0x20(%eax), %xmm2
|
|
movdqu 0x30(%eax), %xmm3
|
|
movdqu 0x40(%eax), %xmm4
|
|
movdqu 0x50(%eax), %xmm5
|
|
movdqu 0x60(%eax), %xmm6
|
|
movdqu 0x70(%eax), %xmm7
|
|
lea 0x80(%eax), %eax
|
|
|
|
sub $0x80, %ecx
|
|
movntdq %xmm0, (%edx)
|
|
movntdq %xmm1, 0x10(%edx)
|
|
movntdq %xmm2, 0x20(%edx)
|
|
movntdq %xmm3, 0x30(%edx)
|
|
movntdq %xmm4, 0x40(%edx)
|
|
movntdq %xmm5, 0x50(%edx)
|
|
movntdq %xmm6, 0x60(%edx)
|
|
movntdq %xmm7, 0x70(%edx)
|
|
lea 0x80(%edx), %edx
|
|
jae L(large_page_loop)
|
|
cmp $-0x40, %ecx
|
|
lea 0x80(%ecx), %ecx
|
|
jl L(large_page_less_64bytes)
|
|
|
|
movdqu (%eax), %xmm0
|
|
movdqu 0x10(%eax), %xmm1
|
|
movdqu 0x20(%eax), %xmm2
|
|
movdqu 0x30(%eax), %xmm3
|
|
lea 0x40(%eax), %eax
|
|
|
|
movntdq %xmm0, (%edx)
|
|
movntdq %xmm1, 0x10(%edx)
|
|
movntdq %xmm2, 0x20(%edx)
|
|
movntdq %xmm3, 0x30(%edx)
|
|
lea 0x40(%edx), %edx
|
|
sub $0x40, %ecx
|
|
L(large_page_less_64bytes):
|
|
cmp $32, %ecx
|
|
jb L(large_page_less_32bytes)
|
|
movdqu (%eax), %xmm0
|
|
movdqu 0x10(%eax), %xmm1
|
|
lea 0x20(%eax), %eax
|
|
movntdq %xmm0, (%edx)
|
|
movntdq %xmm1, 0x10(%edx)
|
|
lea 0x20(%edx), %edx
|
|
sub $0x20, %ecx
|
|
L(large_page_less_32bytes):
|
|
add %ecx, %edx
|
|
add %ecx, %eax
|
|
sfence
|
|
BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4)
|
|
|
|
.p2align 4
|
|
L(bk_write_44bytes):
|
|
movq 36(%eax), %xmm0
|
|
movq %xmm0, 36(%edx)
|
|
L(bk_write_36bytes):
|
|
movq 28(%eax), %xmm0
|
|
movq %xmm0, 28(%edx)
|
|
L(bk_write_28bytes):
|
|
movq 20(%eax), %xmm0
|
|
movq %xmm0, 20(%edx)
|
|
L(bk_write_20bytes):
|
|
movq 12(%eax), %xmm0
|
|
movq %xmm0, 12(%edx)
|
|
L(bk_write_12bytes):
|
|
movq 4(%eax), %xmm0
|
|
movq %xmm0, 4(%edx)
|
|
L(bk_write_4bytes):
|
|
movl (%eax), %ecx
|
|
movl %ecx, (%edx)
|
|
L(bk_write_0bytes):
|
|
# ifndef USE_AS_BCOPY
|
|
movl DEST(%esp), %eax
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl LEN(%esp), %ecx
|
|
add %ecx, %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(bk_write_40bytes):
|
|
movq 32(%eax), %xmm0
|
|
movq %xmm0, 32(%edx)
|
|
L(bk_write_32bytes):
|
|
movq 24(%eax), %xmm0
|
|
movq %xmm0, 24(%edx)
|
|
L(bk_write_24bytes):
|
|
movq 16(%eax), %xmm0
|
|
movq %xmm0, 16(%edx)
|
|
L(bk_write_16bytes):
|
|
movq 8(%eax), %xmm0
|
|
movq %xmm0, 8(%edx)
|
|
L(bk_write_8bytes):
|
|
movq (%eax), %xmm0
|
|
movq %xmm0, (%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
movl DEST(%esp), %eax
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl LEN(%esp), %ecx
|
|
add %ecx, %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(bk_write_45bytes):
|
|
movq 37(%eax), %xmm0
|
|
movq %xmm0, 37(%edx)
|
|
L(bk_write_37bytes):
|
|
movq 29(%eax), %xmm0
|
|
movq %xmm0, 29(%edx)
|
|
L(bk_write_29bytes):
|
|
movq 21(%eax), %xmm0
|
|
movq %xmm0, 21(%edx)
|
|
L(bk_write_21bytes):
|
|
movq 13(%eax), %xmm0
|
|
movq %xmm0, 13(%edx)
|
|
L(bk_write_13bytes):
|
|
movq 5(%eax), %xmm0
|
|
movq %xmm0, 5(%edx)
|
|
L(bk_write_5bytes):
|
|
movl 1(%eax), %ecx
|
|
movl %ecx, 1(%edx)
|
|
L(bk_write_1bytes):
|
|
movzbl (%eax), %ecx
|
|
movb %cl, (%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
movl DEST(%esp), %eax
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl LEN(%esp), %ecx
|
|
add %ecx, %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(bk_write_41bytes):
|
|
movq 33(%eax), %xmm0
|
|
movq %xmm0, 33(%edx)
|
|
L(bk_write_33bytes):
|
|
movq 25(%eax), %xmm0
|
|
movq %xmm0, 25(%edx)
|
|
L(bk_write_25bytes):
|
|
movq 17(%eax), %xmm0
|
|
movq %xmm0, 17(%edx)
|
|
L(bk_write_17bytes):
|
|
movq 9(%eax), %xmm0
|
|
movq %xmm0, 9(%edx)
|
|
L(bk_write_9bytes):
|
|
movq 1(%eax), %xmm0
|
|
movq %xmm0, 1(%edx)
|
|
movzbl (%eax), %ecx
|
|
movb %cl, (%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
movl DEST(%esp), %eax
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl LEN(%esp), %ecx
|
|
add %ecx, %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(bk_write_46bytes):
|
|
movq 38(%eax), %xmm0
|
|
movq %xmm0, 38(%edx)
|
|
L(bk_write_38bytes):
|
|
movq 30(%eax), %xmm0
|
|
movq %xmm0, 30(%edx)
|
|
L(bk_write_30bytes):
|
|
movq 22(%eax), %xmm0
|
|
movq %xmm0, 22(%edx)
|
|
L(bk_write_22bytes):
|
|
movq 14(%eax), %xmm0
|
|
movq %xmm0, 14(%edx)
|
|
L(bk_write_14bytes):
|
|
movq 6(%eax), %xmm0
|
|
movq %xmm0, 6(%edx)
|
|
L(bk_write_6bytes):
|
|
movl 2(%eax), %ecx
|
|
movl %ecx, 2(%edx)
|
|
movzwl (%eax), %ecx
|
|
movw %cx, (%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
movl DEST(%esp), %eax
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl LEN(%esp), %ecx
|
|
add %ecx, %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(bk_write_42bytes):
|
|
movq 34(%eax), %xmm0
|
|
movq %xmm0, 34(%edx)
|
|
L(bk_write_34bytes):
|
|
movq 26(%eax), %xmm0
|
|
movq %xmm0, 26(%edx)
|
|
L(bk_write_26bytes):
|
|
movq 18(%eax), %xmm0
|
|
movq %xmm0, 18(%edx)
|
|
L(bk_write_18bytes):
|
|
movq 10(%eax), %xmm0
|
|
movq %xmm0, 10(%edx)
|
|
L(bk_write_10bytes):
|
|
movq 2(%eax), %xmm0
|
|
movq %xmm0, 2(%edx)
|
|
L(bk_write_2bytes):
|
|
movzwl (%eax), %ecx
|
|
movw %cx, (%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
movl DEST(%esp), %eax
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl LEN(%esp), %ecx
|
|
add %ecx, %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(bk_write_47bytes):
|
|
movq 39(%eax), %xmm0
|
|
movq %xmm0, 39(%edx)
|
|
L(bk_write_39bytes):
|
|
movq 31(%eax), %xmm0
|
|
movq %xmm0, 31(%edx)
|
|
L(bk_write_31bytes):
|
|
movq 23(%eax), %xmm0
|
|
movq %xmm0, 23(%edx)
|
|
L(bk_write_23bytes):
|
|
movq 15(%eax), %xmm0
|
|
movq %xmm0, 15(%edx)
|
|
L(bk_write_15bytes):
|
|
movq 7(%eax), %xmm0
|
|
movq %xmm0, 7(%edx)
|
|
L(bk_write_7bytes):
|
|
movl 3(%eax), %ecx
|
|
movl %ecx, 3(%edx)
|
|
movzwl 1(%eax), %ecx
|
|
movw %cx, 1(%edx)
|
|
movzbl (%eax), %eax
|
|
movb %al, (%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
movl DEST(%esp), %eax
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl LEN(%esp), %ecx
|
|
add %ecx, %eax
|
|
# endif
|
|
# endif
|
|
RETURN
|
|
|
|
.p2align 4
|
|
L(bk_write_43bytes):
|
|
movq 35(%eax), %xmm0
|
|
movq %xmm0, 35(%edx)
|
|
L(bk_write_35bytes):
|
|
movq 27(%eax), %xmm0
|
|
movq %xmm0, 27(%edx)
|
|
L(bk_write_27bytes):
|
|
movq 19(%eax), %xmm0
|
|
movq %xmm0, 19(%edx)
|
|
L(bk_write_19bytes):
|
|
movq 11(%eax), %xmm0
|
|
movq %xmm0, 11(%edx)
|
|
L(bk_write_11bytes):
|
|
movq 3(%eax), %xmm0
|
|
movq %xmm0, 3(%edx)
|
|
L(bk_write_3bytes):
|
|
movzwl 1(%eax), %ecx
|
|
movw %cx, 1(%edx)
|
|
movzbl (%eax), %eax
|
|
movb %al, (%edx)
|
|
# ifndef USE_AS_BCOPY
|
|
movl DEST(%esp), %eax
|
|
# ifdef USE_AS_MEMPCPY
|
|
movl LEN(%esp), %ecx
|
|
add %ecx, %eax
|
|
# endif
|
|
# endif
|
|
RETURN_END
|
|
|
|
|
|
.pushsection .rodata.ssse3,"a",@progbits
|
|
.p2align 2
|
|
L(table_48bytes_fwd):
|
|
.int JMPTBL (L(fwd_write_0bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_1bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_2bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_3bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_4bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_5bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_6bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_7bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_8bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_9bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_10bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_11bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_12bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_13bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_14bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_15bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_16bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_17bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_18bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_19bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_20bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_21bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_22bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_23bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_24bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_25bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_26bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_27bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_28bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_29bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_30bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_31bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_32bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_33bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_34bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_35bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_36bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_37bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_38bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_39bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_40bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_41bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_42bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_43bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_44bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_45bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_46bytes), L(table_48bytes_fwd))
|
|
.int JMPTBL (L(fwd_write_47bytes), L(table_48bytes_fwd))
|
|
|
|
.p2align 2
|
|
L(table_48bytes_fwd_align):
|
|
.int JMPTBL (L(fwd_write_0bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_1bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_2bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_3bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_4bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_5bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_6bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_7bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_8bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_9bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_10bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_11bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_12bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_13bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_14bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_15bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_16bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_17bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_18bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_19bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_20bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_21bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_22bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_23bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_24bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_25bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_26bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_27bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_28bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_29bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_30bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_31bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_32bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_33bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_34bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_35bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_36bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_37bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_38bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_39bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_40bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_41bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_42bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_43bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_44bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_45bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_46bytes_align), L(table_48bytes_fwd_align))
|
|
.int JMPTBL (L(fwd_write_47bytes_align), L(table_48bytes_fwd_align))
|
|
|
|
.p2align 2
|
|
L(shl_table):
|
|
.int JMPTBL (L(shl_0), L(shl_table))
|
|
.int JMPTBL (L(shl_1), L(shl_table))
|
|
.int JMPTBL (L(shl_2), L(shl_table))
|
|
.int JMPTBL (L(shl_3), L(shl_table))
|
|
.int JMPTBL (L(shl_4), L(shl_table))
|
|
.int JMPTBL (L(shl_5), L(shl_table))
|
|
.int JMPTBL (L(shl_6), L(shl_table))
|
|
.int JMPTBL (L(shl_7), L(shl_table))
|
|
.int JMPTBL (L(shl_8), L(shl_table))
|
|
.int JMPTBL (L(shl_9), L(shl_table))
|
|
.int JMPTBL (L(shl_10), L(shl_table))
|
|
.int JMPTBL (L(shl_11), L(shl_table))
|
|
.int JMPTBL (L(shl_12), L(shl_table))
|
|
.int JMPTBL (L(shl_13), L(shl_table))
|
|
.int JMPTBL (L(shl_14), L(shl_table))
|
|
.int JMPTBL (L(shl_15), L(shl_table))
|
|
|
|
.p2align 2
|
|
L(table_48_bytes_bwd):
|
|
.int JMPTBL (L(bk_write_0bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_1bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_2bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_3bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_4bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_5bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_6bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_7bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_8bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_9bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_10bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_11bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_12bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_13bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_14bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_15bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_16bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_17bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_18bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_19bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_20bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_21bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_22bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_23bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_24bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_25bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_26bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_27bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_28bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_29bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_30bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_31bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_32bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_33bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_34bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_35bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_36bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_37bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_38bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_39bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_40bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_41bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_42bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_43bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_44bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_45bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_46bytes), L(table_48_bytes_bwd))
|
|
.int JMPTBL (L(bk_write_47bytes), L(table_48_bytes_bwd))
|
|
|
|
.popsection
|
|
|
|
# ifdef USE_AS_MEMMOVE
|
|
.p2align 4
|
|
L(copy_backward):
|
|
PUSH (%edi)
|
|
movl %eax, %edi
|
|
lea (%ecx,%edx,1),%edx
|
|
lea (%ecx,%edi,1),%edi
|
|
testl $0x3, %edx
|
|
jnz L(bk_align)
|
|
|
|
L(bk_aligned_4):
|
|
cmp $64, %ecx
|
|
jae L(bk_write_more64bytes)
|
|
|
|
L(bk_write_64bytesless):
|
|
cmp $32, %ecx
|
|
jb L(bk_write_less32bytes)
|
|
|
|
L(bk_write_more32bytes):
|
|
/* Copy 32 bytes at a time. */
|
|
sub $32, %ecx
|
|
movq -8(%edi), %xmm0
|
|
movq %xmm0, -8(%edx)
|
|
movq -16(%edi), %xmm0
|
|
movq %xmm0, -16(%edx)
|
|
movq -24(%edi), %xmm0
|
|
movq %xmm0, -24(%edx)
|
|
movq -32(%edi), %xmm0
|
|
movq %xmm0, -32(%edx)
|
|
sub $32, %edx
|
|
sub $32, %edi
|
|
|
|
L(bk_write_less32bytes):
|
|
movl %edi, %eax
|
|
sub %ecx, %edx
|
|
sub %ecx, %eax
|
|
POP (%edi)
|
|
L(bk_write_less32bytes_2):
|
|
BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4)
|
|
|
|
CFI_PUSH (%edi)
|
|
|
|
.p2align 4
|
|
L(bk_align):
|
|
cmp $8, %ecx
|
|
jbe L(bk_write_less32bytes)
|
|
testl $1, %edx
|
|
/* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0,
|
|
then (EDX & 2) must be != 0. */
|
|
jz L(bk_got2)
|
|
sub $1, %edi
|
|
sub $1, %ecx
|
|
sub $1, %edx
|
|
movzbl (%edi), %eax
|
|
movb %al, (%edx)
|
|
|
|
testl $2, %edx
|
|
jz L(bk_aligned_4)
|
|
|
|
L(bk_got2):
|
|
sub $2, %edi
|
|
sub $2, %ecx
|
|
sub $2, %edx
|
|
movzwl (%edi), %eax
|
|
movw %ax, (%edx)
|
|
jmp L(bk_aligned_4)
|
|
|
|
.p2align 4
|
|
L(bk_write_more64bytes):
|
|
/* Check alignment of last byte. */
|
|
testl $15, %edx
|
|
jz L(bk_ssse3_cpy_pre)
|
|
|
|
/* EDX is aligned 4 bytes, but not 16 bytes. */
|
|
L(bk_ssse3_align):
|
|
sub $4, %edi
|
|
sub $4, %ecx
|
|
sub $4, %edx
|
|
movl (%edi), %eax
|
|
movl %eax, (%edx)
|
|
|
|
testl $15, %edx
|
|
jz L(bk_ssse3_cpy_pre)
|
|
|
|
sub $4, %edi
|
|
sub $4, %ecx
|
|
sub $4, %edx
|
|
movl (%edi), %eax
|
|
movl %eax, (%edx)
|
|
|
|
testl $15, %edx
|
|
jz L(bk_ssse3_cpy_pre)
|
|
|
|
sub $4, %edi
|
|
sub $4, %ecx
|
|
sub $4, %edx
|
|
movl (%edi), %eax
|
|
movl %eax, (%edx)
|
|
|
|
L(bk_ssse3_cpy_pre):
|
|
cmp $64, %ecx
|
|
jb L(bk_write_more32bytes)
|
|
|
|
.p2align 4
|
|
L(bk_ssse3_cpy):
|
|
sub $64, %edi
|
|
sub $64, %ecx
|
|
sub $64, %edx
|
|
movdqu 0x30(%edi), %xmm3
|
|
movdqa %xmm3, 0x30(%edx)
|
|
movdqu 0x20(%edi), %xmm2
|
|
movdqa %xmm2, 0x20(%edx)
|
|
movdqu 0x10(%edi), %xmm1
|
|
movdqa %xmm1, 0x10(%edx)
|
|
movdqu (%edi), %xmm0
|
|
movdqa %xmm0, (%edx)
|
|
cmp $64, %ecx
|
|
jae L(bk_ssse3_cpy)
|
|
jmp L(bk_write_64bytesless)
|
|
|
|
# endif
|
|
|
|
END (MEMCPY)
|
|
|
|
#endif
|