x86_64: Replace AVX512F .byte sequences with instructions
Since binutils 2.25 or later is required to build glibc, we can replace
AVX512F .byte sequences with AVX512F instructions.

Tested on x86-64 and x32. There are no code differences in libmvec.so
and libmvec.a.

	* sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Replace AVX512F
	.byte sequences with AVX512F instructions.
	* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
	* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S:
	Likewise.
	* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
	Likewise.
This commit is contained in:
parent
5a706f649d
commit
b9eaca8fa0
12
ChangeLog
12
ChangeLog
@ -1,3 +1,15 @@
|
||||
2017-08-23 H.J. Lu <hongjiu.lu@intel.com>
|
||||
|
||||
* sysdeps/x86_64/fpu/svml_d_sincos8_core.S: Replace AVX512F
|
||||
.byte sequences with AVX512F instructions.
|
||||
* sysdeps/x86_64/fpu/svml_d_wrapper_impl.h: Likewise.
|
||||
* sysdeps/x86_64/fpu/svml_s_sincosf16_core.S: Likewise.
|
||||
* sysdeps/x86_64/fpu/svml_s_wrapper_impl.h: Likewise.
|
||||
* sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S:
|
||||
Likewise.
|
||||
* sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S:
|
||||
Likewise.
|
||||
|
||||
2017-08-22 Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
Steve Ellcey <sellcey@cavium.com>
|
||||
|
||||
|
@ -599,24 +599,9 @@ libmvec_hidden_def(_ZGVeN8vl8l8_sincos_skx)
|
||||
cfi_def_cfa_register (%rbp)
|
||||
andq $-64, %rsp
|
||||
subq $256, %rsp
|
||||
/* Encoding for vmovups %zmm1, 128(%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x4c
|
||||
.byte 0x24
|
||||
.byte 0x02
|
||||
vmovups %zmm1, 128(%rsp)
|
||||
lea (%rsp), %rdi
|
||||
/* Encoding for vmovups %zmm2, 192(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x57
|
||||
.byte 0x03
|
||||
vmovups %zmm2, 192(%rdi)
|
||||
lea 64(%rsp), %rsi
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
movq 128(%rsp), %rdx
|
||||
|
@ -510,40 +510,11 @@ libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
|
||||
cfi_def_cfa_register (%rbp)
|
||||
andq $-64, %rsp
|
||||
subq $384, %rsp
|
||||
/* Encoding for vmovups %zmm1, 128(%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x4c
|
||||
.byte 0x24
|
||||
.byte 0x02
|
||||
vmovups %zmm1, 128(%rsp)
|
||||
lea (%rsp), %rdi
|
||||
/* Encoding for vmovups %zmm2, 192(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x57
|
||||
.byte 0x03
|
||||
/* Encoding for vmovups %zmm3, 256(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x5f
|
||||
.byte 0x04
|
||||
/* Encoding for vmovups %zmm4, 320(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x67
|
||||
.byte 0x05
|
||||
vmovups %zmm2, 192(%rdi)
|
||||
vmovups %zmm3, 256(%rdi)
|
||||
vmovups %zmm4, 320(%rdi)
|
||||
lea 64(%rsp), %rsi
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
movq 128(%rsp), %rdx
|
||||
@ -661,30 +632,8 @@ libmvec_hidden_def(_ZGVeN16vl4l4_sincosf_skx)
|
||||
leal -112(%rbp), %esi
|
||||
leal -176(%rbp), %edi
|
||||
subl $296, %esp
|
||||
/* Encoding for vmovdqa64 %zmm1, -240(%ebp). */
|
||||
.byte 0x67
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0xfd
|
||||
.byte 0x48
|
||||
.byte 0x7f
|
||||
.byte 0x8d
|
||||
.byte 0x10
|
||||
.byte 0xff
|
||||
.byte 0xff
|
||||
.byte 0xff
|
||||
/* Encoding for vmovdqa64 %zmm2, -304(%ebp). */
|
||||
.byte 0x67
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0xfd
|
||||
.byte 0x48
|
||||
.byte 0x7f
|
||||
.byte 0x95
|
||||
.byte 0xd0
|
||||
.byte 0xfe
|
||||
.byte 0xff
|
||||
.byte 0xff
|
||||
vmovdqa64 %zmm1, -240(%ebp)
|
||||
vmovdqa64 %zmm2, -304(%ebp)
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
movl -240(%ebp), %eax
|
||||
vmovss -176(%ebp), %xmm0
|
||||
|
@ -35,32 +35,10 @@ END (_ZGVeN8vl8l8_sincos)
|
||||
cfi_def_cfa_register (%rbp)
|
||||
andq $-64, %rsp
|
||||
subq $320, %rsp
|
||||
/* Encoding for vmovups %zmm0, 256(%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x44
|
||||
.byte 0x24
|
||||
.byte 0x04
|
||||
vmovups %zmm0, 256(%rsp)
|
||||
lea (%rsp), %rdi
|
||||
/* Encoding for vmovups %zmm1, 128(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x4f
|
||||
.byte 0x02
|
||||
/* Encoding for vmovups %zmm2, 192(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x57
|
||||
.byte 0x03
|
||||
vmovups %zmm1, 128(%rdi)
|
||||
vmovups %zmm2, 192(%rdi)
|
||||
lea 64(%rsp), %rsi
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
vmovdqu 288(%rsp), %ymm0
|
||||
@ -142,18 +120,7 @@ END (_ZGVeN8vl8l8_sincos)
|
||||
subl $280, %esp
|
||||
vmovdqa %ymm1, -208(%ebp)
|
||||
vmovdqa %ymm2, -240(%ebp)
|
||||
/* Encoding for vmovapd %zmm0, -304(%ebp). */
|
||||
.byte 0x67
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0xfd
|
||||
.byte 0x48
|
||||
.byte 0x29
|
||||
.byte 0x85
|
||||
.byte 0xd0
|
||||
.byte 0xfe
|
||||
.byte 0xff
|
||||
.byte 0xff
|
||||
vmovapd %zmm0, -304(%ebp)
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
leal 32(%r12), %esi
|
||||
vmovupd -272(%ebp), %ymm0
|
||||
|
@ -201,29 +201,14 @@
|
||||
cfi_def_cfa_register (%rbp)
|
||||
andq $-64, %rsp
|
||||
subq $128, %rsp
|
||||
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x04
|
||||
.byte 0x24
|
||||
vmovups %zmm0, (%rsp)
|
||||
vmovupd (%rsp), %ymm0
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
vmovupd %ymm0, 64(%rsp)
|
||||
vmovupd 32(%rsp), %ymm0
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
vmovupd %ymm0, 96(%rsp)
|
||||
/* Below is encoding for vmovups 64(%rsp), %zmm0. */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x10
|
||||
.byte 0x44
|
||||
.byte 0x24
|
||||
.byte 0x01
|
||||
vmovups 64(%rsp), %zmm0
|
||||
movq %rbp, %rsp
|
||||
cfi_def_cfa_register (%rsp)
|
||||
popq %rbp
|
||||
@ -241,23 +226,8 @@
|
||||
cfi_def_cfa_register (%rbp)
|
||||
andq $-64, %rsp
|
||||
subq $192, %rsp
|
||||
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x04
|
||||
.byte 0x24
|
||||
/* Below is encoding for vmovups %zmm1, 64(%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x4c
|
||||
.byte 0x24
|
||||
.byte 0x01
|
||||
vmovups %zmm0, (%rsp)
|
||||
vmovups %zmm1, 64(%rsp)
|
||||
vmovupd (%rsp), %ymm0
|
||||
vmovupd 64(%rsp), %ymm1
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
@ -266,15 +236,7 @@
|
||||
vmovupd 96(%rsp), %ymm1
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
vmovupd %ymm0, 160(%rsp)
|
||||
/* Below is encoding for vmovups 128(%rsp), %zmm0. */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x10
|
||||
.byte 0x44
|
||||
.byte 0x24
|
||||
.byte 0x02
|
||||
vmovups 128(%rsp), %zmm0
|
||||
movq %rbp, %rsp
|
||||
cfi_def_cfa_register (%rsp)
|
||||
popq %rbp
|
||||
@ -299,14 +261,7 @@
|
||||
cfi_rel_offset (%r13, 0)
|
||||
subq $176, %rsp
|
||||
movq %rsi, %r13
|
||||
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x04
|
||||
.byte 0x24
|
||||
vmovups %zmm0, (%rsp)
|
||||
movq %rdi, %r12
|
||||
vmovupd (%rsp), %ymm0
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
|
@ -35,48 +35,12 @@ END (_ZGVeN16vl4l4_sincosf)
|
||||
cfi_def_cfa_register (%rbp)
|
||||
andq $-64, %rsp
|
||||
subq $448, %rsp
|
||||
/* Encoding for vmovups %zmm0, 384(%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x44
|
||||
.byte 0x24
|
||||
.byte 0x06
|
||||
vmovups %zmm0, 384(%rsp)
|
||||
lea (%rsp), %rdi
|
||||
/* Encoding for vmovups %zmm1, 128(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x4f
|
||||
.byte 0x02
|
||||
/* Encoding for vmovups %zmm2, 192(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x57
|
||||
.byte 0x03
|
||||
/* Encoding for vmovups %zmm3, 256(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x5f
|
||||
.byte 0x04
|
||||
/* Encoding for vmovups %zmm4, 320(%rdi). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x67
|
||||
.byte 0x05
|
||||
vmovups %zmm1, 128(%rdi)
|
||||
vmovups %zmm2, 192(%rdi)
|
||||
vmovups %zmm3, 256(%rdi)
|
||||
vmovups %zmm4, 320(%rdi)
|
||||
lea 64(%rsp), %rsi
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
vmovdqu 416(%rsp), %ymm0
|
||||
@ -204,42 +168,9 @@ END (_ZGVeN16vl4l4_sincosf)
|
||||
.cfi_escape 0x10,0x3,0x2,0x76,0x68
|
||||
movq %rdi, %rbx
|
||||
subl $344, %esp
|
||||
/* Encoding for vmovdqa64 %zmm1, -240(%ebp). */
|
||||
.byte 0x67
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0xfd
|
||||
.byte 0x48
|
||||
.byte 0x7f
|
||||
.byte 0x8d
|
||||
.byte 0x10
|
||||
.byte 0xff
|
||||
.byte 0xff
|
||||
.byte 0xff
|
||||
/* Encoding for vmovdqa64 %zmm2, -304(%ebp). */
|
||||
.byte 0x67
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0xfd
|
||||
.byte 0x48
|
||||
.byte 0x7f
|
||||
.byte 0x95
|
||||
.byte 0xd0
|
||||
.byte 0xfe
|
||||
.byte 0xff
|
||||
.byte 0xff
|
||||
/* Encoding for vmovaps %zmm0, -368(%ebp). */
|
||||
.byte 0x67
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x29
|
||||
.byte 0x85
|
||||
.byte 0x90
|
||||
.byte 0xfe
|
||||
.byte 0xff
|
||||
.byte 0xff
|
||||
vmovdqa64 %zmm1, -240(%ebp)
|
||||
vmovdqa64 %zmm2, -304(%ebp)
|
||||
vmovaps %zmm0, -368(%ebp)
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
leal 32(%r12), %esi
|
||||
vmovups -336(%ebp), %ymm0
|
||||
|
@ -246,29 +246,14 @@
|
||||
cfi_def_cfa_register (%rbp)
|
||||
andq $-64, %rsp
|
||||
subq $128, %rsp
|
||||
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x04
|
||||
.byte 0x24
|
||||
vmovups %zmm0, (%rsp)
|
||||
vmovupd (%rsp), %ymm0
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
vmovupd %ymm0, 64(%rsp)
|
||||
vmovupd 32(%rsp), %ymm0
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
vmovupd %ymm0, 96(%rsp)
|
||||
/* Below is encoding for vmovups 64(%rsp), %zmm0. */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x10
|
||||
.byte 0x44
|
||||
.byte 0x24
|
||||
.byte 0x01
|
||||
vmovups 64(%rsp), %zmm0
|
||||
movq %rbp, %rsp
|
||||
cfi_def_cfa_register (%rsp)
|
||||
popq %rbp
|
||||
@ -286,23 +271,8 @@
|
||||
cfi_def_cfa_register (%rbp)
|
||||
andq $-64, %rsp
|
||||
subq $192, %rsp
|
||||
/* Below is encoding for vmovups %zmm0, (%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x04
|
||||
.byte 0x24
|
||||
/* Below is encoding for vmovups %zmm1, 64(%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x11
|
||||
.byte 0x4c
|
||||
.byte 0x24
|
||||
.byte 0x01
|
||||
vmovups %zmm0, (%rsp)
|
||||
vmovups %zmm1, 64(%rsp)
|
||||
vmovups (%rsp), %ymm0
|
||||
vmovups 64(%rsp), %ymm1
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
@ -311,15 +281,7 @@
|
||||
vmovups 96(%rsp), %ymm1
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
vmovups %ymm0, 160(%rsp)
|
||||
/* Below is encoding for vmovups 128(%rsp), %zmm0. */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x10
|
||||
.byte 0x44
|
||||
.byte 0x24
|
||||
.byte 0x02
|
||||
vmovups 128(%rsp), %zmm0
|
||||
movq %rbp, %rsp
|
||||
cfi_def_cfa_register (%rsp)
|
||||
popq %rbp
|
||||
@ -340,14 +302,7 @@
|
||||
pushq %r13
|
||||
subq $176, %rsp
|
||||
movq %rsi, %r13
|
||||
/* Below is encoding for vmovaps %zmm0, (%rsp). */
|
||||
.byte 0x62
|
||||
.byte 0xf1
|
||||
.byte 0x7c
|
||||
.byte 0x48
|
||||
.byte 0x29
|
||||
.byte 0x04
|
||||
.byte 0x24
|
||||
vmovaps %zmm0, (%rsp)
|
||||
movq %rdi, %r12
|
||||
vmovaps (%rsp), %ymm0
|
||||
call HIDDEN_JUMPTARGET(\callee)
|
||||
|
Loading…
x
Reference in New Issue
Block a user