powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove

POWER9 DD2.1 and earlier have an issue where some cache-inhibited
vector loads trap to the kernel, causing a performance degradation.  To
handle this in memcpy and memmove, lvx/stvx is used for aligned
addresses instead of lxvd2x/stxvd2x.

Reference: https://patchwork.ozlabs.org/patch/814059/

	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
	lxvd2x/stxvd2x with lvx/stvx.
	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Author: Rajalakshmi Srinivasaraghavan <raji@linux.vnet.ibm.com>
Date: 2017-10-25 13:13:53 -02:00
Committed by: Tulio Magno Quites Machado Filho
parent a122dbfb2e
commit 63da5cd4a0
3 changed files with 102 additions and 96 deletions
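
The two instruction pairs are not interchangeable in general: lxvd2x/stxvd2x
are VSX loads/stores that accept any effective address and can target all of
VSR0-VSR63, while lvx/stvx are VMX instructions that ignore the low four bits
of the effective address and reach only VR0-VR31 (VSR32-VSR63).  The swap is
therefore confined to the 16-byte-aligned paths below, and because every load
is replaced together with its matching store form, the element-ordering
difference between the two families does not change the copied bytes.  A
minimal sketch of the substitution for a single aligned 16-byte block
(illustrative only, not taken from the patch; the choice of r4, r5 and vector
register 0 is arbitrary):

	/* Before: VSX copy of one 16-byte block, src in r4, dst in r5.
	   Any effective address is accepted.  */
	lxvd2x	0,0,r4		/* Load 16 bytes into VSR0.  */
	stxvd2x	0,0,r5		/* Store VSR0 to the destination.  */

	/* After: VMX copy of the same block.  The low 4 bits of each
	   effective address are ignored, so r4 and r5 must already be
	   16-byte aligned, as on the paths changed here.  */
	lvx	0,0,r4		/* Load 16 bytes into VR0 (VSR32).  */
	stvx	0,0,r5		/* Store VR0 to the destination.  */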

--- a/ChangeLog
+++ b/ChangeLog

@@ -1,3 +1,9 @@
+2017-10-25  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
+	lxvd2x/stxvd2x with lvx/stvx.
+	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
+
 2017-10-25  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* include/alloc_buffer.h: Replace "if if " with "if " in

--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S

@@ -91,63 +91,63 @@ L(aligned_copy):
 	srdi	12,cnt,7
 	cmpdi	12,0
 	beq	L(aligned_tail)
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 	mtctr	12
 	b	L(aligned_128loop)
 	.align	4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 L(aligned_128loop):
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
-	stxvd2x	6,0,dst
+	lvx	8,src,7
+	lvx	9,src,8
+	stvx	6,0,dst
 	addi	src,src,64
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	lvx	6,0,src
+	lvx	7,src,6
 	addi	dst,dst,64
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
+	lvx	8,src,7
+	lvx	9,src,8
 	addi	src,src,64
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
 	addi	dst,dst,64
 	bdnz	L(aligned_128head)
 
 L(aligned_tail):
 	mtocrf	0x01,cnt
 	bf	25,32f
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
+	lvx	6,0,src
+	lvx	7,src,6
+	lvx	8,src,7
+	lvx	9,src,8
 	addi	src,src,64
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
 	addi	dst,dst,64
 32:
 	bf	26,16f
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 	addi	src,src,32
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
+	stvx	6,0,dst
+	stvx	7,dst,6
 	addi	dst,dst,32
 16:
 	bf	27,8f
-	lxvd2x	6,0,src
+	lvx	6,0,src
 	addi	src,src,16
-	stxvd2x	6,0,dst
+	stvx	6,0,dst
 	addi	dst,dst,16
 8:
 	bf	28,4f

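For readers following the indexed addressing in the hunk above, here is an
illustrative expansion (not part of the patch) of one 64-byte step of the
aligned loop in its new VMX form; the real loop interleaves two such steps
per 128-byte iteration.  It assumes, as the 16-byte stride pattern suggests,
that GPRs 6, 7 and 8 were preloaded with the byte offsets 16, 32 and 48
earlier in the function; src and dst are the register aliases used throughout
memcpy.S.

	/* One 64-byte step: gather four 16-byte blocks, then store them.
	   EA = (RA|0) + RB, so GPRs 6/7/8 act as 16/32/48-byte offsets.  */
	lvx	6,0,src		/* bytes  0..15 -> VR6 */
	lvx	7,src,6		/* bytes 16..31 -> VR7 */
	lvx	8,src,7		/* bytes 32..47 -> VR8 */
	lvx	9,src,8		/* bytes 48..63 -> VR9 */
	stvx	6,0,dst
	stvx	7,dst,6
	stvx	8,dst,7
	stvx	9,dst,8
	addi	src,src,64	/* Advance both pointers past the block.  */
	addi	dst,dst,64
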
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S

@@ -92,63 +92,63 @@ L(aligned_copy):
 	srdi	12,r5,7
 	cmpdi	12,0
 	beq	L(aligned_tail)
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 	mtctr	12
 	b	L(aligned_128loop)
 	.align	4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 L(aligned_128loop):
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
-	stxvd2x	6,0,r11
+	lvx	8,r4,7
+	lvx	9,r4,8
+	stvx	6,0,r11
 	addi	r4,r4,64
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	lvx	6,0,r4
+	lvx	7,r4,6
 	addi	r11,r11,64
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
+	lvx	8,r4,7
+	lvx	9,r4,8
 	addi	r4,r4,64
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
 	addi	r11,r11,64
 	bdnz	L(aligned_128head)
 
 L(aligned_tail):
 	mtocrf	0x01,r5
 	bf	25,32f
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
+	lvx	6,0,r4
+	lvx	7,r4,6
+	lvx	8,r4,7
+	lvx	9,r4,8
 	addi	r4,r4,64
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
 	addi	r11,r11,64
 32:
 	bf	26,16f
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 	addi	r4,r4,32
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
+	stvx	6,0,r11
+	stvx	7,r11,6
 	addi	r11,r11,32
 16:
 	bf	27,8f
-	lxvd2x	6,0,r4
+	lvx	6,0,r4
 	addi	r4,r4,16
-	stxvd2x	6,0,r11
+	stvx	6,0,r11
 	addi	r11,r11,16
 8:
 	bf	28,4f
@@ -488,63 +488,63 @@ L(aligned_copy_bwd):
 	srdi	r12,r5,7
 	cmpdi	r12,0
 	beq	L(aligned_tail_bwd)
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 	mtctr	12
 	b	L(aligned_128loop_bwd)
 	.align	4
 L(aligned_128head_bwd):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 L(aligned_128loop_bwd):
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
-	stxvd2x	v6,r11,r6
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	stvx	v6,r11,r6
 	subi	r4,r4,64
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,7
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	lvx	v6,r4,r6
+	lvx	v7,r4,7
 	subi	r11,r11,64
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
 	subi	r4,r4,64
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
 	subi	r11,r11,64
 	bdnz	L(aligned_128head_bwd)
 
 L(aligned_tail_bwd):
 	mtocrf	0x01,r5
 	bf	25,32f
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
 	subi	r4,r4,64
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
 	subi	r11,r11,64
 32:
 	bf	26,16f
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 	subi	r4,r4,32
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
 	subi	r11,r11,32
 16:
 	bf	27,8f
-	lxvd2x	v6,r4,r6
+	lvx	v6,r4,r6
 	subi	r4,r4,16
-	stxvd2x	v6,r11,r6
+	stvx	v6,r11,r6
 	subi	r11,r11,16
 8:
 	bf	28,4f