powerpc: Replace lxvd2x/stxvd2x with lvx/stvx in P7's memcpy/memmove
POWER9 DD2.1 and earlier have an issue where some cache-inhibited
vector loads trap to the kernel, causing a performance degradation.  To
handle this in memcpy and memmove, lvx/stvx is used for aligned
addresses instead of lxvd2x/stxvd2x.

Reference: https://patchwork.ozlabs.org/patch/814059/

	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
	lxvd2x/stxvd2x with lvx/stvx.
	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.vnet.ibm.com>
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
commit 63da5cd4a0
parent a122dbfb2e
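For readers unfamiliar with the two instruction families, a minimal sketch of the substitution follows (illustration only, not part of the patch; src and dst stand for the source and destination pointer registers used in memcpy.S):

	/* VSX form: tolerates any alignment, but a cache-inhibited access
	   may trap to the kernel on POWER9 DD2.1 and earlier.  */
	lxvd2x	6,0,src		/* load 16 bytes from src */
	stxvd2x	6,0,dst		/* store 16 bytes to dst */

	/* VMX form used instead: lvx/stvx ignore the low 4 bits of the
	   effective address, so they are a drop-in replacement only when
	   the addresses are 16-byte aligned, which is why the patch
	   changes only the aligned-copy paths.  */
	lvx	6,0,src		/* load 16 bytes from src */
	stvx	6,0,dst		/* store 16 bytes to dst */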
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,9 @@
+2017-10-25  Rajalakshmi Srinivasaraghavan  <raji@linux.vnet.ibm.com>
+
+	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Replace
+	lxvd2x/stxvd2x with lvx/stvx.
+	* sysdeps/powerpc/powerpc64/power7/memmove.S: Likewise.
+
 2017-10-25  H.J. Lu  <hongjiu.lu@intel.com>
 
 	* include/alloc_buffer.h: Replace "if if " with "if " in
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@@ -91,63 +91,63 @@ L(aligned_copy):
 	srdi	12,cnt,7
 	cmpdi	12,0
 	beq	L(aligned_tail)
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 	mtctr	12
 	b	L(aligned_128loop)
 
 	.align	4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 L(aligned_128loop):
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
-	stxvd2x	6,0,dst
+	lvx	8,src,7
+	lvx	9,src,8
+	stvx	6,0,dst
 	addi	src,src,64
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
+	lvx	6,0,src
+	lvx	7,src,6
 	addi	dst,dst,64
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
+	lvx	8,src,7
+	lvx	9,src,8
 	addi	src,src,64
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
 	addi	dst,dst,64
 	bdnz	L(aligned_128head)
 
 L(aligned_tail):
 	mtocrf	0x01,cnt
 	bf	25,32f
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
-	lxvd2x	8,src,7
-	lxvd2x	9,src,8
+	lvx	6,0,src
+	lvx	7,src,6
+	lvx	8,src,7
+	lvx	9,src,8
 	addi	src,src,64
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
-	stxvd2x	8,dst,7
-	stxvd2x	9,dst,8
+	stvx	6,0,dst
+	stvx	7,dst,6
+	stvx	8,dst,7
+	stvx	9,dst,8
 	addi	dst,dst,64
 32:
 	bf	26,16f
-	lxvd2x	6,0,src
-	lxvd2x	7,src,6
+	lvx	6,0,src
+	lvx	7,src,6
 	addi	src,src,32
-	stxvd2x	6,0,dst
-	stxvd2x	7,dst,6
+	stvx	6,0,dst
+	stvx	7,dst,6
 	addi	dst,dst,32
 16:
 	bf	27,8f
-	lxvd2x	6,0,src
+	lvx	6,0,src
 	addi	src,src,16
-	stxvd2x	6,0,dst
+	stvx	6,0,dst
 	addi	dst,dst,16
 8:
 	bf	28,4f
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -92,63 +92,63 @@ L(aligned_copy):
 	srdi	12,r5,7
 	cmpdi	12,0
 	beq	L(aligned_tail)
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 	mtctr	12
 	b	L(aligned_128loop)
 
 	.align	4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 L(aligned_128loop):
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
-	stxvd2x	6,0,r11
+	lvx	8,r4,7
+	lvx	9,r4,8
+	stvx	6,0,r11
 	addi	r4,r4,64
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
+	lvx	6,0,r4
+	lvx	7,r4,6
 	addi	r11,r11,64
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
+	lvx	8,r4,7
+	lvx	9,r4,8
 	addi	r4,r4,64
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
 	addi	r11,r11,64
 	bdnz	L(aligned_128head)
 
 L(aligned_tail):
 	mtocrf	0x01,r5
 	bf	25,32f
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
-	lxvd2x	8,r4,7
-	lxvd2x	9,r4,8
+	lvx	6,0,r4
+	lvx	7,r4,6
+	lvx	8,r4,7
+	lvx	9,r4,8
 	addi	r4,r4,64
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
-	stxvd2x	8,r11,7
-	stxvd2x	9,r11,8
+	stvx	6,0,r11
+	stvx	7,r11,6
+	stvx	8,r11,7
+	stvx	9,r11,8
 	addi	r11,r11,64
 32:
 	bf	26,16f
-	lxvd2x	6,0,r4
-	lxvd2x	7,r4,6
+	lvx	6,0,r4
+	lvx	7,r4,6
 	addi	r4,r4,32
-	stxvd2x	6,0,r11
-	stxvd2x	7,r11,6
+	stvx	6,0,r11
+	stvx	7,r11,6
 	addi	r11,r11,32
 16:
 	bf	27,8f
-	lxvd2x	6,0,r4
+	lvx	6,0,r4
 	addi	r4,r4,16
-	stxvd2x	6,0,r11
+	stvx	6,0,r11
 	addi	r11,r11,16
 8:
 	bf	28,4f
@@ -488,63 +488,63 @@ L(aligned_copy_bwd):
 	srdi	r12,r5,7
 	cmpdi	r12,0
 	beq	L(aligned_tail_bwd)
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 	mtctr	12
 	b	L(aligned_128loop_bwd)
 
 	.align	4
 L(aligned_128head_bwd):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 L(aligned_128loop_bwd):
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
-	stxvd2x	v6,r11,r6
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
+	stvx	v6,r11,r6
 	subi	r4,r4,64
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,7
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
+	lvx	v6,r4,r6
+	lvx	v7,r4,7
 	subi	r11,r11,64
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
 	subi	r4,r4,64
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
 	subi	r11,r11,64
 	bdnz	L(aligned_128head_bwd)
 
 L(aligned_tail_bwd):
 	mtocrf	0x01,r5
 	bf	25,32f
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
-	lxvd2x	v8,r4,r8
-	lxvd2x	v9,r4,r9
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
+	lvx	v8,r4,r8
+	lvx	v9,r4,r9
 	subi	r4,r4,64
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
-	stxvd2x	v8,r11,r8
-	stxvd2x	v9,r11,r9
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
+	stvx	v8,r11,r8
+	stvx	v9,r11,r9
 	subi	r11,r11,64
 32:
 	bf	26,16f
-	lxvd2x	v6,r4,r6
-	lxvd2x	v7,r4,r7
+	lvx	v6,r4,r6
+	lvx	v7,r4,r7
 	subi	r4,r4,32
-	stxvd2x	v6,r11,r6
-	stxvd2x	v7,r11,r7
+	stvx	v6,r11,r6
+	stvx	v7,r11,r7
 	subi	r11,r11,32
 16:
 	bf	27,8f
-	lxvd2x	v6,r4,r6
+	lvx	v6,r4,r6
 	subi	r4,r4,16
-	stxvd2x	v6,r11,r6
+	stvx	v6,r11,r6
 	subi	r11,r11,16
 8:
 	bf	28,4f