PowerPC LE memcpy

http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html

Little-endian support for memcpy.  I spent some time cleaning up the
64-bit power7 memcpy, in order to avoid the extra alignment traps
power7 takes for little-endian.  It probably would have been better
to copy the linux kernel version of memcpy.

	* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
	* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
	use of regs.  Use power7 mtocrf.  Tidy function tails.
This commit is contained in:
Alan Modra 2013-08-17 18:47:22 +09:30
parent fe6e95d717
commit 759cfef3ac
10 changed files with 937 additions and 406 deletions

View File

@ -1,3 +1,16 @@
2013-10-04 Alan Modra <amodra@gmail.com>
* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better
use of regs. Use power7 mtocrf. Tidy function tails.
2013-10-04 Alan Modra <amodra@gmail.com>
* sysdeps/powerpc/powerpc64/power7/memcmp.S: Add little-endian support.

View File

@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
#ifdef __LITTLE_ENDIAN__
sth 7,0(3)
#else
sth 6,0(3)
#endif
b 7f
.align 4
3:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,24
stb 6,0(3)
sth 7,1(3)
#else
stb 7,0(3)
sth 6,1(3)
#endif
b 7f
.align 4
5:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,8
#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
bf 30,1f
/* there are at least two words to copy, so copy them */
#ifdef __LITTLE_ENDIAN__
srw 0,6,10
slw 8,7,9
#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
#endif
or 0,0,8 /* or them to get word to store */
lwz 6,8(5) /* load the 3rd src word */
stw 0,0(4) /* store the 1st dst word */
#ifdef __LITTLE_ENDIAN__
srw 0,7,10
slw 8,6,9
#else
slw 0,7,10 /* now left align 2nd src word into R0 */
srw 8,6,9 /* shift 3rd src word to right align it in R8 */
#endif
or 0,0,8 /* or them to get word to store */
lwz 7,12(5)
stw 0,4(4) /* store the 2nd dst word */
@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
addi 5,5,16
bf 31,4f
/* there is a third word to copy, so copy it */
#ifdef __LITTLE_ENDIAN__
srw 0,6,10
slw 8,7,9
#else
slw 0,6,10 /* shift 3rd src word to left align it in R0 */
srw 8,7,9 /* shift 4th src word to right align it in R8 */
#endif
or 0,0,8 /* or them to get word to store */
stw 0,0(4) /* store 3rd dst word */
mr 6,7
@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
b 4f
.align 4
1:
#ifdef __LITTLE_ENDIAN__
srw 0,6,10
slw 8,7,9
#else
slw 0,6,10 /* shift 1st src word to left align it in R0 */
srw 8,7,9 /* shift 2nd src word to right align it in R8 */
#endif
addi 5,5,8
or 0,0,8 /* or them to get word to store */
bf 31,4f
@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
.align 4
4:
/* copy 16 bytes at a time */
#ifdef __LITTLE_ENDIAN__
srw 0,6,10
slw 8,7,9
#else
slw 0,6,10
srw 8,7,9
#endif
or 0,0,8
lwz 6,0(5)
stw 0,0(4)
#ifdef __LITTLE_ENDIAN__
srw 0,7,10
slw 8,6,9
#else
slw 0,7,10
srw 8,6,9
#endif
or 0,0,8
lwz 7,4(5)
stw 0,4(4)
#ifdef __LITTLE_ENDIAN__
srw 0,6,10
slw 8,7,9
#else
slw 0,6,10
srw 8,7,9
#endif
or 0,0,8
lwz 6,8(5)
stw 0,8(4)
#ifdef __LITTLE_ENDIAN__
srw 0,7,10
slw 8,6,9
#else
slw 0,7,10
srw 8,6,9
#endif
or 0,0,8
lwz 7,12(5)
stw 0,12(4)
@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
bdnz+ 4b
8:
/* calculate and store the final word */
#ifdef __LITTLE_ENDIAN__
srw 0,6,10
slw 8,7,9
#else
slw 0,6,10
srw 8,7,9
#endif
or 0,0,8
stw 0,0(4)
3:

View File

@ -219,15 +219,28 @@ L(word_unaligned_short):
blt cr6,5f
srwi 7,6,16
bgt cr6,3f
#ifdef __LITTLE_ENDIAN__
sth 7,0(3)
#else
sth 6,0(3)
#endif
b 7f
.align 4
3:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,24
stb 6,0(3)
sth 7,1(3)
#else
stb 7,0(3)
sth 6,1(3)
#endif
b 7f
.align 4
5:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,8
#endif
stb 6,0(3)
7:
cmplwi cr1,10,16
@ -577,7 +590,11 @@ L(wdu1_32):
lwz 6,-1(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
srwi 6,6,8
#else
slwi 6,6,8
#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu1_32tail)
mtctr 8
@ -585,8 +602,12 @@ L(wdu1_32):
lwz 8,3(4)
lwz 7,4(4)
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,24,32
#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
#endif
b L(wdu1_loop32x)
.align 4
L(wdu1_loop32):
@ -595,8 +616,12 @@ L(wdu1_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,24,32
#else
/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
#endif
L(wdu1_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@ -613,7 +638,11 @@ L(wdu1_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
#ifdef __LITTLE_ENDIAN__
srwi 6,8,8
#else
slwi 6,8,8
#endif
bdnz+ L(wdu1_loop32)
stw 10,-8(3)
stw 11,-4(3)
@ -624,8 +653,12 @@ L(wdu1_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,3(4)
/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,24,32
#else
/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
rlwimi 6,8,8,(32-8),31
#endif
b L(wdu_32tailx)
L(wdu2_32):
@ -633,7 +666,11 @@ L(wdu2_32):
lwz 6,-2(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
srwi 6,6,16
#else
slwi 6,6,16
#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu2_32tail)
mtctr 8
@ -641,8 +678,11 @@ L(wdu2_32):
lwz 8,2(4)
lwz 7,4(4)
/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,16,32
#else
rlwimi 6,8,16,(32-16),31
#endif
b L(wdu2_loop32x)
.align 4
L(wdu2_loop32):
@ -651,8 +691,11 @@ L(wdu2_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,16,32
#else
rlwimi 6,8,16,(32-16),31
#endif
L(wdu2_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@ -670,7 +713,11 @@ L(wdu2_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
#ifdef __LITTLE_ENDIAN__
srwi 6,8,16
#else
slwi 6,8,16
#endif
bdnz+ L(wdu2_loop32)
stw 10,-8(3)
stw 11,-4(3)
@ -681,8 +728,11 @@ L(wdu2_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,2(4)
/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,16,32
#else
rlwimi 6,8,16,(32-16),31
#endif
b L(wdu_32tailx)
L(wdu3_32):
@ -690,7 +740,11 @@ L(wdu3_32):
lwz 6,-3(4)
cmplwi cr6,31,4
srwi 8,31,5 /* calculate the 32 byte loop count */
#ifdef __LITTLE_ENDIAN__
srwi 6,6,24
#else
slwi 6,6,24
#endif
clrlwi 31,31,27 /* The remaining bytes, < 32. */
blt cr5,L(wdu3_32tail)
mtctr 8
@ -698,8 +752,11 @@ L(wdu3_32):
lwz 8,1(4)
lwz 7,4(4)
/* Equivalent to: srwi 8,8,32-8; or 6,6,8 */
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,8,32
#else
rlwimi 6,8,24,(32-24),31
#endif
b L(wdu3_loop32x)
.align 4
L(wdu3_loop32):
@ -708,8 +765,11 @@ L(wdu3_loop32):
lwz 7,4(4)
stw 10,-8(3)
stw 11,-4(3)
/* Equivalent to srwi 8,8,32-8; or 6,6,8 */
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,8,32
#else
rlwimi 6,8,24,(32-24),31
#endif
L(wdu3_loop32x):
lwz 10,8(4)
lwz 11,12(4)
@ -726,7 +786,11 @@ L(wdu3_loop32x):
stw 6,16(3)
stw 7,20(3)
addi 3,3,32
#ifdef __LITTLE_ENDIAN__
srwi 6,8,24
#else
slwi 6,8,24
#endif
bdnz+ L(wdu3_loop32)
stw 10,-8(3)
stw 11,-4(3)
@ -737,8 +801,11 @@ L(wdu3_32tail):
blt cr6,L(wdu_4tail)
/* calculate and store the final word */
lwz 8,1(4)
/* Equivalent to: srwi 8,8,32-9; or 6,6,8 */
#ifdef __LITTLE_ENDIAN__
rldimi 6,8,8,32
#else
rlwimi 6,8,24,(32-24),31
#endif
b L(wdu_32tailx)
.align 4
L(wdu_32tailx):

View File

@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
/* SRC is not quadword aligned, get it aligned. */
/* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
#ifdef __LITTLE_ENDIAN__
lvsr 5,0,12
#else
lvsl 5,0,12
#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop . */
lvx 4,12,6
#ifdef __LITTLE_ENDIAN__
vperm 6,4,3,5
#else
vperm 6,3,4,5
#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@ -461,11 +469,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
vperm 6,3,4,5 /* Merge the correctly-aligned portions
of vr3/vr4 into vr6. */
#ifdef __LITTLE_ENDIAN__
vperm 6,4,3,5
#else
vperm 6,3,4,5
#endif
lvx 3,11,7 /* vr3 = r11+32. */
vperm 10,4,3,5 /* Merge the correctly-aligned portions
of vr3/vr4 into vr10. */
#ifdef __LITTLE_ENDIAN__
vperm 10,3,4,5
#else
vperm 10,4,3,5
#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6

View File

@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):
beq L(copy_GE_32_unaligned_cont)
/* SRC is not quadword aligned, get it aligned. */
/* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmplwi cr6,9,1
#ifdef __LITTLE_ENDIAN__
lvsr 5,0,12
#else
lvsl 5,0,12
#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop . */
lvx 4,12,6
#ifdef __LITTLE_ENDIAN__
vperm 6,4,3,5
#else
vperm 6,3,4,5
#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@ -403,11 +411,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
vperm 6,3,4,5 /* Merge the correctly-aligned portions
of vr3/vr4 into vr6. */
#ifdef __LITTLE_ENDIAN__
vperm 6,4,3,5
#else
vperm 6,3,4,5
#endif
lvx 3,11,7 /* vr3 = r11+32. */
vperm 10,4,3,5 /* Merge the correctly-aligned portions
of vr3/vr4 into vr10. */
#ifdef __LITTLE_ENDIAN__
vperm 10,3,4,5
#else
vperm 10,4,3,5
#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6

View File

@ -212,15 +212,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
#ifdef __LITTLE_ENDIAN__
sth 7,0(3)
#else
sth 6,0(3)
#endif
b 7f
.align 4
3:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,24
stb 6,0(3)
sth 7,1(3)
#else
stb 7,0(3)
sth 6,1(3)
#endif
b 7f
.align 4
5:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,8
#endif
stb 6,0(3)
7:
cmpldi cr1,10,16
@ -328,7 +341,11 @@ EALIGN (memcpy, 5, 0)
ld 7,8(5)
subfic 9,10,64
beq 2f
#ifdef __LITTLE_ENDIAN__
srd 0,6,10
#else
sld 0,6,10
#endif
cmpldi 11,1
mr 6,7
addi 4,4,-8
@ -336,15 +353,25 @@ EALIGN (memcpy, 5, 0)
b 1f
2: addi 5,5,8
.align 4
#ifdef __LITTLE_ENDIAN__
0: srd 0,6,10
sld 8,7,9
#else
0: sld 0,6,10
srd 8,7,9
#endif
cmpldi 11,2
ld 6,8(5)
or 0,0,8
addi 11,11,-2
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srd 0,7,10
1: sld 8,6,9
#else
sld 0,7,10
1: srd 8,6,9
#endif
or 0,0,8
beq 8f
ld 7,16(5)

View File

@ -214,15 +214,28 @@ EALIGN (memcpy, 5, 0)
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
#ifdef __LITTLE_ENDIAN__
sth 7,0(3)
#else
sth 6,0(3)
#endif
b 7f
.align 4
3:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,24
stb 6,0(3)
sth 7,1(3)
#else
stb 7,0(3)
sth 6,1(3)
#endif
b 7f
.align 4
5:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,8
#endif
stb 6,0(3)
7:
cmpldi cr1,10,16
@ -334,13 +347,23 @@ EALIGN (memcpy, 5, 0)
bf 30,1f
/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srd 0,6,10
sld 8,7,9
#else
sld 0,6,10
srd 8,7,9
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srd 0,7,10
sld 8,6,9
#else
sld 0,7,10
srd 8,6,9
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@ -349,8 +372,13 @@ EALIGN (memcpy, 5, 0)
blt cr6,8f /* if total DWs = 3, then bypass loop */
bf 31,4f
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srd 0,6,10
sld 8,7,9
#else
sld 0,6,10
srd 8,7,9
#endif
or 0,0,8
std 0,0(4)
mr 6,7
@ -361,8 +389,13 @@ EALIGN (memcpy, 5, 0)
b 4f
.align 4
1:
#ifdef __LITTLE_ENDIAN__
srd 0,6,10
sld 8,7,9
#else
sld 0,6,10
srd 8,7,9
#endif
addi 5,5,16
or 0,0,8
bf 31,4f
@ -373,23 +406,44 @@ EALIGN (memcpy, 5, 0)
addi 4,4,8
.align 4
/* copy 32 bytes at a time */
4: sld 0,6,10
4:
#ifdef __LITTLE_ENDIAN__
srd 0,6,10
sld 8,7,9
#else
sld 0,6,10
srd 8,7,9
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srd 0,7,10
sld 8,6,9
#else
sld 0,7,10
srd 8,6,9
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
#ifdef __LITTLE_ENDIAN__
srd 0,6,10
sld 8,7,9
#else
sld 0,6,10
srd 8,7,9
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
#ifdef __LITTLE_ENDIAN__
srd 0,7,10
sld 8,6,9
#else
sld 0,7,10
srd 8,6,9
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@ -399,8 +453,13 @@ EALIGN (memcpy, 5, 0)
.align 4
8:
/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
srd 0,6,10
sld 8,7,9
#else
sld 0,6,10
srd 8,7,9
#endif
or 0,0,8
std 0,0(4)
3:

View File

@ -400,15 +400,28 @@ L(das_tail2):
blt cr6,5f
srdi 7,6,16
bgt cr6,3f
#ifdef __LITTLE_ENDIAN__
sth 7,0(3)
#else
sth 6,0(3)
#endif
b 7f
.align 4
3:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,24
stb 6,0(3)
sth 7,1(3)
#else
stb 7,0(3)
sth 6,1(3)
#endif
b 7f
.align 4
5:
#ifdef __LITTLE_ENDIAN__
rotlwi 6,6,8
#endif
stb 6,0(3)
7:
cmpldi cr1,10,16
@ -595,13 +608,24 @@ L(du1_do):
bf 30,L(du1_1dw)
/* there are at least two DWs to copy */
/* FIXME: can combine last shift and "or" into "rldimi" */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 8
sldi 8,7, 64-8
#else
sldi 0,6, 8
srdi 8,7, 64-8
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 8
sldi 8,6, 64-8
#else
sldi 0,7, 8
srdi 8,6, 64-8
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@ -610,8 +634,13 @@ L(du1_do):
blt cr6,L(du1_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du1_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 8
sldi 8,7, 64-8
#else
sldi 0,6, 8
srdi 8,7, 64-8
#endif
or 0,0,8
std 0,0(4)
mr 6,7
@ -622,8 +651,13 @@ L(du1_do):
b L(du1_loop)
.align 4
L(du1_1dw):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 8
sldi 8,7, 64-8
#else
sldi 0,6, 8
srdi 8,7, 64-8
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du1_loop)
@ -635,23 +669,43 @@ L(du1_1dw):
.align 4
/* copy 32 bytes at a time */
L(du1_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 8
sldi 8,7, 64-8
#else
sldi 0,6, 8
srdi 8,7, 64-8
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 8
sldi 8,6, 64-8
#else
sldi 0,7, 8
srdi 8,6, 64-8
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 8
sldi 8,7, 64-8
#else
sldi 0,6, 8
srdi 8,7, 64-8
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 8
sldi 8,6, 64-8
#else
sldi 0,7, 8
srdi 8,6, 64-8
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@ -661,8 +715,13 @@ L(du1_loop):
.align 4
L(du1_fini):
/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 8
sldi 8,7, 64-8
#else
sldi 0,6, 8
srdi 8,7, 64-8
#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@ -672,13 +731,23 @@ L(du2_do):
bf 30,L(du2_1dw)
/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 16
sldi 8,7, 64-16
#else
sldi 0,6, 16
srdi 8,7, 64-16
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 16
sldi 8,6, 64-16
#else
sldi 0,7, 16
srdi 8,6, 64-16
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@ -687,8 +756,13 @@ L(du2_do):
blt cr6,L(du2_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du2_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 16
sldi 8,7, 64-16
#else
sldi 0,6, 16
srdi 8,7, 64-16
#endif
or 0,0,8
std 0,0(4)
mr 6,7
@ -699,8 +773,13 @@ L(du2_do):
b L(du2_loop)
.align 4
L(du2_1dw):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 16
sldi 8,7, 64-16
#else
sldi 0,6, 16
srdi 8,7, 64-16
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du2_loop)
@ -712,23 +791,43 @@ L(du2_1dw):
.align 4
/* copy 32 bytes at a time */
L(du2_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 16
sldi 8,7, 64-16
#else
sldi 0,6, 16
srdi 8,7, 64-16
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 16
sldi 8,6, 64-16
#else
sldi 0,7, 16
srdi 8,6, 64-16
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 16
sldi 8,7, 64-16
#else
sldi 0,6, 16
srdi 8,7, 64-16
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 16
sldi 8,6, 64-16
#else
sldi 0,7, 16
srdi 8,6, 64-16
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@ -738,8 +837,13 @@ L(du2_loop):
.align 4
L(du2_fini):
/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 16
sldi 8,7, 64-16
#else
sldi 0,6, 16
srdi 8,7, 64-16
#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@ -749,13 +853,23 @@ L(du3_do):
bf 30,L(du3_1dw)
/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 24
sldi 8,7, 64-24
#else
sldi 0,6, 24
srdi 8,7, 64-24
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 24
sldi 8,6, 64-24
#else
sldi 0,7, 24
srdi 8,6, 64-24
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@ -764,8 +878,13 @@ L(du3_do):
blt cr6,L(du3_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du3_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 24
sldi 8,7, 64-24
#else
sldi 0,6, 24
srdi 8,7, 64-24
#endif
or 0,0,8
std 0,0(4)
mr 6,7
@ -776,8 +895,13 @@ L(du3_do):
b L(du3_loop)
.align 4
L(du3_1dw):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 24
sldi 8,7, 64-24
#else
sldi 0,6, 24
srdi 8,7, 64-24
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du3_loop)
@ -789,23 +913,43 @@ L(du3_1dw):
.align 4
/* copy 32 bytes at a time */
L(du3_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 24
sldi 8,7, 64-24
#else
sldi 0,6, 24
srdi 8,7, 64-24
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 24
sldi 8,6, 64-24
#else
sldi 0,7, 24
srdi 8,6, 64-24
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 24
sldi 8,7, 64-24
#else
sldi 0,6, 24
srdi 8,7, 64-24
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 24
sldi 8,6, 64-24
#else
sldi 0,7, 24
srdi 8,6, 64-24
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@ -815,8 +959,13 @@ L(du3_loop):
.align 4
L(du3_fini):
/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 24
sldi 8,7, 64-24
#else
sldi 0,6, 24
srdi 8,7, 64-24
#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@ -832,13 +981,23 @@ L(du4_dox):
bf 30,L(du4_1dw)
/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 32
sldi 8,7, 64-32
#else
sldi 0,6, 32
srdi 8,7, 64-32
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 32
sldi 8,6, 64-32
#else
sldi 0,7, 32
srdi 8,6, 64-32
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@ -847,8 +1006,13 @@ L(du4_dox):
blt cr6,L(du4_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du4_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 32
sldi 8,7, 64-32
#else
sldi 0,6, 32
srdi 8,7, 64-32
#endif
or 0,0,8
std 0,0(4)
mr 6,7
@ -859,8 +1023,13 @@ L(du4_dox):
b L(du4_loop)
.align 4
L(du4_1dw):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 32
sldi 8,7, 64-32
#else
sldi 0,6, 32
srdi 8,7, 64-32
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du4_loop)
@ -872,23 +1041,43 @@ L(du4_1dw):
.align 4
/* copy 32 bytes at a time */
L(du4_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 32
sldi 8,7, 64-32
#else
sldi 0,6, 32
srdi 8,7, 64-32
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 32
sldi 8,6, 64-32
#else
sldi 0,7, 32
srdi 8,6, 64-32
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 32
sldi 8,7, 64-32
#else
sldi 0,6, 32
srdi 8,7, 64-32
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 32
sldi 8,6, 64-32
#else
sldi 0,7, 32
srdi 8,6, 64-32
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@ -898,8 +1087,13 @@ L(du4_loop):
.align 4
L(du4_fini):
/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 32
sldi 8,7, 64-32
#else
sldi 0,6, 32
srdi 8,7, 64-32
#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@ -909,13 +1103,23 @@ L(du5_do):
bf 30,L(du5_1dw)
/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 40
sldi 8,7, 64-40
#else
sldi 0,6, 40
srdi 8,7, 64-40
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 40
sldi 8,6, 64-40
#else
sldi 0,7, 40
srdi 8,6, 64-40
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@ -924,8 +1128,13 @@ L(du5_do):
blt cr6,L(du5_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du5_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 40
sldi 8,7, 64-40
#else
sldi 0,6, 40
srdi 8,7, 64-40
#endif
or 0,0,8
std 0,0(4)
mr 6,7
@ -936,8 +1145,13 @@ L(du5_do):
b L(du5_loop)
.align 4
L(du5_1dw):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 40
sldi 8,7, 64-40
#else
sldi 0,6, 40
srdi 8,7, 64-40
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du5_loop)
@ -949,23 +1163,43 @@ L(du5_1dw):
.align 4
/* copy 32 bytes at a time */
L(du5_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 40
sldi 8,7, 64-40
#else
sldi 0,6, 40
srdi 8,7, 64-40
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 40
sldi 8,6, 64-40
#else
sldi 0,7, 40
srdi 8,6, 64-40
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 40
sldi 8,7, 64-40
#else
sldi 0,6, 40
srdi 8,7, 64-40
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 40
sldi 8,6, 64-40
#else
sldi 0,7, 40
srdi 8,6, 64-40
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@ -975,8 +1209,13 @@ L(du5_loop):
.align 4
L(du5_fini):
/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 40
sldi 8,7, 64-40
#else
sldi 0,6, 40
srdi 8,7, 64-40
#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@ -986,13 +1225,23 @@ L(du6_do):
bf 30,L(du6_1dw)
/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 48
sldi 8,6, 64-48
#else
sldi 0,7, 48
srdi 8,6, 64-48
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@ -1001,8 +1250,13 @@ L(du6_do):
blt cr6,L(du6_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du6_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
std 0,0(4)
mr 6,7
@ -1013,8 +1267,13 @@ L(du6_do):
b L(du6_loop)
.align 4
L(du6_1dw):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du6_loop)
@ -1026,23 +1285,43 @@ L(du6_1dw):
.align 4
/* copy 32 bytes at a time */
L(du6_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 48
sldi 8,6, 64-48
#else
sldi 0,7, 48
srdi 8,6, 64-48
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 48
sldi 8,6, 64-48
#else
sldi 0,7, 48
srdi 8,6, 64-48
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@ -1052,8 +1331,13 @@ L(du6_loop):
.align 4
L(du6_fini):
/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 48
sldi 8,7, 64-48
#else
sldi 0,6, 48
srdi 8,7, 64-48
#endif
or 0,0,8
std 0,0(4)
b L(du_done)
@ -1063,13 +1347,23 @@ L(du7_do):
bf 30,L(du7_1dw)
/* there are at least two DWs to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
ld 6,16(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 56
sldi 8,6, 64-56
#else
sldi 0,7, 56
srdi 8,6, 64-56
#endif
or 0,0,8
ld 7,24(5)
std 0,8(4)
@ -1078,8 +1372,13 @@ L(du7_do):
blt cr6,L(du7_fini) /* if total DWs = 3, then bypass loop */
bf 31,L(du7_loop)
/* there is a third DW to copy */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
std 0,0(4)
mr 6,7
@ -1090,8 +1389,13 @@ L(du7_do):
b L(du7_loop)
.align 4
L(du7_1dw):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
addi 5,5,16
or 0,0,8
bf 31,L(du7_loop)
@ -1103,23 +1407,43 @@ L(du7_1dw):
.align 4
/* copy 32 bytes at a time */
L(du7_loop):
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
ld 6,0(5)
std 0,0(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 56
sldi 8,6, 64-56
#else
sldi 0,7, 56
srdi 8,6, 64-56
#endif
or 0,0,8
ld 7,8(5)
std 0,8(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
ld 6,16(5)
std 0,16(4)
#ifdef __LITTLE_ENDIAN__
srdi 0,7, 56
sldi 8,6, 64-56
#else
sldi 0,7, 56
srdi 8,6, 64-56
#endif
or 0,0,8
ld 7,24(5)
std 0,24(4)
@ -1129,8 +1453,13 @@ L(du7_loop):
.align 4
L(du7_fini):
/* calculate and store the final DW */
#ifdef __LITTLE_ENDIAN__
srdi 0,6, 56
sldi 8,7, 64-56
#else
sldi 0,6, 56
srdi 8,7, 64-56
#endif
or 0,0,8
std 0,0(4)
b L(du_done)

View File

@ -23,411 +23,354 @@
/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
Returns 'dst'. */
#define dst 11 /* Use r11 so r3 kept unchanged. */
#define src 4
#define cnt 5
.machine power7
EALIGN (memcpy, 5, 0)
CALL_MCOUNT 3
cmpldi cr1,5,31
cmpldi cr1,cnt,31
neg 0,3
std 3,-16(1)
std 31,-8(1)
cfi_offset(31,-8)
ble cr1, L(copy_LT_32) /* If move < 32 bytes use short move
code. */
andi. 11,3,7 /* Check alignment of DST. */
clrldi 10,4,61 /* Check alignment of SRC. */
#ifdef __LITTLE_ENDIAN__
/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
loop is only used for quadword aligned copies. */
andi. 10,3,15
clrldi 11,4,60
#else
andi. 10,3,7 /* Check alignment of DST. */
clrldi 11,4,61 /* Check alignment of SRC. */
#endif
cmpld cr6,10,11 /* SRC and DST alignments match? */
mr 12,4
mr 31,5
mr dst,3
bne cr6,L(copy_GE_32_unaligned)
beq L(aligned_copy)
srdi 9,5,3 /* Number of full quadwords remaining. */
beq L(copy_GE_32_aligned_cont)
mtocrf 0x01,0
#ifdef __LITTLE_ENDIAN__
clrldi 0,0,60
#else
clrldi 0,0,61
mtcrf 0x01,0
subf 31,0,5
#endif
/* Get the SRC aligned to 8 bytes. */
1: bf 31,2f
lbz 6,0(12)
addi 12,12,1
stb 6,0(3)
addi 3,3,1
2: bf 30,4f
lhz 6,0(12)
addi 12,12,2
sth 6,0(3)
addi 3,3,2
4: bf 29,0f
lwz 6,0(12)
addi 12,12,4
stw 6,0(3)
addi 3,3,4
0:
clrldi 10,12,61 /* Check alignment of SRC again. */
srdi 9,31,3 /* Number of full doublewords remaining. */
L(copy_GE_32_aligned_cont):
clrldi 11,31,61
mtcrf 0x01,9
srdi 8,31,5
cmpldi cr1,9,4
cmpldi cr6,11,0
mr 11,12
/* Copy 1~3 doublewords so the main loop starts
at a multiple of 32 bytes. */
bf 30,1f
ld 6,0(12)
ld 7,8(12)
addi 11,12,16
mtctr 8
std 6,0(3)
std 7,8(3)
addi 10,3,16
bf 31,4f
ld 0,16(12)
std 0,16(3)
blt cr1,3f
addi 11,12,24
addi 10,3,24
b 4f
.align 4
1: /* Copy 1 doubleword and set the counter. */
mr 10,3
mtctr 8
bf 31,4f
ld 6,0(12)
addi 11,12,8
std 6,0(3)
addi 10,3,8
L(aligned_copy):
/* Main aligned copy loop. Copies up to 128-bytes at a time. */
.align 4
/* Get the DST and SRC aligned to 8 bytes (16 for little-endian). */
1:
bf 31,2f
lbz 6,0(src)
addi src,src,1
stb 6,0(dst)
addi dst,dst,1
2:
bf 30,4f
lhz 6,0(src)
addi src,src,2
sth 6,0(dst)
addi dst,dst,2
4:
/* check for any 32-byte or 64-byte lumps that are outside of a
nice 128-byte range. R8 contains the number of 32-byte
lumps, so drop this into the CR, and use the SO/EQ bits to help
handle the 32- or 64- byte lumps. Then handle the rest with an
unrolled 128-bytes-at-a-time copy loop. */
mtocrf 1,8
li 6,16 # 16() index
li 7,32 # 32() index
li 8,48 # 48() index
bf 29,8f
lwz 6,0(src)
addi src,src,4
stw 6,0(dst)
addi dst,dst,4
8:
#ifdef __LITTLE_ENDIAN__
bf 28,16f
ld 6,0(src)
addi src,src,8
std 6,0(dst)
addi dst,dst,8
16:
#endif
subf cnt,0,cnt
L(aligned_32byte):
/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
bns cr7,L(aligned_64byte)
lxvd2x 6,0,11
lxvd2x 7,11,6
addi 11,11,32
stxvd2x 6,0,10
stxvd2x 7,10,6
addi 10,10,32
L(aligned_64byte):
/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
bne cr7,L(aligned_128setup)
lxvd2x 6,0,11
lxvd2x 7,11,6
lxvd2x 8,11,7
lxvd2x 9,11,8
addi 11,11,64
stxvd2x 6,0,10
stxvd2x 7,10,6
stxvd2x 8,10,7
stxvd2x 9,10,8
addi 10,10,64
L(aligned_128setup):
/* Set up for the 128-byte at a time copy loop. */
srdi 8,31,7
cmpdi 8,0 # Any 4x lumps left?
beq 3f # if not, move along.
lxvd2x 6,0,11
lxvd2x 7,11,6
mtctr 8 # otherwise, load the ctr and begin.
li 8,48 # 48() index
/* Main aligned copy loop. Copies 128 bytes at a time. */
L(aligned_copy):
li 6,16
li 7,32
li 8,48
mtocrf 0x02,cnt
srdi 12,cnt,7
cmpdi 12,0
beq L(aligned_tail)
lxvd2x 6,0,src
lxvd2x 7,src,6
mtctr 12
b L(aligned_128loop)
.align 4
L(aligned_128head):
/* for the 2nd + iteration of this loop. */
lxvd2x 6,0,11
lxvd2x 7,11,6
lxvd2x 6,0,src
lxvd2x 7,src,6
L(aligned_128loop):
lxvd2x 8,11,7
lxvd2x 9,11,8
stxvd2x 6,0,10
addi 11,11,64
stxvd2x 7,10,6
stxvd2x 8,10,7
stxvd2x 9,10,8
lxvd2x 6,0,11
lxvd2x 7,11,6
addi 10,10,64
lxvd2x 8,11,7
lxvd2x 9,11,8
addi 11,11,64
stxvd2x 6,0,10
stxvd2x 7,10,6
stxvd2x 8,10,7
stxvd2x 9,10,8
addi 10,10,64
lxvd2x 8,src,7
lxvd2x 9,src,8
stxvd2x 6,0,dst
addi src,src,64
stxvd2x 7,dst,6
stxvd2x 8,dst,7
stxvd2x 9,dst,8
lxvd2x 6,0,src
lxvd2x 7,src,6
addi dst,dst,64
lxvd2x 8,src,7
lxvd2x 9,src,8
addi src,src,64
stxvd2x 6,0,dst
stxvd2x 7,dst,6
stxvd2x 8,dst,7
stxvd2x 9,dst,8
addi dst,dst,64
bdnz L(aligned_128head)
3:
/* Check for tail bytes. */
rldicr 0,31,0,60
mtcrf 0x01,31
beq cr6,0f
.L9:
add 3,3,0
add 12,12,0
/* At this point we have a tail of 0-7 bytes and we know that the
destination is doubleword-aligned. */
4: /* Copy 4 bytes. */
bf 29,2f
lwz 6,0(12)
addi 12,12,4
stw 6,0(3)
addi 3,3,4
2: /* Copy 2 bytes. */
bf 30,1f
lhz 6,0(12)
addi 12,12,2
sth 6,0(3)
addi 3,3,2
1: /* Copy 1 byte. */
bf 31,0f
lbz 6,0(12)
stb 6,0(3)
0: /* Return original DST pointer. */
ld 31,-8(1)
ld 3,-16(1)
L(aligned_tail):
mtocrf 0x01,cnt
bf 25,32f
lxvd2x 6,0,src
lxvd2x 7,src,6
lxvd2x 8,src,7
lxvd2x 9,src,8
addi src,src,64
stxvd2x 6,0,dst
stxvd2x 7,dst,6
stxvd2x 8,dst,7
stxvd2x 9,dst,8
addi dst,dst,64
32:
bf 26,16f
lxvd2x 6,0,src
lxvd2x 7,src,6
addi src,src,32
stxvd2x 6,0,dst
stxvd2x 7,dst,6
addi dst,dst,32
16:
bf 27,8f
lxvd2x 6,0,src
addi src,src,16
stxvd2x 6,0,dst
addi dst,dst,16
8:
bf 28,4f
ld 6,0(src)
addi src,src,8
std 6,0(dst)
addi dst,dst,8
4: /* Copies 4~7 bytes. */
bf 29,L(tail2)
lwz 6,0(src)
stw 6,0(dst)
bf 30,L(tail5)
lhz 7,4(src)
sth 7,4(dst)
bflr 31
lbz 8,6(src)
stb 8,6(dst)
/* Return original DST pointer. */
blr
/* Handle copies of 0~31 bytes. */
/* Handle copies of 0~31 bytes. */
.align 4
L(copy_LT_32):
cmpldi cr6,5,8
mr 12,4
mtcrf 0x01,5
mr dst,3
cmpldi cr6,cnt,8
mtocrf 0x01,cnt
ble cr6,L(copy_LE_8)
/* At least 9 bytes to go. */
neg 8,4
clrrdi 11,4,2
andi. 0,8,3
cmpldi cr1,5,16
mr 10,5
cmpldi cr1,cnt,16
beq L(copy_LT_32_aligned)
/* Force 4-bytes alignment for SRC. */
/* Force 4-byte alignment for SRC. */
mtocrf 0x01,0
subf 10,0,5
2: bf 30,1f
lhz 6,0(12)
addi 12,12,2
sth 6,0(3)
addi 3,3,2
1: bf 31,L(end_4bytes_alignment)
lbz 6,0(12)
addi 12,12,1
stb 6,0(3)
addi 3,3,1
subf cnt,0,cnt
2:
bf 30,1f
lhz 6,0(src)
addi src,src,2
sth 6,0(dst)
addi dst,dst,2
1:
bf 31,L(end_4bytes_alignment)
lbz 6,0(src)
addi src,src,1
stb 6,0(dst)
addi dst,dst,1
.align 4
L(end_4bytes_alignment):
cmpldi cr1,10,16
mtcrf 0x01,10
cmpldi cr1,cnt,16
mtocrf 0x01,cnt
L(copy_LT_32_aligned):
/* At least 6 bytes to go, and SRC is word-aligned. */
blt cr1,8f
/* Copy 16 bytes. */
lwz 6,0(12)
lwz 7,4(12)
stw 6,0(3)
lwz 8,8(12)
stw 7,4(3)
lwz 6,12(12)
addi 12,12,16
stw 8,8(3)
stw 6,12(3)
addi 3,3,16
lwz 6,0(src)
lwz 7,4(src)
stw 6,0(dst)
lwz 8,8(src)
stw 7,4(dst)
lwz 6,12(src)
addi src,src,16
stw 8,8(dst)
stw 6,12(dst)
addi dst,dst,16
8: /* Copy 8 bytes. */
bf 28,4f
bf 28,L(tail4)
lwz 6,0(src)
lwz 7,4(src)
addi src,src,8
stw 6,0(dst)
stw 7,4(dst)
addi dst,dst,8
lwz 6,0(12)
lwz 7,4(12)
addi 12,12,8
stw 6,0(3)
stw 7,4(3)
addi 3,3,8
4: /* Copy 4 bytes. */
bf 29,2f
lwz 6,0(12)
addi 12,12,4
stw 6,0(3)
addi 3,3,4
2: /* Copy 2-3 bytes. */
bf 30,1f
lhz 6,0(12)
sth 6,0(3)
bf 31,0f
lbz 7,2(12)
stb 7,2(3)
ld 3,-16(1)
.align 4
/* Copies 4~7 bytes. */
L(tail4):
bf 29,L(tail2)
lwz 6,0(src)
stw 6,0(dst)
bf 30,L(tail5)
lhz 7,4(src)
sth 7,4(dst)
bflr 31
lbz 8,6(src)
stb 8,6(dst)
/* Return original DST pointer. */
blr
.align 4
1: /* Copy 1 byte. */
bf 31,0f
lbz 6,0(12)
stb 6,0(3)
0: /* Return original DST pointer. */
ld 3,-16(1)
/* Copies 2~3 bytes. */
L(tail2):
bf 30,1f
lhz 6,0(src)
sth 6,0(dst)
bflr 31
lbz 7,2(src)
stb 7,2(dst)
blr
/* Handles copies of 0~8 bytes. */
.align 4
L(tail5):
bflr 31
lbz 6,4(src)
stb 6,4(dst)
blr
.align 4
1:
bflr 31
lbz 6,0(src)
stb 6,0(dst)
/* Return original DST pointer. */
blr
/* Handles copies of 0~8 bytes. */
.align 4
L(copy_LE_8):
bne cr6,4f
bne cr6,L(tail4)
/* Though we could've used ld/std here, they are still
slow for unaligned cases. */
lwz 6,0(4)
lwz 7,4(4)
stw 6,0(3)
stw 7,4(3)
ld 3,-16(1) /* Return original DST pointers. */
lwz 6,0(src)
lwz 7,4(src)
stw 6,0(dst)
stw 7,4(dst)
blr
.align 4
4: /* Copies 4~7 bytes. */
bf 29,2b
lwz 6,0(4)
stw 6,0(3)
bf 30,5f
lhz 7,4(4)
sth 7,4(3)
bf 31,0f
lbz 8,6(4)
stb 8,6(3)
ld 3,-16(1)
blr
.align 4
5: /* Copy 1 byte. */
bf 31,0f
lbz 6,4(4)
stb 6,4(3)
0: /* Return original DST pointer. */
ld 3,-16(1)
blr
/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
SRC is not. Use aligned quadword loads from SRC, shifted to realign
the data, allowing for aligned DST stores. */
.align 4
L(copy_GE_32_unaligned):
clrldi 0,0,60 /* Number of bytes until the 1st
quadword. */
andi. 11,3,15 /* Check alignment of DST (against
quadwords). */
srdi 9,5,4 /* Number of full quadwords remaining. */
clrldi 0,0,60 /* Number of bytes until the 1st dst quadword. */
#ifndef __LITTLE_ENDIAN__
andi. 10,3,15 /* Check alignment of DST (against quadwords). */
#endif
srdi 9,cnt,4 /* Number of full quadwords remaining. */
beq L(copy_GE_32_unaligned_cont)
/* DST is not quadword aligned, get it aligned. */
mtcrf 0x01,0
subf 31,0,5
mtocrf 0x01,0
subf cnt,0,cnt
/* Vector instructions work best when proper alignment (16-bytes)
is present. Move 0~15 bytes as needed to get DST quadword-aligned. */
1: /* Copy 1 byte. */
1:
bf 31,2f
lbz 6,0(12)
addi 12,12,1
stb 6,0(3)
addi 3,3,1
2: /* Copy 2 bytes. */
lbz 6,0(src)
addi src,src,1
stb 6,0(dst)
addi dst,dst,1
2:
bf 30,4f
lhz 6,0(12)
addi 12,12,2
sth 6,0(3)
addi 3,3,2
4: /* Copy 4 bytes. */
lhz 6,0(src)
addi src,src,2
sth 6,0(dst)
addi dst,dst,2
4:
bf 29,8f
lwz 6,0(12)
addi 12,12,4
stw 6,0(3)
addi 3,3,4
8: /* Copy 8 bytes. */
lwz 6,0(src)
addi src,src,4
stw 6,0(dst)
addi dst,dst,4
8:
bf 28,0f
ld 6,0(12)
addi 12,12,8
std 6,0(3)
addi 3,3,8
ld 6,0(src)
addi src,src,8
std 6,0(dst)
addi dst,dst,8
0:
clrldi 10,12,60 /* Check alignment of SRC. */
srdi 9,31,4 /* Number of full quadwords remaining. */
srdi 9,cnt,4 /* Number of full quadwords remaining. */
/* The proper alignment is present, it is OK to copy the bytes now. */
L(copy_GE_32_unaligned_cont):
/* Setup two indexes to speed up the indexed vector operations. */
clrldi 11,31,60
clrldi 10,cnt,60
li 6,16 /* Index for 16-bytes offsets. */
li 7,32 /* Index for 32-bytes offsets. */
cmpldi cr1,11,0
srdi 8,31,5 /* Setup the loop counter. */
mr 10,3
mr 11,12
mtcrf 0x01,9
cmpldi cr1,10,0
srdi 8,cnt,5 /* Setup the loop counter. */
mtocrf 0x01,9
cmpldi cr6,9,1
lvsl 5,0,12
lvx 3,0,12
#ifdef __LITTLE_ENDIAN__
lvsr 5,0,src
#else
lvsl 5,0,src
#endif
lvx 3,0,src
li 0,0
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop . */
lvx 4,12,6
/* Copy another 16 bytes to align to 32-bytes due to the loop. */
lvx 4,src,6
#ifdef __LITTLE_ENDIAN__
vperm 6,4,3,5
#else
vperm 6,3,4,5
addi 11,12,16
addi 10,3,16
stvx 6,0,3
#endif
addi src,src,16
stvx 6,0,dst
addi dst,dst,16
vor 3,4,4
clrrdi 0,src,60
L(setup_unaligned_loop):
mtctr 8
@ -442,62 +385,55 @@ L(unaligned_loop):
some portions again. This is faster than having unaligned
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
vperm 6,3,4,5 /* Merge the correctly-aligned portions
of vr3/vr4 into vr6. */
lvx 3,11,7 /* vr3 = r11+32. */
vperm 10,4,3,5 /* Merge the correctly-aligned portions
of vr3/vr4 into vr10. */
addi 11,11,32
stvx 6,0,10
stvx 10,10,6
addi 10,10,32
/* One iteration moves 32 bytes.  r6 and r7 hold the 16 and 32 byte
   indexes set up before the loop, vr5 is the permute control produced
   by lvsl (lvsr on little-endian), and vr3 carries the quadword loaded
   at the current src by the setup code or the previous iteration. */
/* vr4 = quadword at src+16. */
lvx 4,src,6
/* vr6 = first realigned 16 bytes, merged from vr3 and vr4.  The two
   source operands are swapped on little-endian to go with the lvsr
   control vector. */
#ifdef __LITTLE_ENDIAN__
vperm 6,4,3,5
#else
vperm 6,3,4,5
#endif
/* vr3 = quadword at src+32; it is also the carry-in for the next
   iteration once src has been advanced. */
lvx 3,src,7
/* vr10 = second realigned 16 bytes, merged from vr4 and vr3 (operands
   swapped on little-endian as above). */
#ifdef __LITTLE_ENDIAN__
vperm 10,3,4,5
#else
vperm 10,4,3,5
#endif
addi src,src,32
/* Store both realigned quadwords at dst and dst+16. */
stvx 6,0,dst
stvx 10,dst,6
addi dst,dst,32
/* CTR was loaded from r8 (cnt shifted right by 5) before the loop. */
bdnz L(unaligned_loop)
clrrdi 0,src,60
.align 4
L(end_unaligned_loop):
/* Check for tail bytes. */
rldicr 0,31,0,59
mtcrf 0x01,31
beq cr1,0f
mtocrf 0x01,cnt
beqlr cr1
add 3,3,0
add 12,12,0
add src,src,0
/* We have 1~15 tail bytes to copy, and DST is quadword aligned. */
8: /* Copy 8 bytes. */
/* Copy 8 bytes. */
bf 28,4f
lwz 6,0(12)
lwz 7,4(12)
addi 12,12,8
stw 6,0(3)
stw 7,4(3)
addi 3,3,8
4: /* Copy 4 bytes. */
bf 29,2f
lwz 6,0(12)
addi 12,12,4
stw 6,0(3)
addi 3,3,4
2: /* Copy 2~3 bytes. */
bf 30,1f
lhz 6,0(12)
addi 12,12,2
sth 6,0(3)
addi 3,3,2
1: /* Copy 1 byte. */
bf 31,0f
lbz 6,0(12)
stb 6,0(3)
0: /* Return original DST pointer. */
ld 31,-8(1)
ld 3,-16(1)
lwz 6,0(src)
lwz 7,4(src)
addi src,src,8
stw 6,0(dst)
stw 7,4(dst)
addi dst,dst,8
4: /* Copy 4~7 bytes. */
bf 29,L(tail2)
lwz 6,0(src)
stw 6,0(dst)
bf 30,L(tail5)
lhz 7,4(src)
sth 7,4(dst)
bflr 31
lbz 8,6(src)
stb 8,6(dst)
/* Return original DST pointer. */
blr
END_GEN_TB (memcpy,TB_TOCLESS)

View File

@ -365,13 +365,21 @@ L(copy_GE_32_unaligned_cont):
mr 11,12
mtcrf 0x01,9
cmpldi cr6,9,1
#ifdef __LITTLE_ENDIAN__
lvsr 5,0,12
#else
lvsl 5,0,12
#endif
lvx 3,0,12
bf 31,L(setup_unaligned_loop)
/* Copy another 16 bytes to align to 32-bytes due to the loop. */
lvx 4,12,6
#ifdef __LITTLE_ENDIAN__
vperm 6,4,3,5
#else
vperm 6,3,4,5
#endif
addi 11,12,16
addi 10,3,16
stvx 6,0,3
@ -391,11 +399,17 @@ L(unaligned_loop):
vector instructions though. */
lvx 4,11,6 /* vr4 = r11+16. */
vperm 6,3,4,5 /* Merge the correctly-aligned portions
of vr3/vr4 into vr6. */
#ifdef __LITTLE_ENDIAN__
vperm 6,4,3,5
#else
vperm 6,3,4,5
#endif
lvx 3,11,7 /* vr3 = r11+32. */
vperm 10,4,3,5 /* Merge the correctly-aligned portions
of vr3/vr4 into vr10. */
#ifdef __LITTLE_ENDIAN__
vperm 10,3,4,5
#else
vperm 10,4,3,5
#endif
addi 11,11,32
stvx 6,0,10
stvx 10,10,6