PowerPC LE memcpy

http://sourceware.org/ml/libc-alpha/2013-08/msg00103.html Little-endian support for memcpy. I spent some time cleaning up the 64-bit power7 memcpy, in order to avoid the extra alignment traps power7 takes for little-endian. It probably would have been better to copy the linux kernel version of memcpy. * sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support. * sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise. * sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise. * sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise. * sysdeps/powerpc/powerpc64/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise. * sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise. Make better use of regs. Use power7 mtocrf. Tidy function tails.
2013-08-17 18:47:22 +09:30 · 2013-08-17 18:47:22 +09:30 · 759cfef3ac
commit 759cfef3ac
parent fe6e95d717
10 changed files with 937 additions and 406 deletions
--- a/13
+++ b/13
@ -1,3 +1,16 @@
+2013-10-04  Alan Modra  <amodra@gmail.com>
+
+	* sysdeps/powerpc/powerpc32/power4/memcpy.S: Add little endian support.
+	* sysdeps/powerpc/powerpc32/power6/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc32/power7/mempcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power4/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power6/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/memcpy.S: Likewise.
+	* sysdeps/powerpc/powerpc64/power7/mempcpy.S: Likewise.  Make better
+	use of regs.  Use power7 mtocrf.  Tidy function tails.
+
 2013-10-04  Alan Modra  <amodra@gmail.com>

 	* sysdeps/powerpc/powerpc64/power7/memcmp.S: Add little-endian support.
--- a/sysdeps/powerpc/powerpc32/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power4/memcpy.S
@ -203,15 +203,28 @@ EALIGN (memcpy, 5, 0)
    blt   cr6,5f
    srwi  7,6,16
    bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
    sth   6,0(3)
+#endif
    b     7f
    .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
    stb   7,0(3)
    sth   6,1(3)
+#endif
    b     7f
    .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
    stb   6,0(3)
 7:
    cmplwi	cr1,10,16
@ -339,13 +352,23 @@ EALIGN (memcpy, 5, 0)
    bf      30,1f

    /* there are at least two words to copy, so copy them */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
    slw   0,6,10  /* shift 1st src word to left align it in R0 */
    srw   8,7,9   /* shift 2nd src word to right align it in R8 */
+#endif
    or    0,0,8   /* or them to get word to store */
    lwz   6,8(5)  /* load the 3rd src word */
    stw   0,0(4)  /* store the 1st dst word */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
    slw   0,7,10  /* now left align 2nd src word into R0 */
    srw   8,6,9   /* shift 3rd src word to right align it in R8 */
+#endif
    or    0,0,8   /* or them to get word to store */
    lwz   7,12(5)
    stw   0,4(4)  /* store the 2nd dst word */
@ -353,8 +376,13 @@ EALIGN (memcpy, 5, 0)
    addi  5,5,16
    bf    31,4f
    /* there is a third word to copy, so copy it */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
    slw   0,6,10  /* shift 3rd src word to left align it in R0 */
    srw   8,7,9   /* shift 4th src word to right align it in R8 */
+#endif
    or    0,0,8   /* or them to get word to store */
    stw   0,0(4)  /* store 3rd dst word */
    mr    6,7
@ -364,8 +392,13 @@ EALIGN (memcpy, 5, 0)
    b     4f
    .align 4
 1:
+#ifdef __LITTLE_ENDIAN__
+    srw     0,6,10
+    slw     8,7,9
+#else
    slw     0,6,10  /* shift 1st src word to left align it in R0 */
    srw     8,7,9   /* shift 2nd src word to right align it in R8 */
+#endif
    addi  5,5,8
    or    0,0,8   /* or them to get word to store */
    bf    31,4f
@ -378,23 +411,43 @@ EALIGN (memcpy, 5, 0)
    .align  4
 4:
    /* copy 16 bytes at a time */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
    slw   0,6,10
    srw   8,7,9
+#endif
    or    0,0,8
    lwz   6,0(5)
    stw   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
    slw   0,7,10
    srw   8,6,9
+#endif
    or    0,0,8
    lwz   7,4(5)
    stw   0,4(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
    slw   0,6,10
    srw   8,7,9
+#endif
    or    0,0,8
    lwz   6,8(5)
    stw   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srw   0,7,10
+    slw   8,6,9
+#else
    slw   0,7,10
    srw   8,6,9
+#endif
    or    0,0,8
    lwz   7,12(5)
    stw   0,12(4)
@ -403,8 +456,13 @@ EALIGN (memcpy, 5, 0)
    bdnz+ 4b
 8:
    /* calculate and store the final word */
+#ifdef __LITTLE_ENDIAN__
+    srw   0,6,10
+    slw   8,7,9
+#else
    slw   0,6,10
    srw   8,7,9
+#endif
    or    0,0,8
    stw   0,0(4)
 3:
--- a/sysdeps/powerpc/powerpc32/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power6/memcpy.S
@ -219,15 +219,28 @@ L(word_unaligned_short):
    blt   cr6,5f
    srwi  7,6,16
    bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
    sth   6,0(3)
+#endif
    b     7f
    .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
    stb   7,0(3)
    sth   6,1(3)
+#endif
    b     7f
    .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
    stb   6,0(3)
 7:
    cmplwi	cr1,10,16
@ -577,7 +590,11 @@ L(wdu1_32):
    lwz     6,-1(4)
    cmplwi  cr6,31,4
    srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,8
+#else
    slwi    6,6,8
+#endif
    clrlwi  31,31,27   /* The remaining bytes, < 32.  */
    blt     cr5,L(wdu1_32tail)
    mtctr   8
@ -585,8 +602,12 @@ L(wdu1_32):

    lwz   8,3(4)
    lwz   7,4(4)
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
 /*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
    rlwimi 6,8,8,(32-8),31
+#endif
    b      L(wdu1_loop32x)
    .align  4
 L(wdu1_loop32):
@ -595,8 +616,12 @@ L(wdu1_loop32):
    lwz   7,4(4)
    stw   10,-8(3)
    stw   11,-4(3)
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
 /*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
    rlwimi 6,8,8,(32-8),31
+#endif
 L(wdu1_loop32x):
    lwz   10,8(4)
    lwz   11,12(4)
@ -613,7 +638,11 @@ L(wdu1_loop32x):
    stw   6,16(3)
    stw   7,20(3)
    addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,8
+#else
    slwi  6,8,8
+#endif
    bdnz+ L(wdu1_loop32)
    stw   10,-8(3)
    stw   11,-4(3)
@ -624,8 +653,12 @@ L(wdu1_32tail):
    blt     cr6,L(wdu_4tail)
    /* calculate and store the final word */
    lwz   8,3(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,24,32
+#else
+/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8  */
    rlwimi 6,8,8,(32-8),31
+#endif
    b     L(wdu_32tailx)

 L(wdu2_32):
@ -633,7 +666,11 @@ L(wdu2_32):
    lwz     6,-2(4)
    cmplwi  cr6,31,4
    srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,16
+#else
    slwi    6,6,16
+#endif
    clrlwi  31,31,27   /* The remaining bytes, < 32.  */
    blt     cr5,L(wdu2_32tail)
    mtctr   8
@ -641,8 +678,11 @@ L(wdu2_32):

    lwz   8,2(4)
    lwz   7,4(4)
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
    rlwimi 6,8,16,(32-16),31
+#endif
    b      L(wdu2_loop32x)
    .align  4
 L(wdu2_loop32):
@ -651,8 +691,11 @@ L(wdu2_loop32):
    lwz   7,4(4)
    stw   10,-8(3)
    stw   11,-4(3)
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
    rlwimi 6,8,16,(32-16),31
+#endif
 L(wdu2_loop32x):
    lwz   10,8(4)
    lwz   11,12(4)
@ -670,7 +713,11 @@ L(wdu2_loop32x):
    stw   6,16(3)
    stw   7,20(3)
    addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,16
+#else
    slwi  6,8,16
+#endif
    bdnz+ L(wdu2_loop32)
    stw   10,-8(3)
    stw   11,-4(3)
@ -681,8 +728,11 @@ L(wdu2_32tail):
    blt     cr6,L(wdu_4tail)
    /* calculate and store the final word */
    lwz   8,2(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,16,32
+#else
    rlwimi 6,8,16,(32-16),31
+#endif
    b     L(wdu_32tailx)

 L(wdu3_32):
@ -690,7 +740,11 @@ L(wdu3_32):
    lwz     6,-3(4)
    cmplwi  cr6,31,4
    srwi    8,31,5    /* calculate the 32 byte loop count */
+#ifdef __LITTLE_ENDIAN__
+    srwi    6,6,24
+#else
    slwi    6,6,24
+#endif
    clrlwi  31,31,27   /* The remaining bytes, < 32.  */
    blt     cr5,L(wdu3_32tail)
    mtctr   8
@ -698,8 +752,11 @@ L(wdu3_32):

    lwz   8,1(4)
    lwz   7,4(4)
-/*  Equivalent to: srwi   8,8,32-8;  or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
    rlwimi 6,8,24,(32-24),31
+#endif
    b      L(wdu3_loop32x)
    .align  4
 L(wdu3_loop32):
@ -708,8 +765,11 @@ L(wdu3_loop32):
    lwz   7,4(4)
    stw   10,-8(3)
    stw   11,-4(3)
-/*  Equivalent to  srwi   8,8,32-8; or    6,6,8 */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
    rlwimi 6,8,24,(32-24),31
+#endif
 L(wdu3_loop32x):
    lwz   10,8(4)
    lwz   11,12(4)
@ -726,7 +786,11 @@ L(wdu3_loop32x):
    stw   6,16(3)
    stw   7,20(3)
    addi  3,3,32
+#ifdef __LITTLE_ENDIAN__
+    srwi  6,8,24
+#else
    slwi  6,8,24
+#endif
    bdnz+ L(wdu3_loop32)
    stw   10,-8(3)
    stw   11,-4(3)
@ -737,8 +801,11 @@ L(wdu3_32tail):
    blt     cr6,L(wdu_4tail)
    /* calculate and store the final word */
    lwz   8,1(4)
-/*  Equivalent to: srwi   8,8,32-9;  or    6,6,8  */
+#ifdef __LITTLE_ENDIAN__
+    rldimi 6,8,8,32
+#else
    rlwimi 6,8,24,(32-24),31
+#endif
    b     L(wdu_32tailx)
    .align  4
 L(wdu_32tailx):
--- a/sysdeps/powerpc/powerpc32/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/memcpy.S
@ -383,7 +383,7 @@ L(copy_GE_32_unaligned):

 	beq    L(copy_GE_32_unaligned_cont)

-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */

 	mtcrf   0x01,0
 	subf    31,0,5
@ -435,13 +435,21 @@ L(copy_GE_32_unaligned_cont):
 	mr      11,12
 	mtcrf   0x01,9
 	cmplwi  cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
 	lvsl    5,0,12
+#endif
 	lvx     3,0,12
 	bf      31,L(setup_unaligned_loop)

 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx     4,12,6
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
 	vperm   6,3,4,5
+#endif
 	addi    11,12,16
 	addi    10,3,16
 	stvx    6,0,3
@ -461,11 +469,17 @@ L(unaligned_loop):
 	vector instructions though.  */

 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
 	addi    11,11,32
 	stvx    6,0,10
 	stvx    10,10,6
--- a/sysdeps/powerpc/powerpc32/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc32/power7/mempcpy.S
@ -325,7 +325,7 @@ L(copy_GE_32_unaligned):

 	beq	L(copy_GE_32_unaligned_cont)

-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */

 	mtcrf	0x01,0
 	subf	31,0,5
@ -377,13 +377,21 @@ L(copy_GE_32_unaligned_cont):
 	mr	11,12
 	mtcrf	0x01,9
 	cmplwi	cr6,9,1
-	lvsl	5,0,12
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12
+#else
+	lvsl    5,0,12
+#endif
 	lvx	3,0,12
 	bf	31,L(setup_unaligned_loop)

 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx	4,12,6
-	vperm	6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	addi	11,12,16
 	addi	10,3,16
 	stvx	6,0,3
@ -403,11 +411,17 @@ L(unaligned_loop):
 	vector instructions though.  */

 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5
+#else
+	vperm   6,3,4,5
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5
+#else
+	vperm   10,4,3,5
+#endif
 	addi	11,11,32
 	stvx	6,0,10
 	stvx	10,10,6
--- a/sysdeps/powerpc/powerpc64/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/memcpy.S
@ -212,15 +212,28 @@ EALIGN (memcpy, 5, 0)
    blt   cr6,5f
    srdi  7,6,16
    bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
    sth   6,0(3)
+#endif
    b     7f
    .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
    stb   7,0(3)
    sth   6,1(3)
+#endif
    b     7f
    .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
    stb   6,0(3)
 7:
    cmpldi	cr1,10,16
@ -328,7 +341,11 @@ EALIGN (memcpy, 5, 0)
    ld    7,8(5)
    subfic  9,10,64
    beq   2f
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+#else
    sld   0,6,10
+#endif
    cmpldi  11,1
    mr    6,7
    addi  4,4,-8
@ -336,15 +353,25 @@ EALIGN (memcpy, 5, 0)
    b     1f
 2:  addi  5,5,8
    .align  4
+#ifdef __LITTLE_ENDIAN__
+0:  srd   0,6,10
+    sld   8,7,9
+#else
 0:  sld   0,6,10
    srd   8,7,9
+#endif
    cmpldi  11,2
    ld    6,8(5)
    or    0,0,8
    addi  11,11,-2
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+1:  sld   8,6,9
+#else
    sld   0,7,10
 1:  srd   8,6,9
+#endif
    or    0,0,8
    beq   8f
    ld    7,16(5)
--- a/sysdeps/powerpc/powerpc64/power4/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power4/memcpy.S
@ -214,15 +214,28 @@ EALIGN (memcpy, 5, 0)
    blt   cr6,5f
    srdi  7,6,16
    bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
    sth   6,0(3)
+#endif
    b     7f
    .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
    stb   7,0(3)
    sth   6,1(3)
+#endif
    b     7f
    .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
    stb   6,0(3)
 7:
    cmpldi	cr1,10,16
@ -334,13 +347,23 @@ EALIGN (memcpy, 5, 0)
    bf      30,1f

    /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
    sld     0,6,10
    srd     8,7,9
+#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd     0,7,10
+    sld     8,6,9
+#else
    sld     0,7,10
    srd     8,6,9
+#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
@ -349,8 +372,13 @@ EALIGN (memcpy, 5, 0)
    blt     cr6,8f  /* if total DWs = 3, then bypass loop */
    bf      31,4f
    /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
    sld     0,6,10
    srd     8,7,9
+#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
@ -361,8 +389,13 @@ EALIGN (memcpy, 5, 0)
    b       4f
    .align 4
 1:
+#ifdef __LITTLE_ENDIAN__
+    srd     0,6,10
+    sld     8,7,9
+#else
    sld     0,6,10
    srd     8,7,9
+#endif
    addi    5,5,16
    or      0,0,8
    bf      31,4f
@ -373,23 +406,44 @@ EALIGN (memcpy, 5, 0)
    addi    4,4,8
    .align 4
 /* copy 32 bytes at a time */
-4:  sld   0,6,10
+4:
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
+    sld   0,6,10
    srd   8,7,9
+#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+    sld   8,6,9
+#else
    sld   0,7,10
    srd   8,6,9
+#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
    sld   0,6,10
    srd   8,7,9
+#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srd   0,7,10
+    sld   8,6,9
+#else
    sld   0,7,10
    srd   8,6,9
+#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
@ -399,8 +453,13 @@ EALIGN (memcpy, 5, 0)
    .align 4
 8:
    /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srd   0,6,10
+    sld   8,7,9
+#else
    sld   0,6,10
    srd   8,7,9
+#endif
    or    0,0,8
    std   0,0(4)
 3:
--- a/sysdeps/powerpc/powerpc64/power6/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power6/memcpy.S
@ -400,15 +400,28 @@ L(das_tail2):
    blt   cr6,5f
    srdi  7,6,16
    bgt	  cr6,3f
+#ifdef __LITTLE_ENDIAN__
+    sth   7,0(3)
+#else
    sth   6,0(3)
+#endif
    b     7f
    .align  4
 3:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,24
+    stb   6,0(3)
+    sth   7,1(3)
+#else
    stb   7,0(3)
    sth   6,1(3)
+#endif
    b     7f
    .align  4
 5:
+#ifdef __LITTLE_ENDIAN__
+    rotlwi 6,6,8
+#endif
    stb   6,0(3)
 7:
    cmpldi	cr1,10,16
@ -595,13 +608,24 @@ L(du1_do):
    bf      30,L(du1_1dw)

    /* there are at least two DWs to copy */
+    /* FIXME: can combine last shift and "or" into "rldimi" */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
    sldi     0,6, 8
    srdi     8,7, 64-8
+#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 8
+    sldi     8,6, 64-8
+#else
    sldi     0,7, 8
    srdi     8,6, 64-8
+#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
@ -610,8 +634,13 @@ L(du1_do):
    blt     cr6,L(du1_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du1_loop)
    /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
    sldi     0,6, 8
    srdi     8,7, 64-8
+#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
@ -622,8 +651,13 @@ L(du1_do):
    b       L(du1_loop)
    .align 4
 L(du1_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 8
+    sldi     8,7, 64-8
+#else
    sldi     0,6, 8
    srdi     8,7, 64-8
+#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du1_loop)
@ -635,23 +669,43 @@ L(du1_1dw):
    .align 4
 /* copy 32 bytes at a time */
 L(du1_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
    sldi   0,6, 8
    srdi   8,7, 64-8
+#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 8
+    sldi   8,6, 64-8
+#else
    sldi   0,7, 8
    srdi   8,6, 64-8
+#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
    sldi   0,6, 8
    srdi   8,7, 64-8
+#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 8
+    sldi   8,6, 64-8
+#else
    sldi   0,7, 8
    srdi   8,6, 64-8
+#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
@ -661,8 +715,13 @@ L(du1_loop):
    .align 4
 L(du1_fini):
    /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 8
+    sldi   8,7, 64-8
+#else
    sldi   0,6, 8
    srdi   8,7, 64-8
+#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
@ -672,13 +731,23 @@ L(du2_do):
    bf      30,L(du2_1dw)

    /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
    sldi     0,6, 16
    srdi     8,7, 64-16
+#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 16
+    sldi     8,6, 64-16
+#else
    sldi     0,7, 16
    srdi     8,6, 64-16
+#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
@ -687,8 +756,13 @@ L(du2_do):
    blt     cr6,L(du2_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du2_loop)
    /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
    sldi     0,6, 16
    srdi     8,7, 64-16
+#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
@ -699,8 +773,13 @@ L(du2_do):
    b       L(du2_loop)
    .align 4
 L(du2_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 16
+    sldi     8,7, 64-16
+#else
    sldi     0,6, 16
    srdi     8,7, 64-16
+#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du2_loop)
@ -712,23 +791,43 @@ L(du2_1dw):
    .align 4
 /* copy 32 bytes at a time */
 L(du2_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
    sldi   0,6, 16
    srdi   8,7, 64-16
+#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 16
+    sldi   8,6, 64-16
+#else
    sldi   0,7, 16
    srdi   8,6, 64-16
+#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
    sldi   0,6, 16
    srdi   8,7, 64-16
+#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 16
+    sldi   8,6, 64-16
+#else
    sldi   0,7, 16
    srdi   8,6, 64-16
+#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
@ -738,8 +837,13 @@ L(du2_loop):
    .align 4
 L(du2_fini):
    /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 16
+    sldi   8,7, 64-16
+#else
    sldi   0,6, 16
    srdi   8,7, 64-16
+#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
@ -749,13 +853,23 @@ L(du3_do):
    bf      30,L(du3_1dw)

    /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
    sldi     0,6, 24
    srdi     8,7, 64-24
+#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 24
+    sldi     8,6, 64-24
+#else
    sldi     0,7, 24
    srdi     8,6, 64-24
+#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
@ -764,8 +878,13 @@ L(du3_do):
    blt     cr6,L(du3_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du3_loop)
    /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
    sldi     0,6, 24
    srdi     8,7, 64-24
+#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
@ -776,8 +895,13 @@ L(du3_do):
    b       L(du3_loop)
    .align 4
 L(du3_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 24
+    sldi     8,7, 64-24
+#else
    sldi     0,6, 24
    srdi     8,7, 64-24
+#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du3_loop)
@ -789,23 +913,43 @@ L(du3_1dw):
    .align 4
 /* copy 32 bytes at a time */
 L(du3_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
    sldi   0,6, 24
    srdi   8,7, 64-24
+#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 24
+    sldi   8,6, 64-24
+#else
    sldi   0,7, 24
    srdi   8,6, 64-24
+#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
    sldi   0,6, 24
    srdi   8,7, 64-24
+#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 24
+    sldi   8,6, 64-24
+#else
    sldi   0,7, 24
    srdi   8,6, 64-24
+#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
@ -815,8 +959,13 @@ L(du3_loop):
    .align 4
 L(du3_fini):
    /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 24
+    sldi   8,7, 64-24
+#else
    sldi   0,6, 24
    srdi   8,7, 64-24
+#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
@ -832,13 +981,23 @@ L(du4_dox):
    bf      30,L(du4_1dw)

    /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
    sldi     0,6, 32
    srdi     8,7, 64-32
+#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 32
+    sldi     8,6, 64-32
+#else
    sldi     0,7, 32
    srdi     8,6, 64-32
+#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
@ -847,8 +1006,13 @@ L(du4_dox):
    blt     cr6,L(du4_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du4_loop)
    /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
    sldi     0,6, 32
    srdi     8,7, 64-32
+#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
@ -859,8 +1023,13 @@ L(du4_dox):
    b       L(du4_loop)
    .align 4
 L(du4_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 32
+    sldi     8,7, 64-32
+#else
    sldi     0,6, 32
    srdi     8,7, 64-32
+#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du4_loop)
@ -872,23 +1041,43 @@ L(du4_1dw):
    .align 4
 /* copy 32 bytes at a time */
 L(du4_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
    sldi   0,6, 32
    srdi   8,7, 64-32
+#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 32
+    sldi   8,6, 64-32
+#else
    sldi   0,7, 32
    srdi   8,6, 64-32
+#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
    sldi   0,6, 32
    srdi   8,7, 64-32
+#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 32
+    sldi   8,6, 64-32
+#else
    sldi   0,7, 32
    srdi   8,6, 64-32
+#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
@ -898,8 +1087,13 @@ L(du4_loop):
    .align 4
 L(du4_fini):
    /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 32
+    sldi   8,7, 64-32
+#else
    sldi   0,6, 32
    srdi   8,7, 64-32
+#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
@ -909,13 +1103,23 @@ L(du5_do):
    bf      30,L(du5_1dw)

    /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
    sldi     0,6, 40
    srdi     8,7, 64-40
+#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 40
+    sldi     8,6, 64-40
+#else
    sldi     0,7, 40
    srdi     8,6, 64-40
+#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
@ -924,8 +1128,13 @@ L(du5_do):
    blt     cr6,L(du5_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du5_loop)
    /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
    sldi     0,6, 40
    srdi     8,7, 64-40
+#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
@ -936,8 +1145,13 @@ L(du5_do):
    b       L(du5_loop)
    .align 4
 L(du5_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 40
+    sldi     8,7, 64-40
+#else
    sldi     0,6, 40
    srdi     8,7, 64-40
+#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du5_loop)
@ -949,23 +1163,43 @@ L(du5_1dw):
    .align 4
 /* copy 32 bytes at a time */
 L(du5_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
    sldi   0,6, 40
    srdi   8,7, 64-40
+#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 40
+    sldi   8,6, 64-40
+#else
    sldi   0,7, 40
    srdi   8,6, 64-40
+#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
    sldi   0,6, 40
    srdi   8,7, 64-40
+#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 40
+    sldi   8,6, 64-40
+#else
    sldi   0,7, 40
    srdi   8,6, 64-40
+#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
@ -975,8 +1209,13 @@ L(du5_loop):
    .align 4
 L(du5_fini):
    /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 40
+    sldi   8,7, 64-40
+#else
    sldi   0,6, 40
    srdi   8,7, 64-40
+#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
@ -986,13 +1225,23 @@ L(du6_do):
    bf      30,L(du6_1dw)

    /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
    sldi     0,6, 48
    srdi     8,7, 64-48
+#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 48
+    sldi     8,6, 64-48
+#else
    sldi     0,7, 48
    srdi     8,6, 64-48
+#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
@ -1001,8 +1250,13 @@ L(du6_do):
    blt     cr6,L(du6_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du6_loop)
    /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
    sldi     0,6, 48
    srdi     8,7, 64-48
+#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
@ -1013,8 +1267,13 @@ L(du6_do):
    b       L(du6_loop)
    .align 4
 L(du6_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 48
+    sldi     8,7, 64-48
+#else
    sldi     0,6, 48
    srdi     8,7, 64-48
+#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du6_loop)
@ -1026,23 +1285,43 @@ L(du6_1dw):
    .align 4
 /* copy 32 bytes at a time */
 L(du6_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
    sldi   0,6, 48
    srdi   8,7, 64-48
+#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 48
+    sldi   8,6, 64-48
+#else
    sldi   0,7, 48
    srdi   8,6, 64-48
+#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
    sldi   0,6, 48
    srdi   8,7, 64-48
+#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 48
+    sldi   8,6, 64-48
+#else
    sldi   0,7, 48
    srdi   8,6, 64-48
+#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
@ -1052,8 +1331,13 @@ L(du6_loop):
    .align 4
 L(du6_fini):
    /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 48
+    sldi   8,7, 64-48
+#else
    sldi   0,6, 48
    srdi   8,7, 64-48
+#endif
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
@ -1063,13 +1347,23 @@ L(du7_do):
    bf      30,L(du7_1dw)

    /* there are at least two DWs to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
    sldi     0,6, 56
    srdi     8,7, 64-56
+#endif
    or      0,0,8
    ld      6,16(5)
    std     0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,7, 56
+    sldi     8,6, 64-56
+#else
    sldi     0,7, 56
    srdi     8,6, 64-56
+#endif
    or      0,0,8
    ld      7,24(5)
    std     0,8(4)
@ -1078,8 +1372,13 @@ L(du7_do):
    blt     cr6,L(du7_fini)  /* if total DWs = 3, then bypass loop */
    bf      31,L(du7_loop)
    /* there is a third DW to copy */
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
    sldi     0,6, 56
    srdi     8,7, 64-56
+#endif
    or      0,0,8
    std     0,0(4)
    mr      6,7
@ -1090,8 +1389,13 @@ L(du7_do):
    b       L(du7_loop)
    .align 4
 L(du7_1dw):
+#ifdef __LITTLE_ENDIAN__
+    srdi     0,6, 56
+    sldi     8,7, 64-56
+#else
    sldi     0,6, 56
    srdi     8,7, 64-56
+#endif
    addi    5,5,16
    or      0,0,8
    bf      31,L(du7_loop)
@ -1103,23 +1407,43 @@ L(du7_1dw):
    .align 4
 /* copy 32 bytes at a time */
 L(du7_loop):
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
    sldi   0,6, 56
    srdi   8,7, 64-56
+#endif
    or    0,0,8
    ld    6,0(5)
    std   0,0(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 56
+    sldi   8,6, 64-56
+#else
    sldi   0,7, 56
    srdi   8,6, 64-56
+#endif
    or    0,0,8
    ld    7,8(5)
    std   0,8(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,6, 56
+    sldi   8,7, 64-56
+#else
    sldi   0,6, 56
    srdi   8,7, 64-56
+#endif
    or    0,0,8
    ld    6,16(5)
    std   0,16(4)
+#ifdef __LITTLE_ENDIAN__
+    srdi   0,7, 56
+    sldi   8,6, 64-56
+#else
    sldi   0,7, 56
    srdi   8,6, 64-56
+#endif
    or    0,0,8
    ld    7,24(5)
    std   0,24(4)
@ -1129,8 +1453,13 @@ L(du7_loop):
    .align 4
 L(du7_fini):
    /* calculate and store the final DW */
+#ifdef __LITTLE_ENDIAN__	/* LE mirrors the BE shift directions in the #else arm.  */
+    srdi   0,6, 56	/* r0 = r6 >> 56  */
+    sldi   8,7, 64-56	/* r8 = r7 << 8; OR-ed into r0 below.  */
+#else	/* big-endian */
    sldi   0,6, 56
    srdi   8,7, 64-56
+#endif	/* __LITTLE_ENDIAN__ */
    or    0,0,8
    std   0,0(4)
    b     L(du_done)
--- a/sysdeps/powerpc/powerpc64/power7/memcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/memcpy.S
@ -23,418 +23,361 @@
 /* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
   Returns 'dst'.  */

+#define dst 11		/* Use r11 so r3 kept unchanged.  */
+#define src 4
+#define cnt 5
+
 	.machine power7
 EALIGN (memcpy, 5, 0)
 	CALL_MCOUNT 3

-	cmpldi  cr1,5,31
+	cmpldi	cr1,cnt,31
 	neg	0,3
-	std	3,-16(1)
-	std	31,-8(1)
-	cfi_offset(31,-8)
 	ble	cr1, L(copy_LT_32)  /* If move < 32 bytes use short move
 				    code.  */

-	andi.   11,3,7	      /* Check alignment of DST.  */
+#ifdef __LITTLE_ENDIAN__
+/* In little-endian mode, power7 takes an alignment trap on any lxvd2x
+   or stxvd2x crossing a 32-byte boundary, so ensure the aligned_copy
+   loop is only used for quadword aligned copies.  */
+	andi.	10,3,15		/* r10 = DST & 15; record form, so cr0.eq means DST is quadword aligned.  */
+	clrldi	11,4,60		/* r11 = SRC & 15.  */
+#else
+	andi.	10,3,7		/* Check alignment of DST.  */
+	clrldi	11,4,61		/* Check alignment of SRC.  */
+#endif
+	cmpld	cr6,10,11	/* SRC and DST alignments match?  */

-
-	clrldi  10,4,61       /* Check alignment of SRC.  */
-	cmpld   cr6,10,11     /* SRC and DST alignments match?  */
-	mr	12,4
-	mr	31,5
+	mr	dst,3
 	bne	cr6,L(copy_GE_32_unaligned)
+	beq	L(aligned_copy)

-	srdi    9,5,3	      /* Number of full quadwords remaining.  */
+	mtocrf	0x01,0
+#ifdef __LITTLE_ENDIAN__
+	clrldi	0,0,60
+#else
+	clrldi	0,0,61
+#endif

-	beq    L(copy_GE_32_aligned_cont)
-
-	clrldi  0,0,61
-	mtcrf   0x01,0
-	subf    31,0,5
-
-	/* Get the SRC aligned to 8 bytes.  */
-
-1:	bf	31,2f
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-2:	bf      30,4f
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-4:	bf      29,0f
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-0:
-	clrldi  10,12,61      /* Check alignment of SRC again.  */
-	srdi    9,31,3	      /* Number of full doublewords remaining.  */
-
-L(copy_GE_32_aligned_cont):
-
-	clrldi  11,31,61
-	mtcrf   0x01,9
-
-	srdi    8,31,5
-	cmpldi  cr1,9,4
-	cmpldi  cr6,11,0
-	mr	11,12
-
-	/* Copy 1~3 doublewords so the main loop starts
-	at a multiple of 32 bytes.  */
-
-	bf	30,1f
-	ld      6,0(12)
-	ld      7,8(12)
-	addi    11,12,16
-	mtctr   8
-	std     6,0(3)
-	std     7,8(3)
-	addi    10,3,16
-	bf      31,4f
-	ld      0,16(12)
-	std     0,16(3)
-	blt     cr1,3f
-	addi    11,12,24
-	addi    10,3,24
-	b       4f
-
-	.align  4
-1:	/* Copy 1 doubleword and set the counter.  */
-	mr	10,3
-	mtctr   8
-	bf      31,4f
-	ld      6,0(12)
-	addi    11,12,8
-	std     6,0(3)
-	addi    10,3,8
-
-L(aligned_copy):
-	/* Main aligned copy loop. Copies up to 128-bytes at a time. */
-	.align  4
+/* Get the DST and SRC aligned to 8 bytes (16 for little-endian).  */
+1:
+	bf	31,2f
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
+	bf	30,4f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
 4:
-	/* check for any 32-byte or 64-byte lumps that are outside of a
-	   nice 128-byte range.  R8 contains the number of 32-byte
-	   lumps, so drop this into the CR, and use the SO/EQ bits to help
-	   handle the 32- or 64- byte lumps.  Then handle the rest with an
-	   unrolled 128-bytes-at-a-time copy loop. */
-	mtocrf	1,8
-	li	6,16	# 16() index
-	li	7,32	# 32() index
-	li	8,48	# 48() index
+	bf	29,8f
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
+#ifdef __LITTLE_ENDIAN__
+	bf	28,16f
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
+16:
+#endif
+	subf	cnt,0,cnt

-L(aligned_32byte):
-	/* if the SO bit (indicating a 32-byte lump) is not set, move along. */
-	bns	cr7,L(aligned_64byte)
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	addi	11,11,32
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	addi	10,10,32
-
-L(aligned_64byte):
-	/* if the EQ bit (indicating a 64-byte lump) is not set, move along. */
-	bne	cr7,L(aligned_128setup)
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	addi	11,11,64
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	addi	10,10,64
-
-L(aligned_128setup):
-	/* Set up for the 128-byte at a time copy loop.  */
-	srdi	8,31,7
-	cmpdi	8,0	# Any 4x lumps left?
-	beq	3f	# if not, move along.
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	mtctr	8	# otherwise, load the ctr and begin.
-	li	8,48	# 48() index
+/* Main aligned copy loop. Copies 128 bytes at a time. */
+L(aligned_copy):
+	li	6,16		/* r6/r7/r8 = 16/32/48: index registers for the indexed vector ops.  */
+	li	7,32
+	li	8,48
+	mtocrf	0x02,cnt	/* CR field 6 <- cnt bits worth 128/64/32/16; bits 25-27 are tested in L(aligned_tail).  */
+	srdi	12,cnt,7	/* r12 = cnt / 128 = number of 128-byte iterations.  */
+	cmpdi	12,0
+	beq	L(aligned_tail)	/* Fewer than 128 bytes: skip the loop.  */
+	lxvd2x	6,0,src		/* Preload the first 32 bytes; later iterations reload at L(aligned_128head).  */
+	lxvd2x	7,src,6
+	mtctr	12		/* CTR = iteration count for bdnz.  */
 	b	L(aligned_128loop)

+	.align  4
 L(aligned_128head):
 	/* for the 2nd + iteration of this loop. */
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
 L(aligned_128loop):
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	stxvd2x	6,0,10
-	addi	11,11,64
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	lxvd2x	6,0,11
-	lxvd2x	7,11,6
-	addi	10,10,64
-	lxvd2x	8,11,7
-	lxvd2x	9,11,8
-	addi	11,11,64
-	stxvd2x	6,0,10
-	stxvd2x	7,10,6
-	stxvd2x	8,10,7
-	stxvd2x	9,10,8
-	addi	10,10,64
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	stxvd2x	6,0,dst
+	addi	src,src,64
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	lxvd2x	6,0,src
+	lxvd2x	7,src,6
+	addi	dst,dst,64
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	addi	src,src,64
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	addi	dst,dst,64
 	bdnz	L(aligned_128head)

-3:
-	/* Check for tail bytes.  */
-	rldicr  0,31,0,60
-	mtcrf   0x01,31
-	beq	cr6,0f
-
-.L9:
-	add	3,3,0
-	add	12,12,0
-
-	/*  At this point we have a tail of 0-7 bytes and we know that the
-	destination is doubleword-aligned.  */
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
-
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-2:	/* Copy 2 bytes.  */
-	bf	30,1f
-
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-1:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	31,-8(1)
-	ld	3,-16(1)
+L(aligned_tail):
+	mtocrf	0x01,cnt	/* CR field 7 (bits 28-31) <- cnt bits worth 8/4/2/1.  */
+	bf	25,32f		/* CR bit 25 = cnt & 64 (CR field 6 was loaded by mtocrf 0x02,cnt in L(aligned_copy)).  */
+	lxvd2x	6,0,src		/* Copy 64 bytes.  */
+	lxvd2x	7,src,6
+	lxvd2x	8,src,7
+	lxvd2x	9,src,8
+	addi	src,src,64
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	stxvd2x	8,dst,7
+	stxvd2x	9,dst,8
+	addi	dst,dst,64
+32:
+	bf	26,16f		/* CR bit 26 = cnt & 32.  */
+	lxvd2x	6,0,src		/* Copy 32 bytes.  */
+	lxvd2x	7,src,6
+	addi	src,src,32
+	stxvd2x	6,0,dst
+	stxvd2x	7,dst,6
+	addi	dst,dst,32
+16:
+	bf	27,8f		/* CR bit 27 = cnt & 16.  */
+	lxvd2x	6,0,src		/* Copy 16 bytes.  */
+	addi	src,src,16
+	stxvd2x	6,0,dst
+	addi	dst,dst,16
+8:
+	bf	28,4f		/* CR bit 28 = cnt & 8.  */
+	ld	6,0(src)	/* Copy 8 bytes.  */
+	addi	src,src,8
+	std     6,0(dst)
+	addi	dst,dst,8
+4:	/* Copies 4~7 bytes.  */
+	bf	29,L(tail2)	/* CR bit 29 = cnt & 4; if clear, at most 3 bytes remain.  */
+	lwz	6,0(src)
+	stw     6,0(dst)
+	bf      30,L(tail5)	/* CR bit 30 = cnt & 2; if clear, at most one more byte at offset 4.  */
+	lhz     7,4(src)
+	sth     7,4(dst)
+	bflr	31		/* Return unless the odd-byte bit (cnt & 1) is set.  */
+	lbz     8,6(src)
+	stb     8,6(dst)
+	/* Return original DST pointer.  */
 	blr

-	/* Handle copies of 0~31 bytes.  */
-	.align  4
+
+/* Handle copies of 0~31 bytes.  */
+	.align	4
 L(copy_LT_32):
-	cmpldi  cr6,5,8
-	mr	12,4
-	mtcrf   0x01,5
+	mr	dst,3
+	cmpldi	cr6,cnt,8
+	mtocrf	0x01,cnt
 	ble	cr6,L(copy_LE_8)

 	/* At least 9 bytes to go.  */
 	neg	8,4
-	clrrdi  11,4,2
-	andi.   0,8,3
-	cmpldi  cr1,5,16
-	mr	10,5
+	andi.	0,8,3
+	cmpldi	cr1,cnt,16
 	beq	L(copy_LT_32_aligned)

-	/* Force 4-bytes alignment for SRC.  */
-	mtocrf  0x01,0
-	subf    10,0,5
-2:	bf	30,1f
+	/* Force 4-byte alignment for SRC.  */
+	mtocrf	0x01,0
+	subf	cnt,0,cnt
+2:
+	bf	30,1f
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+1:
+	bf	31,L(end_4bytes_alignment)
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1

-	lhz	6,0(12)
-	addi    12,12,2
-	sth	6,0(3)
-	addi    3,3,2
-1:	bf	31,L(end_4bytes_alignment)
-
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-
-	.align  4
+	.align	4
 L(end_4bytes_alignment):
-	cmpldi  cr1,10,16
-	mtcrf   0x01,10
+	cmpldi	cr1,cnt,16
+	mtocrf	0x01,cnt

 L(copy_LT_32_aligned):
 	/* At least 6 bytes to go, and SRC is word-aligned.  */
 	blt	cr1,8f

 	/* Copy 16 bytes.  */
-	lwz	6,0(12)
-	lwz     7,4(12)
-	stw     6,0(3)
-	lwz     8,8(12)
-	stw     7,4(3)
-	lwz     6,12(12)
-	addi    12,12,16
-	stw     8,8(3)
-	stw     6,12(3)
-	addi    3,3,16
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	lwz	8,8(src)
+	stw	7,4(dst)
+	lwz	6,12(src)
+	addi	src,src,16
+	stw	8,8(dst)
+	stw	6,12(dst)
+	addi	dst,dst,16
 8:	/* Copy 8 bytes.  */
-	bf	28,4f
+	bf	28,L(tail4)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8

-	lwz     6,0(12)
-	lwz     7,4(12)
-	addi    12,12,8
-	stw     6,0(3)
-	stw     7,4(3)
-	addi    3,3,8
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
+	.align	4
+/* Copies 4~7 bytes.  */
+L(tail4):
+	bf	29,L(tail2)
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
+	blr

-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-2:	/* Copy 2-3 bytes.  */
+	.align	4
+/* Copies 2~3 bytes.  */
+L(tail2):
 	bf	30,1f
-
-	lhz     6,0(12)
-	sth     6,0(3)
-	bf      31,0f
-	lbz     7,2(12)
-	stb     7,2(3)
-	ld	3,-16(1)
+	lhz	6,0(src)
+	sth	6,0(dst)
+	bflr	31
+	lbz	7,2(src)
+	stb	7,2(dst)
 	blr

-	.align  4
-1:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	3,-16(1)
+	.align	4
+L(tail5):
+	bflr	31
+	lbz	6,4(src)
+	stb	6,4(dst)
 	blr

-	/* Handles copies of 0~8 bytes.  */
-	.align  4
+	.align	4
+1:
+	bflr	31
+	lbz	6,0(src)
+	stb	6,0(dst)
+	/* Return original DST pointer.  */
+	blr
+
+
+/* Handles copies of 0~8 bytes.  */
+	.align	4
 L(copy_LE_8):
-	bne	cr6,4f
+	bne	cr6,L(tail4)

 	/* Though we could've used ld/std here, they are still
 	slow for unaligned cases.  */

-	lwz	6,0(4)
-	lwz     7,4(4)
-	stw     6,0(3)
-	stw     7,4(3)
-	ld      3,-16(1)      /* Return original DST pointers.  */
+	lwz	6,0(src)
+	lwz	7,4(src)
+	stw	6,0(dst)
+	stw	7,4(dst)
 	blr

-	.align  4
-4:	/* Copies 4~7 bytes.  */
-	bf	29,2b

-	lwz	6,0(4)
-	stw     6,0(3)
-	bf      30,5f
-	lhz     7,4(4)
-	sth     7,4(3)
-	bf      31,0f
-	lbz     8,6(4)
-	stb     8,6(3)
-	ld	3,-16(1)
-	blr
-
-	.align  4
-5:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,4(4)
-	stb	6,4(3)
-
-0:	/* Return original DST pointer.  */
-	ld	3,-16(1)
-	blr
-
-	/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
-	SRC is not.  Use aligned quadword loads from SRC, shifted to realign
-	the data, allowing for aligned DST stores.  */
-	.align  4
+/* Handle copies of 32+ bytes where DST is aligned (to quadword) but
+   SRC is not.	Use aligned quadword loads from SRC, shifted to realign
+   the data, allowing for aligned DST stores.  */
+	.align	4
 L(copy_GE_32_unaligned):
-	clrldi  0,0,60	      /* Number of bytes until the 1st
-			      quadword.  */
-	andi.   11,3,15       /* Check alignment of DST (against
-			      quadwords).  */
-	srdi    9,5,4	      /* Number of full quadwords remaining.  */
+	clrldi	0,0,60	      /* Number of bytes until the 1st dst quadword.  */
+#ifndef __LITTLE_ENDIAN__
+	andi.	10,3,15	      /* Check alignment of DST (against quadwords).  */
+#endif
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

 	beq	L(copy_GE_32_unaligned_cont)

-	/* SRC is not quadword aligned, get it aligned.  */
+	/* DST is not quadword aligned, get it aligned.  */

-	mtcrf   0x01,0
-	subf    31,0,5
+	mtocrf	0x01,0
+	subf	cnt,0,cnt

 	/* Vector instructions work best when proper alignment (16-bytes)
 	is present.  Move 0~15 bytes as needed to get DST quadword-aligned.  */
-1:	/* Copy 1 byte.  */
+1:
 	bf	31,2f
-
-	lbz	6,0(12)
-	addi    12,12,1
-	stb	6,0(3)
-	addi    3,3,1
-2:	/* Copy 2 bytes.  */
+	lbz	6,0(src)
+	addi	src,src,1
+	stb	6,0(dst)
+	addi	dst,dst,1
+2:
 	bf	30,4f
-
-	lhz     6,0(12)
-	addi    12,12,2
-	sth     6,0(3)
-	addi    3,3,2
-4:	/* Copy 4 bytes.  */
+	lhz	6,0(src)
+	addi	src,src,2
+	sth	6,0(dst)
+	addi	dst,dst,2
+4:
 	bf	29,8f
-
-	lwz     6,0(12)
-	addi    12,12,4
-	stw     6,0(3)
-	addi    3,3,4
-8:	/* Copy 8 bytes.  */
+	lwz	6,0(src)
+	addi	src,src,4
+	stw	6,0(dst)
+	addi	dst,dst,4
+8:
 	bf	28,0f
-
-	ld	6,0(12)
-	addi    12,12,8
-	std	6,0(3)
-	addi    3,3,8
+	ld	6,0(src)
+	addi	src,src,8
+	std	6,0(dst)
+	addi	dst,dst,8
 0:
-	clrldi  10,12,60      /* Check alignment of SRC.  */
-	srdi    9,31,4	      /* Number of full quadwords remaining.  */
+	srdi	9,cnt,4	      /* Number of full quadwords remaining.  */

 	/* The proper alignment is present, it is OK to copy the bytes now.  */
 L(copy_GE_32_unaligned_cont):

 	/* Setup two indexes to speed up the indexed vector operations.  */
-	clrldi  11,31,60
-	li      6,16	      /* Index for 16-bytes offsets.  */
+	clrldi	10,cnt,60
+	li	6,16	      /* Index for 16-bytes offsets.  */
 	li	7,32	      /* Index for 32-bytes offsets.  */
-	cmpldi  cr1,11,0
-	srdi    8,31,5	      /* Setup the loop counter.  */
-	mr      10,3
-	mr      11,12
-	mtcrf   0x01,9
-	cmpldi  cr6,9,1
-	lvsl    5,0,12
-	lvx     3,0,12
-	bf      31,L(setup_unaligned_loop)
+	cmpldi	cr1,10,0
+	srdi	8,cnt,5	      /* Setup the loop counter.  */
+	mtocrf	0x01,9
+	cmpldi	cr6,9,1
+#ifdef __LITTLE_ENDIAN__
+	lvsr	5,0,src		/* LE: vr5 = permute control from lvsr; paired with swapped vperm sources below.  */
+#else
+	lvsl	5,0,src		/* BE: vr5 = permute control from lvsl.  */
+#endif
+	lvx	3,0,src		/* vr3 = aligned quadword containing src (lvx ignores the low 4 address bits).  */
+	li	0,0		/* NOTE(review): r0 is only consumed by "add src,src,0" in the tail - confirm it is needed.  */
+	bf	31,L(setup_unaligned_loop)	/* CR bit 31 = low bit of r9 (quadword count): even count skips the extra copy.  */
 
-	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
-	lvx     4,12,6
-	vperm   6,3,4,5
-	addi    11,12,16
-	addi    10,3,16
-	stvx    6,0,3
+	/* Copy another 16 bytes to align to 32-bytes due to the loop.  */
+	lvx	4,src,6
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5		/* LE: vr6 = vperm(vr4, vr3, vr5); sources swapped relative to BE.  */
+#else
+	vperm	6,3,4,5		/* BE: vr6 = vperm(vr3, vr4, vr5).  */
+#endif
+	addi	src,src,16	/* src and dst are advanced in place.  */
+	stvx	6,0,dst
+	addi	dst,dst,16
 	vor	3,4,4
+	clrrdi	0,src,60	/* NOTE(review): clears the low 60 bits, so r0 = top 4 bits of src only - confirm intent.  */

 L(setup_unaligned_loop):
-	mtctr   8
-	ble     cr6,L(end_unaligned_loop)
+	mtctr	8
+	ble	cr6,L(end_unaligned_loop)

 	/* Copy 32 bytes at a time using vector instructions.  */
-	.align  4
+	.align	4
 L(unaligned_loop):

 	/* Note: vr6/vr10 may contain data that was already copied,
@ -442,62 +385,53 @@ L(unaligned_loop):
 	some portions again. This is faster than having unaligned
 	vector instructions though.  */

-	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm   6,3,4,5	      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr6.  */
-	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm   10,4,3,5      /* Merge the correctly-aligned portions
-			      of vr3/vr4 into vr10.  */
-	addi    11,11,32
-	stvx    6,0,10
-	stvx    10,10,6
-	addi    10,10,32
-
+	lvx	4,src,6		/* vr4 = aligned quadword at src+r6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm	6,4,3,5		/* LE: merge vr4/vr3 into vr6 (sources swapped vs. BE).  */
+#else
+	vperm	6,3,4,5		/* BE: merge vr3/vr4 into vr6.  */
+#endif
+	lvx	3,src,7		/* vr3 = aligned quadword at src+r7.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm	10,3,4,5	/* LE: merge vr3/vr4 into vr10 (sources swapped vs. BE).  */
+#else
+	vperm	10,4,3,5	/* BE: merge vr4/vr3 into vr10.  */
+#endif
+	addi	src,src,32	/* src and dst are advanced in place each iteration.  */
+	stvx	6,0,dst
+	stvx	10,dst,6
+	addi	dst,dst,32
 	bdnz	L(unaligned_loop)

-	.align  4
+	.align	4
 L(end_unaligned_loop):

 	/* Check for tail bytes.  */
-	rldicr  0,31,0,59
-	mtcrf   0x01,31
-	beq	cr1,0f
+	mtocrf	0x01,cnt	/* CR field 7 <- cnt bits worth 8/4/2/1.  */
+	beqlr	cr1		/* cr1.eq: no tail bytes, return.  */

-	add	3,3,0
-	add	12,12,0
+	/* No pointer fix-up: src and dst were advanced in place as data was copied.  */

 	/*  We have 1~15 tail bytes to copy, and DST is quadword aligned.  */
-8:	/* Copy 8 bytes.  */
+	/* Copy 8 bytes.  */
 	bf	28,4f
-
-	lwz	6,0(12)
-	lwz	7,4(12)
-	addi    12,12,8
-	stw	6,0(3)
-	stw	7,4(3)
-	addi    3,3,8
-4:	/* Copy 4 bytes.  */
-	bf	29,2f
-
-	lwz	6,0(12)
-	addi    12,12,4
-	stw	6,0(3)
-	addi    3,3,4
-2:	/* Copy 2~3 bytes.  */
-	bf	30,1f
-
-	lhz	6,0(12)
-	addi    12,12,2
-	sth	6,0(3)
-	addi    3,3,2
-1:	/* Copy 1 byte.  */
-	bf	31,0f
-
-	lbz	6,0(12)
-	stb	6,0(3)
-0:	/* Return original DST pointer.  */
-	ld	31,-8(1)
-	ld	3,-16(1)
+	lwz	6,0(src)
+	lwz	7,4(src)
+	addi	src,src,8
+	stw	6,0(dst)
+	stw	7,4(dst)
+	addi	dst,dst,8
+4:	/* Copy 4~7 bytes.  */
+	bf	29,L(tail2)	/* cnt & 4 clear: at most 3 bytes remain.  */
+	lwz	6,0(src)
+	stw	6,0(dst)
+	bf	30,L(tail5)	/* cnt & 2 clear: at most one more byte at offset 4.  */
+	lhz	7,4(src)
+	sth	7,4(dst)
+	bflr	31		/* Return unless the odd-byte bit (cnt & 1) is set.  */
+	lbz	8,6(src)
+	stb	8,6(dst)
+	/* Return original DST pointer.  */
 	blr

 END_GEN_TB (memcpy,TB_TOCLESS)
--- a/sysdeps/powerpc/powerpc64/power7/mempcpy.S
+++ b/sysdeps/powerpc/powerpc64/power7/mempcpy.S
@ -365,13 +365,21 @@ L(copy_GE_32_unaligned_cont):
 	mr	11,12
 	mtcrf	0x01,9
 	cmpldi	cr6,9,1
-	lvsl	5,0,12
+#ifdef __LITTLE_ENDIAN__
+	lvsr    5,0,12	/* LE: vr5 = permute control from lvsr; paired with swapped vperm sources below.  */
+#else
+	lvsl    5,0,12	/* BE: vr5 = permute control from lvsl.  */
+#endif
 	lvx	3,0,12
 	bf	31,L(setup_unaligned_loop)
 
 	/* Copy another 16 bytes to align to 32-bytes due to the loop .  */
 	lvx	4,12,6
-	vperm	6,3,4,5
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5	/* LE: vr6 = vperm(vr4, vr3, vr5); sources swapped relative to BE.  */
+#else
+	vperm   6,3,4,5	/* BE: vr6 = vperm(vr3, vr4, vr5).  */
+#endif
 	addi	11,12,16
 	addi	10,3,16
 	stvx	6,0,3
@ -391,11 +399,17 @@ L(unaligned_loop):
 	vector instructions though.  */

 	lvx	4,11,6	      /* vr4 = r11+16.  */
-	vperm	6,3,4,5	      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr6.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   6,4,3,5	/* LE: merge the correctly-aligned portions of vr4/vr3 into vr6.  */
+#else
+	vperm   6,3,4,5	/* BE: merge the correctly-aligned portions of vr3/vr4 into vr6.  */
+#endif
 	lvx	3,11,7	      /* vr3 = r11+32.  */
-	vperm	10,4,3,5      /* Merge the correctly-aligned portions
-				 of vr3/vr4 into vr10.  */
+#ifdef __LITTLE_ENDIAN__
+	vperm   10,3,4,5	/* LE: merge the correctly-aligned portions of vr3/vr4 into vr10.  */
+#else
+	vperm   10,4,3,5	/* BE: merge the correctly-aligned portions of vr4/vr3 into vr10.  */
+#endif
 	addi	11,11,32
 	stvx	6,0,10
 	stvx	10,10,6