Update x86-64 mpn routines from GMP 5.0.1.

This commit is contained in:
Ulrich Drepper 2010-09-02 23:36:25 -07:00
parent ece2984070
commit 0959ffc97b
8 changed files with 482 additions and 187 deletions

View File

@ -1,3 +1,13 @@
2010-09-02 Ulrich Drepper <drepper@redhat.com>
* sysdeps/x86_64/add_n.S: Update from GMP 5.0.1.
* sysdeps/x86_64/addmul_1.S: Likewise.
* sysdeps/x86_64/lshift.S: Likewise.
* sysdeps/x86_64/mul_1.S: Likewise.
* sysdeps/x86_64/rshift.S: Likewise.
* sysdeps/x86_64/sub_n.S: Likewise.
* sysdeps/x86_64/submul_1.S: Likewise.
2010-09-01 Samuel Thibault <samuel.thibault@ens-lyon.org>
This aligns bits/sched.h onto sysdeps/unix/sysv/linux/bits/sched.h:

View File

@ -1,6 +1,6 @@
/* Add two limb vectors of the same length > 0 and store sum in a third
limb vector.
Copyright (C) 2004 Free Software Foundation, Inc.
/* x86-64 __mpn_add_n -- Add two limb vectors of the same length > 0 and store
sum in a third limb vector.
Copyright (C) 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
@ -21,22 +21,81 @@
#include "sysdep.h"
#include "asm-syntax.h"
#define rp %rdi
#define up %rsi
#define vp %rdx
#define n %rcx
#define cy %r8
#ifndef func
# define func __mpn_add_n
# define ADCSBB adc
#endif
.text
ENTRY (__mpn_add_n)
leaq (%rsi,%rcx,8), %rsi
leaq (%rdi,%rcx,8), %rdi
leaq (%rdx,%rcx,8), %rdx
negq %rcx
xorl %eax, %eax # clear cy
.p2align 2
L(loop):
movq (%rsi,%rcx,8), %rax
movq (%rdx,%rcx,8), %r10
adcq %r10, %rax
movq %rax, (%rdi,%rcx,8)
incq %rcx
jne L(loop)
movq %rcx, %rax # zero %rax
adcq %rax, %rax
/* func (rp, up, vp, n): combine the limb vectors {up,n} and {vp,n} limb by
   limb with ADCSBB, store the result at {rp,n} and return the final carry
   (or borrow) flag in %eax.  Register roles come from the rp/up/vp/n
   macros of this file.
   The loop handles four limbs per pass; n mod 4 selects the entry point.
   CF holds the carry chain throughout, so only instructions that leave
   the flags alone (mov, lea, jrcxz, jmp) sit between the ADCSBB steps.  */
ENTRY (func)
xor %r8, %r8 /* carry-in = 0 */
mov (up), %r10 /* preload the first limb of each source */
mov (vp), %r11
lea -8(up,n,8), up /* up -> last source limb */
lea -8(vp,n,8), vp /* vp -> last source limb */
lea -16(rp,n,8), rp /* rp -> second-to-last result limb */
mov %ecx, %eax
neg n /* n = -n, index counts up towards zero */
and $3, %eax /* eax = original n mod 4 */
je L(b00)
add %rax, n /* clear low rcx bits for jrcxz */
cmp $2, %eax
jl L(b01) /* n mod 4 == 1 */
je L(b10) /* n mod 4 == 2 */
/* Fall through for n mod 4 == 3.  The and/add/cmp above changed the flags,
   so every entry stub reloads CF from bit 0 of %r8 (zero) with shr.
   L(b00) and L(b10) also move the preloaded limbs to %r8/%r9, the pair
   their loop entry point consumes.  */
L(b11): shr %r8 /* set cy */
jmp L(e11)
L(b00): shr %r8 /* set cy */
mov %r10, %r8
mov %r11, %r9
lea 4(n), n /* lea keeps CF intact */
jmp L(e00)
L(b01): shr %r8 /* set cy */
jmp L(e01)
L(b10): shr %r8 /* set cy */
mov %r10, %r8
mov %r11, %r9
jmp L(e10)
/* Exit: combine and store the last pending limb pair, then return CF.  */
L(end): ADCSBB %r11, %r10
mov %r10, 8(rp)
mov %ecx, %eax /* clear eax, ecx contains 0 */
adc %eax, %eax /* eax = final CF */
ret
END (__mpn_add_n)
.p2align 4
/* Main loop.  Each step loads the next limb pair into one register pair
   while ADCSBB combines and stores the pair loaded by the previous step.  */
L(top):
mov -24(up,n,8), %r8
mov -24(vp,n,8), %r9
ADCSBB %r11, %r10
mov %r10, -24(rp,n,8)
L(e00):
mov -16(up,n,8), %r10
mov -16(vp,n,8), %r11
ADCSBB %r9, %r8
mov %r8, -16(rp,n,8)
L(e11):
mov -8(up,n,8), %r8
mov -8(vp,n,8), %r9
ADCSBB %r11, %r10
mov %r10, -8(rp,n,8)
L(e10):
mov (up,n,8), %r10
mov (vp,n,8), %r11
ADCSBB %r9, %r8
mov %r8, (rp,n,8)
L(e01):
jrcxz L(end) /* n (%rcx) reached zero: one limb pair left in %r10/%r11 */
lea 4(n), n /* advance by four limbs without touching CF */
jmp L(top)
END (func)

View File

@ -1,6 +1,6 @@
/* AMD64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
/* x86-64 __mpn_addmul_1 -- Multiply a limb vector with a limb and add
the result to a second limb vector.
Copyright (C) 2004 Free Software Foundation, Inc.
Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
@ -21,26 +21,95 @@
#include "sysdep.h"
#include "asm-syntax.h"
#define rp %rdi
#define up %rsi
#define n %rdx
#define v0 %rcx
#ifndef func
# define func __mpn_addmul_1
# define ADDSUB add
#endif
.text
ENTRY (__mpn_addmul_1)
movq %rdx, %r11
leaq (%rsi,%rdx,8), %rsi
leaq (%rdi,%rdx,8), %rdi
negq %r11
xorl %r8d, %r8d
xorl %r10d, %r10d
.p2align 2
L(loop):
movq (%rsi,%r11,8), %rax
mulq %rcx
addq (%rdi,%r11,8), %rax
adcq %r10, %rdx
addq %r8, %rax
movq %r10, %r8
movq %rax, (%rdi,%r11,8)
adcq %rdx, %r8
incq %r11
jne L(loop)
movq %r8, %rax
/* func (rp, up, n, v0): multiply the limb vector {up,n} by the limb v0 and
   ADDSUB the product into {rp,n} in place.  The most significant (carry)
   limb is returned in %rax.  n arrives in %rdx and v0 in %rcx.
   The main loop handles two limbs per pass; the parity of n selects the
   entry point.  lea and mov are used between ADDSUB/adc because they do
   not modify the flags.  */
ENTRY (func)
push %rbx /* %rbx and %rbp are callee-saved; used as scratch below */
push %rbp
lea (%rdx), %rbx /* rbx = n; mul overwrites %rdx */
neg %rbx /* rbx = -n, index counts up towards zero */
mov (up), %rax /* first source limb */
mov (rp), %r10 /* first destination limb */
lea -16(rp,%rdx,8), rp /* rp -> second-to-last destination limb */
lea (up,%rdx,8), up /* up -> just past the last source limb */
mul %rcx /* rdx:rax = up[0] * v0 */
bt $0, %ebx /* CF = low bit of -n, i.e. parity of n */
jc L(odd)
/* Even n: compute the second product before entering the loop.  */
lea (%rax), %r11
mov 8(up,%rbx,8), %rax
lea (%rdx), %rbp
mul %rcx
add $2, %rbx
jns L(n2) /* n == 2 */
lea (%rax), %r8
mov (up,%rbx,8), %rax
lea (%rdx), %r9
jmp L(mid)
L(odd): add $1, %rbx
jns L(n1) /* n == 1 */
lea (%rax), %r8
mov (up,%rbx,8), %rax
lea (%rdx), %r9
mul %rcx
lea (%rax), %r11
mov 8(up,%rbx,8), %rax
lea (%rdx), %rbp
jmp L(e)
.p2align 4
/* Loop: %r8/%r9 and %r11/%rbp alternately hold the low/high halves of the
   two products in flight; %r10 holds the destination limb being updated.  */
L(top): mul %rcx
ADDSUB %r8, %r10
lea (%rax), %r8
mov (up,%rbx,8), %rax
adc %r9, %r11 /* fold high half plus carry/borrow into next low half */
mov %r10, -8(rp,%rbx,8)
mov (rp,%rbx,8), %r10
lea (%rdx), %r9
adc $0, %rbp
L(mid): mul %rcx
ADDSUB %r11, %r10
lea (%rax), %r11
mov 8(up,%rbx,8), %rax
adc %rbp, %r8
mov %r10, (rp,%rbx,8)
mov 8(rp,%rbx,8), %r10
lea (%rdx), %rbp
adc $0, %r9
L(e): add $2, %rbx
js L(top)
/* Wind-down: rbx is zero here, so plain rp offsets address the tail.  */
mul %rcx
ADDSUB %r8, %r10
adc %r9, %r11
mov %r10, -8(rp)
adc $0, %rbp
L(n2): mov (rp), %r10
ADDSUB %r11, %r10
adc %rbp, %rax
mov %r10, (rp)
adc $0, %rdx
L(n1): mov 8(rp), %r10
ADDSUB %rax, %r10
mov %r10, 8(rp)
mov %ebx, %eax /* zero rax */
adc %rdx, %rax /* return value = high product limb + final carry/borrow */
pop %rbp
pop %rbx
ret
END (__mpn_addmul_1)
END (func)

View File

@ -1,5 +1,5 @@
/* AMD64 __mpn_lshift --
Copyright 2004, 2006 Free Software Foundation, Inc.
/* x86-64 __mpn_lshift --
Copyright (C) 2007, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
@ -20,41 +20,98 @@
#include "sysdep.h"
#include "asm-syntax.h"
#define rp %rdi
#define up %rsi
#define n %rdx
#define cnt %cl
.text
/* __mpn_lshift (rp, up, n, cnt): shift the limb vector {up,n} left by cnt
   bits and store the result at {rp,n}.  The bits shifted out of the most
   significant limb are returned in the low bits of %rax.  */
ENTRY (__mpn_lshift)
movq -8(%rsi,%rdx,8), %mm7
movd %ecx, %mm1
movl $64, %eax
subl %ecx, %eax
movd %eax, %mm0
movq %mm7, %mm3
psrlq %mm0, %mm7
movd %mm7, %rax
subq $2, %rdx
jl L(endo)
.p2align 2
L(loop):
movq (%rsi,%rdx,8), %mm6
movq %mm6, %mm2
psrlq %mm0, %mm6
psllq %mm1, %mm3
por %mm6, %mm3
movq %mm3, 8(%rdi,%rdx,8)
je L(ende)
movq -8(%rsi,%rdx,8), %mm7
movq %mm7, %mm3
psrlq %mm0, %mm7
psllq %mm1, %mm2
por %mm7, %mm2
movq %mm2, (%rdi,%rdx,8)
subq $2, %rdx
jge L(loop)
L(endo):
movq %mm3, %mm2
L(ende):
psllq %mm1, %mm2
movq %mm2, (%rdi)
emms
/* Integer (shld) path: limbs are processed from most to least significant,
   four per loop pass.  n mod 4 selects the setup code and loop entry.  */
lea -8(rp,n,8), rp /* rp -> most significant result limb */
lea -8(up,n,8), up /* up -> most significant source limb */
mov %edx, %eax
and $3, %eax /* eax = n mod 4 */
jne L(nb00)
L(b00): /* n = 4, 8, 12, ... */
mov (up), %r10
mov -8(up), %r11
xor %eax, %eax
shld %cl, %r10, %rax /* rax = bits shifted out of the top limb */
mov -16(up), %r8
lea 24(rp), rp
sub $4, n
jmp L(00)
L(nb00):/* n mod 4 != 0 */
cmp $2, %eax
jae L(nb01) /* n mod 4 is 2 or 3 */
/* n = 1, 5, 9, ... */
L(b01): mov (up), %r9
xor %eax, %eax
shld %cl, %r9, %rax /* rax = bits shifted out of the top limb */
sub $2, n
jb L(le1) /* n == 1 */
mov -8(up), %r10
mov -16(up), %r11
lea -8(up), up
lea 16(rp), rp
jmp L(01)
L(le1): shl %cl, %r9 /* single limb: zeros are shifted in */
mov %r9, (rp)
ret
L(nb01):/* n mod 4 is 2 or 3; flags are still those of cmp $2 above */
jne L(b11)
/* n = 2, 6, 10, ... */
L(b10): mov (up), %r8
mov -8(up), %r9
xor %eax, %eax
shld %cl, %r8, %rax /* rax = bits shifted out of the top limb */
sub $3, n
jb L(le2) /* n == 2 */
mov -16(up), %r10
lea -16(up), up
lea 8(rp), rp
jmp L(10)
L(le2): shld %cl, %r9, %r8
mov %r8, (rp)
shl %cl, %r9 /* lowest limb: zeros are shifted in */
mov %r9, -8(rp)
ret
.p2align 4 /* performance critical! */
L(b11): /* n = 3, 7, 11, ... */
mov (up), %r11
mov -8(up), %r8
xor %eax, %eax
shld %cl, %r11, %rax /* rax = bits shifted out of the top limb */
mov -16(up), %r9
lea -24(up), up
sub $4, n
jb L(end) /* n == 3 */
.p2align 4
/* Main loop: each shld fills the vacated low bits of one limb with the top
   bits of the next lower limb, then that limb is stored.  */
L(top): shld %cl, %r8, %r11
mov (up), %r10
mov %r11, (rp)
L(10): shld %cl, %r9, %r8
mov -8(up), %r11
mov %r8, -8(rp)
L(01): shld %cl, %r10, %r9
mov -16(up), %r8
mov %r9, -16(rp)
L(00): shld %cl, %r11, %r10
mov -24(up), %r9
mov %r10, -24(rp)
add $-32, up
lea -32(rp), rp
sub $4, n
jnc L(top) /* continue until the count borrows */
/* Tail: the three remaining limbs are in %r11, %r8, %r9 (high to low).  */
L(end): shld %cl, %r8, %r11
mov %r11, (rp)
shld %cl, %r9, %r8
mov %r8, -8(rp)
shl %cl, %r9 /* lowest limb: zeros are shifted in */
mov %r9, -16(rp)
ret
END (__mpn_lshift)

View File

@ -1,6 +1,6 @@
/* AMD64 __mpn_mul_1 -- Multiply a limb vector with a limb and store
the result in a second limb vector.
Copyright (C) 2004 Free Software Foundation, Inc.
Copyright (C) 2003, 2004, 2005, 2007, 2008 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
@ -21,22 +21,109 @@
#include <sysdep.h>
#include "asm-syntax.h"
#define rp %rdi
#define up %rsi
#define n_param %rdx
#define vl %rcx
#define n %r11
.text
/* __mpn_mul_1 (rp, up, n_param, vl): multiply the limb vector {up,n} by the
   limb vl and store the low n limbs of the product at {rp,n}.  The most
   significant product limb is returned in %rax.  */
ENTRY (__mpn_mul_1)
movq %rdx, %r11
leaq (%rsi,%rdx,8), %rsi
leaq (%rdi,%rdx,8), %rdi
negq %r11
xorl %r8d, %r8d
L(loop):
movq (%rsi,%r11,8), %rax
mulq %rcx
addq %r8, %rax
movl $0, %r8d
adcq %rdx, %r8
movq %rax, (%rdi,%r11,8)
incq %r11
jne L(loop)
movq %r8, %rax
/* Four-way unrolled path: the first product is computed up front, then
   n mod 4 selects the pointer bias and the loop entry point.  */
push %rbx /* %rbx is callee-saved; used as scratch below */
cfi_adjust_cfa_offset (8)
cfi_rel_offset (%rbx, 0)
xor %r10, %r10
mov (up), %rax /* read first u limb early */
mov n_param, %rbx /* move away n from rdx, mul uses it */
mul vl
mov %rbx, %r11 /* n (%r11) = limb count */
add %r10, %rax /* %r10 is zero here, so the product is unchanged */
adc $0, %rdx
and $3, %ebx /* ebx = n mod 4 */
jz L(b0)
cmp $2, %ebx
jz L(b2)
jg L(b3)
/* Fall through: n mod 4 == 1.  */
L(b1): dec n
jne L(gt1)
mov %rax, (rp) /* n == 1: store low limb, high limb is in %rdx */
jmp L(ret)
L(gt1): lea 8(up,n,8), up
lea -8(rp,n,8), rp
neg n /* n becomes a negative index counting up to zero */
xor %r10, %r10
xor %ebx, %ebx
mov %rax, %r9
mov (up,n,8), %rax
mov %rdx, %r8
jmp L(L1)
L(b0): lea (up,n,8), up
lea -16(rp,n,8), rp
neg n
xor %r10, %r10
mov %rax, %r8
mov %rdx, %rbx
jmp L(L0)
L(b3): lea -8(up,n,8), up
lea -24(rp,n,8), rp
neg n
mov %rax, %rbx
mov %rdx, %r10
jmp L(L3)
L(b2): lea -16(up,n,8), up
lea -32(rp,n,8), rp
neg n
xor %r8, %r8
xor %ebx, %ebx
mov %rax, %r10
mov 24(up,n,8), %rax
mov %rdx, %r9
jmp L(L2)
.p2align 4
/* Main loop: %r10, %r9, %r8 and %rbx rotate as accumulators for successive
   result limbs.  Each is stored once the product after it has been added.  */
L(top): mov %r10, (rp,n,8)
add %rax, %r9
mov (up,n,8), %rax
adc %rdx, %r8
mov $0, %r10d
L(L1): mul vl
mov %r9, 8(rp,n,8)
add %rax, %r8
adc %rdx, %rbx
L(L0): mov 8(up,n,8), %rax
mul vl
mov %r8, 16(rp,n,8)
add %rax, %rbx
adc %rdx, %r10
L(L3): mov 16(up,n,8), %rax
mul vl
mov %rbx, 24(rp,n,8)
mov $0, %r8d # zero
mov %r8, %rbx # zero
add %rax, %r10
mov 24(up,n,8), %rax
mov %r8, %r9 # zero
adc %rdx, %r9
L(L2): mul vl
add $4, n
js L(top)
/* Wind-down: %r8 is zero on every path to this point.  */
mov %r10, (rp,n,8)
add %rax, %r9
adc %r8, %rdx /* rdx += carry */
mov %r9, 8(rp,n,8)
add %r8, %rdx
L(ret): mov %rdx, %rax /* return the most significant limb */
pop %rbx
cfi_adjust_cfa_offset (-8)
cfi_restore (%rbx)
ret
END (__mpn_mul_1)

View File

@ -1,5 +1,5 @@
/* AMD64 __mpn_rshift --
Copyright (C) 2004, 2006 Free Software Foundation, Inc.
/* x86-64 __mpn_rshift --
Copyright (C) 2007, 2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
@ -20,43 +20,96 @@
#include "sysdep.h"
#include "asm-syntax.h"
#define rp %rdi
#define up %rsi
#define n %rdx
#define cnt %cl
.text
/* __mpn_rshift (rp, up, n, cnt): shift the limb vector {up,n} right by cnt
   bits and store the result at {rp,n}.  The bits shifted out of the least
   significant limb are returned in the high bits of %rax.  */
ENTRY (__mpn_rshift)
movq (%rsi), %mm7
movd %ecx, %mm1
movl $64, %eax
subl %ecx, %eax
movd %eax, %mm0
movq %mm7, %mm3
psllq %mm0, %mm7
movd %mm7, %rax
leaq (%rsi,%rdx,8), %rsi
leaq (%rdi,%rdx,8), %rdi
negq %rdx
addq $2, %rdx
jg L(endo)
.p2align 2
L(loop):
movq -8(%rsi,%rdx,8), %mm6
movq %mm6, %mm2
psllq %mm0, %mm6
psrlq %mm1, %mm3
por %mm6, %mm3
movq %mm3, -16(%rdi,%rdx,8)
je L(ende)
movq (%rsi,%rdx,8), %mm7
movq %mm7, %mm3
psllq %mm0, %mm7
psrlq %mm1, %mm2
por %mm7, %mm2
movq %mm2, -8(%rdi,%rdx,8)
addq $2, %rdx
jle L(loop)
L(endo):
movq %mm3, %mm2
L(ende):
psrlq %mm1, %mm2
movq %mm2, -8(%rdi)
emms
/* Integer (shrd) path: limbs are processed from least to most significant,
   four per loop pass.  n mod 4 selects the setup code and loop entry.  */
mov %edx, %eax
and $3, %eax /* eax = n mod 4 */
jne L(nb00)
L(b00): /* n = 4, 8, 12, ... */
mov (up), %r10
mov 8(up), %r11
xor %eax, %eax
shrd %cl, %r10, %rax /* rax = bits shifted out of the lowest limb */
mov 16(up), %r8
lea 8(up), up
lea -24(rp), rp
sub $4, n
jmp L(00)
L(nb00):/* n mod 4 != 0 */
cmp $2, %eax
jae L(nb01) /* n mod 4 is 2 or 3 */
/* n = 1, 5, 9, ... */
L(b01): mov (up), %r9
xor %eax, %eax
shrd %cl, %r9, %rax /* rax = bits shifted out of the lowest limb */
sub $2, n
jb L(le1) /* n == 1 */
mov 8(up), %r10
mov 16(up), %r11
lea 16(up), up
lea -16(rp), rp
jmp L(01)
L(le1): shr %cl, %r9 /* single limb: zeros are shifted in */
mov %r9, (rp)
ret
L(nb01):/* n mod 4 is 2 or 3; flags are still those of cmp $2 above */
jne L(b11)
/* n = 2, 6, 10, ... */
L(b10): mov (up), %r8
mov 8(up), %r9
xor %eax, %eax
shrd %cl, %r8, %rax /* rax = bits shifted out of the lowest limb */
sub $3, n
jb L(le2) /* n == 2 */
mov 16(up), %r10
lea 24(up), up
lea -8(rp), rp
jmp L(10)
L(le2): shrd %cl, %r9, %r8
mov %r8, (rp)
shr %cl, %r9 /* highest limb: zeros are shifted in */
mov %r9, 8(rp)
ret
.p2align 4
L(b11): /* n = 3, 7, 11, ... */
mov (up), %r11
mov 8(up), %r8
xor %eax, %eax
shrd %cl, %r11, %rax /* rax = bits shifted out of the lowest limb */
mov 16(up), %r9
lea 32(up), up
sub $4, n
jb L(end) /* n == 3 */
.p2align 4
/* Main loop: each shrd fills the vacated high bits of one limb with the low
   bits of the next higher limb, then that limb is stored.  */
L(top): shrd %cl, %r8, %r11
mov -8(up), %r10
mov %r11, (rp)
L(10): shrd %cl, %r9, %r8
mov (up), %r11
mov %r8, 8(rp)
L(01): shrd %cl, %r10, %r9
mov 8(up), %r8
mov %r9, 16(rp)
L(00): shrd %cl, %r11, %r10
mov 16(up), %r9
mov %r10, 24(rp)
add $32, up
lea 32(rp), rp
sub $4, n
jnc L(top) /* continue until the count borrows */
/* Tail: the three remaining limbs are in %r11, %r8, %r9 (low to high).  */
L(end): shrd %cl, %r8, %r11
mov %r11, (rp)
shrd %cl, %r9, %r8
mov %r8, 8(rp)
shr %cl, %r9 /* highest limb: zeros are shifted in */
mov %r9, 16(rp)
ret
END (__mpn_rshift)

View File

@ -1,6 +1,6 @@
/* AMD64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
/* x86-64 __mpn_sub_n -- Subtract two limb vectors of the same length > 0 and store
difference in a third limb vector.
Copyright (C) 2004 Free Software Foundation, Inc.
Copyright (C) 2006, 2007 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
@ -18,25 +18,7 @@
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */
#include "sysdep.h"
#include "asm-syntax.h"
#define func __mpn_sub_n
#define ADCSBB sbb
.text
/* __mpn_sub_n: subtract the limb vector at %rdx from the limb vector at
   %rsi, both %rcx limbs long, propagating the borrow from limb to limb.
   The difference is stored at %rdi and the final borrow (0 or 1) is
   returned in %rax.  */
ENTRY (__mpn_sub_n)
/* Point all three operands just past their last limb and index them with
   a negative counter that runs up to zero.  */
leaq (%rsi,%rcx,8), %rsi
leaq (%rdi,%rcx,8), %rdi
leaq (%rdx,%rcx,8), %rdx
negq %rcx
xorl %eax, %eax # clear cy
.p2align 2
L(loop):
movq (%rsi,%rcx,8), %rax
movq (%rdx,%rcx,8), %r10
sbbq %r10, %rax /* rax = rax - r10 - CF */
movq %rax, (%rdi,%rcx,8)
incq %rcx /* inc leaves CF alone, so the borrow survives */
jne L(loop)
movq %rcx, %rax # zero %rax
adcq %rax, %rax /* rax = final borrow */
ret
END (__mpn_sub_n)
#include "add_n.S"

View File

@ -1,6 +1,6 @@
/* AMD64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
/* x86-64 __mpn_submul_1 -- Multiply a limb vector with a limb and subtract
the result from a second limb vector.
Copyright (C) 2004 Free Software Foundation, Inc.
Copyright (C) 2003,2004,2005,2007,2008,2009 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
The GNU MP Library is free software; you can redistribute it and/or modify
@ -18,29 +18,7 @@
the Free Software Foundation, Inc., 59 Temple Place - Suite 330, Boston,
MA 02111-1307, USA. */
#include "sysdep.h"
#include "asm-syntax.h"
#define func __mpn_submul_1
#define ADDSUB sub
.text
/* __mpn_submul_1: multiply the limb vector at %rsi (%rdx limbs) by the limb
   in %rcx and subtract the product from the limb vector at %rdi in place.
   The borrow limb is returned in %rax.  */
ENTRY (__mpn_submul_1)
movq %rdx, %r11 /* keep the count in %r11; mulq overwrites %rdx */
leaq (%rsi,%r11,8), %rsi /* point both vectors just past their end */
leaq (%rdi,%r11,8), %rdi
negq %r11 /* negative index counting up to zero */
xorl %r8d, %r8d /* r8 = borrow limb carried between iterations */
.p2align 3
L(loop):
movq (%rsi,%r11,8), %rax
movq (%rdi,%r11,8), %r10
mulq %rcx /* rdx:rax = source limb * multiplier */
subq %r8, %r10 /* subtract the incoming borrow limb */
movl $0, %r8d /* mov keeps CF from the sub above */
adcl %r8d, %r8d /* r8 = borrow of that subtraction */
subq %rax, %r10 /* subtract the low product half */
adcq %rdx, %r8 /* r8 += high product half + borrow */
movq %r10, (%rdi,%r11,8)
incq %r11
jne L(loop)
movq %r8, %rax /* return the final borrow limb */
ret
END (__mpn_submul_1)
#include "addmul_1.S"