172 lines
3.4 KiB
ArmAsm
172 lines
3.4 KiB
ArmAsm
/* memcpy with unaliged loads
|
|
Copyright (C) 2013 Free Software Foundation, Inc.
|
|
This file is part of the GNU C Library.
|
|
|
|
The GNU C Library is free software; you can redistribute it and/or
|
|
modify it under the terms of the GNU Lesser General Public
|
|
License as published by the Free Software Foundation; either
|
|
version 2.1 of the License, or (at your option) any later version.
|
|
|
|
The GNU C Library is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
Lesser General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Lesser General Public
|
|
License along with the GNU C Library; if not, see
|
|
<http://www.gnu.org/licenses/>. */
|
|
|
|
#include <sysdep.h>
|
|
|
|
#include "asm-syntax.h"
|
|
|
|
|
|
ENTRY(__memcpy_sse2_unaligned)
|
|
movq %rsi, %rax
|
|
leaq (%rdx,%rdx), %rcx
|
|
subq %rdi, %rax
|
|
subq %rdx, %rax
|
|
cmpq %rcx, %rax
|
|
jb L(overlapping)
|
|
cmpq $16, %rdx
|
|
jbe L(less_16)
|
|
movdqu (%rsi), %xmm8
|
|
cmpq $32, %rdx
|
|
movdqu %xmm8, (%rdi)
|
|
movdqu -16(%rsi,%rdx), %xmm8
|
|
movdqu %xmm8, -16(%rdi,%rdx)
|
|
ja .L31
|
|
L(return):
|
|
movq %rdi, %rax
|
|
ret
|
|
.p2align 4,,10
|
|
.p2align 4
|
|
.L31:
|
|
movdqu 16(%rsi), %xmm8
|
|
cmpq $64, %rdx
|
|
movdqu %xmm8, 16(%rdi)
|
|
movdqu -32(%rsi,%rdx), %xmm8
|
|
movdqu %xmm8, -32(%rdi,%rdx)
|
|
jbe L(return)
|
|
movdqu 32(%rsi), %xmm8
|
|
cmpq $128, %rdx
|
|
movdqu %xmm8, 32(%rdi)
|
|
movdqu -48(%rsi,%rdx), %xmm8
|
|
movdqu %xmm8, -48(%rdi,%rdx)
|
|
movdqu 48(%rsi), %xmm8
|
|
movdqu %xmm8, 48(%rdi)
|
|
movdqu -64(%rsi,%rdx), %xmm8
|
|
movdqu %xmm8, -64(%rdi,%rdx)
|
|
jbe L(return)
|
|
leaq 64(%rdi), %rcx
|
|
addq %rdi, %rdx
|
|
andq $-64, %rdx
|
|
andq $-64, %rcx
|
|
movq %rcx, %rax
|
|
subq %rdi, %rax
|
|
addq %rax, %rsi
|
|
cmpq %rdx, %rcx
|
|
je L(return)
|
|
movq %rsi, %r10
|
|
subq %rcx, %r10
|
|
leaq 16(%r10), %r9
|
|
leaq 32(%r10), %r8
|
|
leaq 48(%r10), %rax
|
|
.p2align 4,,10
|
|
.p2align 4
|
|
L(loop):
|
|
movdqu (%rcx,%r10), %xmm8
|
|
movdqa %xmm8, (%rcx)
|
|
movdqu (%rcx,%r9), %xmm8
|
|
movdqa %xmm8, 16(%rcx)
|
|
movdqu (%rcx,%r8), %xmm8
|
|
movdqa %xmm8, 32(%rcx)
|
|
movdqu (%rcx,%rax), %xmm8
|
|
movdqa %xmm8, 48(%rcx)
|
|
addq $64, %rcx
|
|
cmpq %rcx, %rdx
|
|
jne L(loop)
|
|
jmp L(return)
|
|
L(overlapping):
|
|
cmpq %rsi, %rdi
|
|
jae .L3
|
|
testq %rdx, %rdx
|
|
.p2align 4,,5
|
|
je L(return)
|
|
movq %rdx, %r9
|
|
leaq 16(%rsi), %rcx
|
|
leaq 16(%rdi), %r8
|
|
shrq $4, %r9
|
|
movq %r9, %rax
|
|
salq $4, %rax
|
|
cmpq %rcx, %rdi
|
|
setae %cl
|
|
cmpq %r8, %rsi
|
|
setae %r8b
|
|
orl %r8d, %ecx
|
|
cmpq $15, %rdx
|
|
seta %r8b
|
|
testb %r8b, %cl
|
|
je .L16
|
|
testq %rax, %rax
|
|
je .L16
|
|
xorl %ecx, %ecx
|
|
xorl %r8d, %r8d
|
|
.L7:
|
|
movdqu (%rsi,%rcx), %xmm8
|
|
addq $1, %r8
|
|
movdqu %xmm8, (%rdi,%rcx)
|
|
addq $16, %rcx
|
|
cmpq %r8, %r9
|
|
ja .L7
|
|
cmpq %rax, %rdx
|
|
je L(return)
|
|
.L21:
|
|
movzbl (%rsi,%rax), %ecx
|
|
movb %cl, (%rdi,%rax)
|
|
addq $1, %rax
|
|
cmpq %rax, %rdx
|
|
ja .L21
|
|
jmp L(return)
|
|
L(less_16):
|
|
testb $24, %dl
|
|
jne L(between_9_16)
|
|
testb $4, %dl
|
|
.p2align 4,,5
|
|
jne L(between_5_8)
|
|
testq %rdx, %rdx
|
|
.p2align 4,,2
|
|
je L(return)
|
|
movzbl (%rsi), %eax
|
|
testb $2, %dl
|
|
movb %al, (%rdi)
|
|
je L(return)
|
|
movzwl -2(%rsi,%rdx), %eax
|
|
movw %ax, -2(%rdi,%rdx)
|
|
jmp L(return)
|
|
.L3:
|
|
leaq -1(%rdx), %rax
|
|
.p2align 4,,10
|
|
.p2align 4
|
|
.L11:
|
|
movzbl (%rsi,%rax), %edx
|
|
movb %dl, (%rdi,%rax)
|
|
subq $1, %rax
|
|
jmp .L11
|
|
L(between_9_16):
|
|
movq (%rsi), %rax
|
|
movq %rax, (%rdi)
|
|
movq -8(%rsi,%rdx), %rax
|
|
movq %rax, -8(%rdi,%rdx)
|
|
jmp L(return)
|
|
.L16:
|
|
xorl %eax, %eax
|
|
jmp .L21
|
|
L(between_5_8):
|
|
movl (%rsi), %eax
|
|
movl %eax, (%rdi)
|
|
movl -4(%rsi,%rdx), %eax
|
|
movl %eax, -4(%rdi,%rdx)
|
|
jmp L(return)
|
|
END(__memcpy_sse2_unaligned)
|