diff memcpy.S @ 0:c55ea9478c80

Hello Gensokyo!
author Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
date Tue, 21 May 2013 10:29:21 +0200
parents
children
line wrap: on
line diff
new file mode 100644
--- /dev/null
+++ b/memcpy.S
@@ -0,0 +1,86 @@
+        .text
+
+/*
+ * void *memcpy(void *dest, const void *src, size_t n)
+ *
+ * x86-64, AT&T syntax, System V AMD64 calling convention.
+ *
+ * In:    %rdi = dest, %rsi = src, %rdx = n
+ * Out:   %rax = dest (the pointer originally passed in)
+ * Clobb: %rcx, %rdx, %rsi, %rdi, %xmm0-%xmm3, flags
+ *
+ * Copies larger than 64 bytes first move up to 15 single bytes so that
+ * dest becomes 16-byte aligned, then run a 64-byte-per-iteration SSE
+ * loop (unaligned loads, aligned stores).  Whatever is left, and every
+ * copy of 64 bytes or fewer, is finished with string moves.
+ */
+.global memcpy
+.type memcpy,%function
+memcpy:
+    /* The string moves below advance %rdi, so capture the return value
+       now.  %rax is not written again before the ret.  */
+    movq    %rdi, %rax
+
+    /* Cutoff for the big loop is a size of 64 bytes since otherwise
+       the loop will never be entered.  */
+    cmpq    $64, %rdx
+    movq    %rdx, %rcx              /* rcx = n for the short path */
+
+    /* The string instructions need the direction flag clear.  */
+    cld
+    jbe     .Lmemcpy_tail
+
+    /* Align destination: rcx = (-dest) & 15 head bytes, rdx = bytes
+       left once the head has been copied.  */
+    movq    %rdi, %rcx
+    negq    %rcx
+    andq    $15, %rcx
+    subq    %rcx, %rdx
+
+    rep movsb
+
+    /* Bias the counter by one block so the loop can test the sign.
+       n > 64 and the head is at most 15 bytes, so 50 or more bytes are
+       left here; fewer than 64 skips the loop.  */
+    movq    %rdx, %rcx
+    subq    $64, %rcx
+    js      .Lmemcpy_unbias
+
+    /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
+    .p2align 4,,11
+.Lmemcpy_block:
+    /* Now correct the loop counter.  Please note that in the following
+       code the flags are not changed anymore.  */
+    subq    $64, %rcx
+
+    movups    (%rsi), %xmm0
+    movups  16(%rsi), %xmm1
+    movups  32(%rsi), %xmm2
+    movups  48(%rsi), %xmm3
+    movaps     %xmm0,   (%rdi)
+    movaps     %xmm1, 16(%rdi)
+    movaps     %xmm2, 32(%rdi)
+    movaps     %xmm3, 48(%rdi)
+
+    leaq    64(%rsi), %rsi          /* lea: advance without touching flags */
+    leaq    64(%rdi), %rdi
+
+    jns     .Lmemcpy_block
+
+    /* Correct extra loop counter modification.  */
+.Lmemcpy_unbias:
+    addq    $64, %rcx
+
+.Lmemcpy_tail:
+    /* rcx = bytes still to copy: four at a time, then the last 0-3.  */
+    movq    %rcx, %rdx
+    shrq    $2, %rcx
+    rep movsl
+
+    movq    %rdx, %rcx
+    andq    $3, %rcx
+    rep movsb
+
+    ret
+.size memcpy,.-memcpy
+