Mercurial > pmdwin
diff memcpy.S @ 0:c55ea9478c80
Hello Gensokyo!
author | Emmanuel Gil Peyrot <linkmauve@linkmauve.fr> |
---|---|
date | Tue, 21 May 2013 10:29:21 +0200 |
parents | |
children |
line wrap: on
line diff
/* memcpy.S -- listing diff header: new file mode 100644, --- /dev/null, +++ b/memcpy.S, @@ -0,0 +1,62 @@ */

/*----------------------------------------------------------------------
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * Syntax: GNU as, AT&T.    ABI: System V AMD64.
 *
 * In:     %rdi = dest, %rsi = src, %rdx = n
 * Out:    %rax = dest (the ORIGINAL destination pointer, as ISO C requires)
 * Clobb:  %rcx, %rdx, %rsi, %rdi, %xmm0-%xmm3, flags
 * Stack:  none (leaf function)
 *
 * Strategy:
 *   n <= 64 : copy dwords with `rep movsl`, then the 0-3 leftover bytes.
 *   n >  64 : byte-copy until dest is 16-byte aligned, move 64-byte
 *             chunks through %xmm0-%xmm3 (unaligned loads, aligned
 *             stores), then finish with the small-copy tail.
 *
 * The regions must not overlap (usual memcpy contract).
 *----------------------------------------------------------------------*/
        .text
.global memcpy
.type memcpy,%function
memcpy:
        /* Capture the return value first: %rdi is advanced by every
           copy step below, so it cannot be read back at the end.  */
        movq    %rdi, %rax
        movq    %rdx, %rcx              /* rcx = n (count for the tail path) */

        /* String instructions must run forward.  */
        cld

        /* Cutoff for the big loop is a size of 64 bytes since otherwise
           the loop will never be entered.  */
        cmpq    $64, %rdx
        jbe     .Ltail

        /* Align destination: copy (-dest & 15) bytes one at a time so
           that the movaps stores below hit 16-byte aligned addresses.  */
        movq    %rdi, %rcx
        negq    %rcx
        andq    $15, %rcx               /* rcx = bytes up to the boundary */
        subq    %rcx, %rdx              /* rdx = bytes left after aligning */
        rep movsb

        /* Bias the counter by -64 so the sign flag tells whether another
           full 64-byte chunk is available.  */
        movq    %rdx, %rcx
        subq    $64, %rcx
        js      .Lfixup                 /* fewer than 64 bytes left */

        /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
        .p2align 4,,11
.Lloop64:
        /* Update the loop counter first.  None of the following
           instructions (SSE moves, lea) touch the flags, so the final
           jns still tests the result of this subtraction.  */
        subq    $64, %rcx

        movups    (%rsi), %xmm0
        movups  16(%rsi), %xmm1
        movups  32(%rsi), %xmm2
        movups  48(%rsi), %xmm3
        movaps  %xmm0,   (%rdi)
        movaps  %xmm1, 16(%rdi)
        movaps  %xmm2, 32(%rdi)
        movaps  %xmm3, 48(%rdi)

        leaq    64(%rsi), %rsi          /* lea: advance without changing flags */
        leaq    64(%rdi), %rdi

        jns     .Lloop64

        /* Undo the -64 bias: rcx = bytes still to copy (0..63).  */
.Lfixup:
        addq    $64, %rcx

.Ltail:
        movq    %rcx, %rdx              /* keep the byte count for the remainder */
        shrq    $2, %rcx                /* rcx = number of whole dwords */
        rep movsl

        movq    %rdx, %rcx
        andq    $3, %rcx                /* rcx = 0..3 trailing bytes */
        rep movsb

        ret                             /* rax = original dest, set at entry */
.size memcpy,.-memcpy