view memcpy.S @ 0:c55ea9478c80

Hello Gensokyo!
author Emmanuel Gil Peyrot <linkmauve@linkmauve.fr>
date Tue, 21 May 2013 10:29:21 +0200
parents
children
line wrap: on
line source

        .text
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * ABI:   System V AMD64 register assignment, GNU as AT&T syntax.
 * In:    %rdi = dest, %rsi = src, %rdx = n
 * Out:   %rax = dest, exactly as passed in
 * Clobb: %rcx, %rdx, %rsi, %rdi, %xmm0-%xmm3, flags (DF is left clear)
 *
 * Strategy:
 *   n <= 64 : go straight to the tail copy.
 *   n >  64 : byte-copy until dest is 16-byte aligned, then move 64 bytes
 *             per iteration (unaligned loads, aligned stores), then let
 *             the tail copy handle the remaining 0..63 bytes.
 *   tail    : copy rcx / 4 doublewords, then rcx % 4 single bytes.
 */
.global memcpy
.type memcpy,%function
memcpy:
    /* memcpy returns the destination pointer it was given.  Capture it
       now: %rdi is advanced by every copy below and equals dest + n by
       the time we return.  %rax is not written again after this.  */
    movq    %rdi, %rax

    /* Cutoff for the big loop is a size of 64 bytes since otherwise
       the loop will never be entered.  */
    cmpq    $64, %rdx
    movq    %rdx, %rcx              /* rcx = n, used by the short path */

    /* We need this in any case: all string moves must go forward.
       Neither movq nor cld touches CF/ZF, so jbe still sees the cmpq.  */
    cld
    jbe 1f

    /* Align destination: copy (-dest & 15) leading bytes one at a time
       so that the movaps stores in the loop hit 16-byte boundaries.
       n > 64 here, so at least 50 bytes remain afterwards.  */
    movq    %rdi, %rcx
    negq    %rcx
    andq    $15, %rcx               /* rcx = bytes up to next 16-byte boundary */
    subq    %rcx, %rdx              /* rdx = bytes left after the head copy */

    rep; movsb

    movq    %rdx, %rcx              /* rcx = bytes left */
    subq    $64, %rcx
    js  2f                          /* fewer than 64 left: skip the loop */

    /* Next 3 insns are 11 bytes total, make sure we decode them in one go */
    .p2align 4,,11
3:
    /* Now correct the loop counter.  Please note that in the following
       code the flags are not changed anymore (movups, movaps and leaq
       leave them alone), so the jns at the bottom tests this subq.
       Invariant: on entry rcx = (bytes left) - 64, and it is >= 0.  */
    subq    $64, %rcx

    movups    (%rsi), %xmm0         /* source may be unaligned */
    movups  16(%rsi), %xmm1
    movups  32(%rsi), %xmm2
    movups  48(%rsi), %xmm3
    movaps     %xmm0,   (%rdi)      /* destination is 16-byte aligned */
    movaps     %xmm1, 16(%rdi)
    movaps     %xmm2, 32(%rdi)
    movaps     %xmm3, 48(%rdi)

    leaq    64(%rsi), %rsi          /* leaq rather than addq: keeps flags */
    leaq    64(%rdi), %rdi

    jns 3b                          /* another full 64-byte chunk remains */

    /* Correct extra loop counter modification.  */
2:  addq    $64, %rcx               /* rcx = bytes left, 0..63 */
1:
    /* Tail: rcx = bytes still to copy (n itself on the short path).  */
    movq    %rcx, %rdx              /* keep the byte count for the remainder */
    shrq    $2, %rcx                /* rcx = number of whole doublewords */
    rep; movsl

    movq    %rdx, %rcx
    andq    $3, %rcx                /* rcx = 0..3 trailing bytes */
    rep; movsb

    ret                             /* %rax still holds the original dest */
.size memcpy,.-memcpy