Mercurial > pmdwin
comparison memcpy.S @ 0:c55ea9478c80
Hello Gensokyo!
author | Emmanuel Gil Peyrot <linkmauve@linkmauve.fr> |
---|---|
date | Tue, 21 May 2013 10:29:21 +0200 |
parents | |
children |
comparison
equal
deleted
inserted
replaced
-1:000000000000 | 0:c55ea9478c80 |
---|---|
/*
 * void *memcpy(void *dest, const void *src, size_t n)
 *
 * Syntax: GNU as, AT&T operand order (op src, dst).
 * ABI:    System V AMD64.
 * In:     %rdi = dest, %rsi = src, %rdx = n
 * Out:    %rax = dest (the ORIGINAL destination pointer, as ISO C requires;
 *         not the advanced end pointer)
 * Clobb:  %rcx, %rdx, %rsi, %rdi, %xmm0-%xmm3, flags
 * Stack:  none (leaf function)
 *
 * Strategy:
 *   n <= 64 : copy n/4 dwords, then n%4 bytes.
 *   n >  64 : copy single bytes until dest is 16-byte aligned, then copy
 *             64-byte blocks (unaligned loads, aligned stores), then finish
 *             with the same dword/byte tail as the small case.
 */
	.text
	.global	memcpy
	.type	memcpy,%function
memcpy:
	/* Capture the return value before any string instruction advances
	   %rdi.  %rax is not written again below. */
	movq	%rdi, %rax

	/* Cutoff for the big loop is a size of 64 bytes since otherwise
	   the loop will never be entered. */
	cmpq	$64, %rdx
	movq	%rdx, %rcx		/* rcx = n (count for the small path) */

	/* Forward copy for all rep movs below.  Neither movq nor cld
	   changes CF/ZF, so the cmpq result is still valid for jbe. */
	cld
	jbe	.Lmemcpy_tail

	/* Align destination: copy head = (-dest) & 15 bytes one at a time.
	   n > 64 here, so head (<= 15) never exceeds n. */
	movq	%rdi, %rcx
	negq	%rcx
	andq	$15, %rcx		/* rcx = head byte count */
	subq	%rcx, %rdx		/* rdx = bytes left after the head */

	rep movsb

	/* rcx = remaining - 64.  Negative means not even one full block. */
	movq	%rdx, %rcx
	subq	$64, %rcx
	js	.Lmemcpy_fixup

	/* Next 3 insns are 11 bytes total, make sure we decode them in one go */
	.p2align 4,,11
.Lmemcpy_block:
	/* Pre-decrement the counter for the block after this one.  None of
	   the following instructions (movups, movaps, leaq) modify the
	   flags, so the sign from this subq is what jns tests. */
	subq	$64, %rcx

	movups	(%rsi), %xmm0		/* src may be unaligned */
	movups	16(%rsi), %xmm1
	movups	32(%rsi), %xmm2
	movups	48(%rsi), %xmm3
	movaps	%xmm0, (%rdi)		/* dest is 16-byte aligned here */
	movaps	%xmm1, 16(%rdi)
	movaps	%xmm2, 32(%rdi)
	movaps	%xmm3, 48(%rdi)

	leaq	64(%rsi), %rsi		/* lea, not add: keeps flags intact */
	leaq	64(%rdi), %rdi

	jns	.Lmemcpy_block		/* at least 64 bytes still remain */

	/* Undo the extra 64 subtracted from the counter: rcx = bytes left
	   (0..63 on this path). */
.Lmemcpy_fixup:
	addq	$64, %rcx

.Lmemcpy_tail:
	/* rcx = bytes left.  Copy rcx / 4 dwords ... */
	movq	%rcx, %rdx		/* keep the byte count for the remainder */
	shrq	$2, %rcx
	rep movsl

	/* ... then the final rcx % 4 bytes. */
	movq	%rdx, %rcx
	andq	$3, %rcx
	rep movsb

	ret				/* rax = original dest, set at entry */
	.size	memcpy, .-memcpy