1 |
I've been thinking about CFLAGS such as -mmmx... Does gcc actually have the |
2 |
ability to produce MMX instructions? |
3 |
|
4 |
|
5 |
for example: |
6 |
|
7 |
|
8 |
mmx.c: |
9 |
|
10 |
#include <stdio.h> |
11 |
|
12 |
|
13 |
main() |
14 |
{ |
15 |
short x[100000]; |
16 |
int i; |
17 |
|
18 |
for (i=0; i < 100000; i++) |
19 |
{ |
20 |
x[i] += 5; |
21 |
|
22 |
} |
23 |
|
24 |
|
25 |
} |
26 |
|
27 |
gcc -mmmx -O3 --save-temps -funroll-all-loops mmx.c |
28 |
gives |
29 |
|
30 |
|
31 |
.file "mmx.c" |
32 |
.def ___main; .scl 2; .type 32; .endef |
33 |
.text |
34 |
.align 2 |
35 |
.p2align 4,,15 |
36 |
.globl _main |
37 |
.def _main; .scl 2; .type 32; .endef |
38 |
_main: |
39 |
pushl %ebp |
40 |
movl $200008, %eax |
41 |
movl %esp, %ebp |
42 |
call __alloca |
43 |
xorl %eax, %eax |
44 |
andl $-16, %esp |
45 |
call __alloca |
46 |
call ___main |
47 |
xorl %eax, %eax |
48 |
.p2align 4,,7 |
49 |
L6: |
50 |
addw $5, -200008(%ebp,%eax,2) |
51 |
addw $5, -200006(%ebp,%eax,2) |
52 |
addw $5, -200004(%ebp,%eax,2) |
53 |
addw $5, -200002(%ebp,%eax,2) |
54 |
addw $5, -200000(%ebp,%eax,2) |
55 |
addw $5, -199998(%ebp,%eax,2) |
56 |
addw $5, -199996(%ebp,%eax,2) |
57 |
addw $5, -199994(%ebp,%eax,2) |
58 |
addw $5, -199992(%ebp,%eax,2) |
59 |
addw $5, -199990(%ebp,%eax,2) |
60 |
addl $10, %eax |
61 |
cmpl $99999, %eax |
62 |
jle L6 |
63 |
leave |
64 |
ret |
65 |
|
66 |
|
67 |
|
68 |
|
69 |
It shouldn't take much imagination to see how that could be improved with MMX (e.g. a single paddw adds four 16-bit values at once)... |
70 |
|
71 |
Likewise: |
72 |
|
73 |
|
74 |
|
75 |
fu(int *x) |
76 |
{ |
77 |
if (*x == 5) |
78 |
*x = 10; |
79 |
} |
80 |
|
81 |
|
82 |
|
83 |
with gcc -march=athlon-xp -O3 --save-temps -funroll-all-loops mmx.c |
84 |
|
85 |
gives |
86 |
|
87 |
|
88 |
|
89 |
_fu: |
90 |
pushl %ebp |
91 |
movl %esp, %ebp |
92 |
movl 8(%ebp), %eax |
93 |
cmpl $5, (%eax) |
94 |
je L33 |
95 |
L32: |
96 |
leave |
97 |
ret |
98 |
.p2align 6,,7 |
99 |
L33: |
100 |
movl $10, (%eax) |
101 |
jmp L32 |
102 |
|
103 |
|
104 |
The -march=athlon-xp option should imply that the cmov instruction is available...(?) |
105 |
|
106 |
|
107 |
(It doesn't even use multiple leave/ret sequences to eliminate an extra jmp...) |
108 |
|
109 |
|
110 |
|
111 |
Do the gcc developers include these fancy parameters just to give us ricers a |
112 |
false sense of speed? |
113 |
|
114 |
|
115 |
any thoughts? |
116 |
|
117 |
|
118 |
|
119 |
-- |
120 |
gentoo-performance@g.o mailing list |