//===---------------------------------------------------------------------===//
// Random ideas for the X86 backend: MMX-specific stuff.
//===---------------------------------------------------------------------===//

//===---------------------------------------------------------------------===//

This:

#include <mmintrin.h>

__v2si qux(int A) {
  return (__v2si){ 0, A };
}

is compiled into:

_qux:
	subl	$28, %esp
	movl	32(%esp), %eax
	movd	%eax, %mm0
	movq	%mm0, (%esp)
	movl	(%esp), %eax
	movl	%eax, 20(%esp)
	movq	%mm0, 8(%esp)
	movl	12(%esp), %eax
	movl	%eax, 16(%esp)
	movq	16(%esp), %mm0
	addl	$28, %esp
	ret

Yuck!

GCC gives us:

_qux:
	subl	$12, %esp
	movl	16(%esp), %eax
	movl	20(%esp), %edx
	movl	$0, (%eax)
	movl	%edx, 4(%eax)
	addl	$12, %esp
	ret	$4

//===---------------------------------------------------------------------===//

We generate crappy code for this:

__m64 t() {
  return _mm_cvtsi32_si64(1);
}

_t:
	subl	$12, %esp
	movl	$1, %eax
	movd	%eax, %mm0
	movq	%mm0, (%esp)
	movl	(%esp), %eax
	movl	4(%esp), %edx
	addl	$12, %esp
	ret

The extra stack traffic is covered in the previous entry. But the other reason
is that we are not smart about materializing constants in MMX registers. With
-m64 we generate:

	movl	$1, %eax
	movd	%eax, %mm0
	movd	%mm0, %rax
	ret

We should be using a constant pool load instead:

	movq	LC0(%rip), %rax
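
A minimal sketch of what that could look like, assuming the constant is
emitted into an 8-byte literal section (the LC0 label and the Darwin-style
.literal8 directive here are illustrative, not what the backend currently
emits):

	.literal8
LC0:
	.quad	1			## the __m64 constant: low 32 bits = 1, high = 0

_t:
	movq	LC0(%rip), %rax		## one RIP-relative load replaces the movl/movd/movd sequence
	ret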